Actual source code: fnexp.c

slepc-3.15.1 2021-05-28
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Exponential function  exp(x)
 12: */

 14: #include <slepc/private/fnimpl.h>
 15: #include <slepcblaslapack.h>

 17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
 18: {
 20:   *y = PetscExpScalar(x);
 21:   return(0);
 22: }

 24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
 25: {
 27:   *y = PetscExpScalar(x);
 28:   return(0);
 29: }

 31: #define MAX_PADE 6
 32: #define SWAP(a,b,t) {t=a;a=b;b=t;}

 34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
 35: {
 36:   PetscErrorCode    ierr;
 37:   PetscBLASInt      n=0,ld,ld2,*ipiv,info,inc=1;
 38:   PetscInt          m,j,k,sexp;
 39:   PetscBool         odd;
 40:   const PetscInt    p=MAX_PADE;
 41:   PetscReal         c[MAX_PADE+1],s,*rwork;
 42:   PetscScalar       scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
 43:   PetscScalar       *Ba,*As,*A2,*Q,*P,*W,*aux;
 44:   const PetscScalar *Aa;

 47:   MatDenseGetArrayRead(A,&Aa);
 48:   MatDenseGetArray(B,&Ba);
 49:   MatGetSize(A,&m,NULL);
 50:   PetscBLASIntCast(m,&n);
 51:   ld  = n;
 52:   ld2 = ld*ld;
 53:   P   = Ba;
 54:   PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
 55:   PetscArraycpy(As,Aa,ld2);

 57:   /* Pade' coefficients */
 58:   c[0] = 1.0;
 59:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

 61:   /* Scaling */
 62:   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
 63:   PetscLogFlops(1.0*n*n);
 64:   if (s>0.5) {
 65:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
 66:     scale = PetscPowRealInt(2.0,-sexp);
 67:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
 68:     PetscLogFlops(1.0*n*n);
 69:   } else sexp = 0;

 71:   /* Horner evaluation */
 72:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
 73:   PetscLogFlops(2.0*n*n*n);
 74:   PetscArrayzero(Q,ld2);
 75:   PetscArrayzero(P,ld2);
 76:   for (j=0;j<n;j++) {
 77:     Q[j+j*ld] = c[p];
 78:     P[j+j*ld] = c[p-1];
 79:   }

 81:   odd = PETSC_TRUE;
 82:   for (k=p-1;k>0;k--) {
 83:     if (odd) {
 84:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
 85:       SWAP(Q,W,aux);
 86:       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
 87:       odd = PETSC_FALSE;
 88:     } else {
 89:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
 90:       SWAP(P,W,aux);
 91:       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
 92:       odd = PETSC_TRUE;
 93:     }
 94:     PetscLogFlops(2.0*n*n*n);
 95:   }
 96:   /*if (odd) {
 97:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
 98:     SWAP(Q,W,aux);
 99:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
100:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
101:     SlepcCheckLapackInfo("gesv",info);
102:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
103:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
104:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
105:   } else {*/
106:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
107:     SWAP(P,W,aux);
108:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
109:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
110:     SlepcCheckLapackInfo("gesv",info);
111:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
112:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
113:   /*}*/
114:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

116:   for (k=1;k<=sexp;k++) {
117:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
118:     PetscArraycpy(P,W,ld2);
119:   }
120:   if (P!=Ba) { PetscArraycpy(Ba,P,ld2); }
121:   PetscLogFlops(2.0*n*n*n*sexp);

123:   PetscFree6(Q,W,As,A2,rwork,ipiv);
124:   MatDenseRestoreArrayRead(A,&Aa);
125:   MatDenseRestoreArray(B,&Ba);
126:   return(0);
127: }

129: /*
130:  * Set scaling factor (s) and Pade degree (k,m)
131:  */
132: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
133: {
135:   if (nrm>1) {
136:     if      (nrm<200)  {*s = 4; *k = 5; *m = *k-1;}
137:     else if (nrm<1e4)  {*s = 4; *k = 4; *m = *k+1;}
138:     else if (nrm<1e6)  {*s = 4; *k = 3; *m = *k+1;}
139:     else if (nrm<1e9)  {*s = 3; *k = 3; *m = *k+1;}
140:     else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
141:     else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
142:     else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
143:     else               {*s = 1; *k = 1; *m = *k+1;}
144:   } else { /* nrm<1 */
145:     if       (nrm>0.5)  {*s = 4; *k = 4; *m = *k-1;}
146:     else  if (nrm>0.3)  {*s = 3; *k = 4; *m = *k-1;}
147:     else  if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
148:     else  if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
149:     else  if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
150:     else  if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
151:     else  if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
152:     else  if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
153:     else                {*s = 0; *k = 1; *m = 0;}
154:   }
155:   return(0);
156: }

158: #if defined(PETSC_HAVE_COMPLEX)
159: /*
160:  * Partial fraction form coefficients.
161:  * If query, the function returns the size necessary to store the coefficients.
162:  */
163: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
164: {
165:   PetscInt i;
166:   const PetscComplex /* m == k+1 */
167:     p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
168:                -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
169:                 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
170:                 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
171:                -2.733432894659307e+02                                },
172:     p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
173:                 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
174:                 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
175:                 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
176:                 6.286704751729261e+00                               },
177:     p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
178:                -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
179:                 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
180:                 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
181:     p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
182:                 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
183:                 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
184:                 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
185:     p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
186:                 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
187:                -1.829749817484586e+01                                },
188:     p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
189:                 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
190:                 3.637834252744491e+00                                },
191:     p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
192:                 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
193:     p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
194:                 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
195:   const PetscComplex /* m == k-1 */
196:     m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
197:                -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
198:                 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
199:                 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
200:     m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
201:                 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
202:                 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
203:                 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
204:     m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
205:                 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
206:                -1.734353918633177e+02                                },
207:     m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
208:                 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
209:                 5.648485971016893e+00                                },
210:     m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
211:                 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
212:     m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
213:                 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
214:   const PetscScalar /* m == k-1 */
215:     m1remain5[2] = { 2.000000000000000e-01,  9.800000000000000e+00},
216:     m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
217:     m1remain3[2] = { 3.333333333333333e-01,  5.666666666666667e+00},
218:     m1remain2[2] = {-0.5,                   -3.5},
219:     remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
220:     remain2[3] = {1.0/2.0, 1, 1};

223:   if (query) { /* query about buffer's size */
224:     if (m==k+1) {
225:       *remain = 0;
226:       *r = *q = k+1;
227:       return(0); /* quick return */
228:     }
229:     if (m==k-1) {
230:       *remain = 2;
231:       if (k==5) *r = *q = 4;
232:       else if (k==4) *r = *q = 3;
233:       else if (k==3) *r = *q = 2;
234:       else if (k==2) *r = *q = 1;
235:     }
236:     if (m==0) {
237:       *r = *q = 0;
238:       *remain = k+1;
239:     }
240:   } else {
241:     if (m==k+1) {
242:       if (k==4) {
243:         for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
244:       } else if (k==3) {
245:         for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
246:       } else if (k==2) {
247:         for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
248:       } else if (k==1) {
249:         for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
250:       }
251:       return(0); /* quick return */
252:     }
253:     if (m==k-1) {
254:       if (k==5) {
255:         for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
256:         for (i=0;i<2;i++) remain[i] = m1remain5[i];
257:       } else if (k==4) {
258:         for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
259:         for (i=0;i<2;i++) remain[i] = m1remain4[i];
260:       } else if (k==3) {
261:         for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
262:       } else if (k==2) {
263:         r[0] = -13.5; q[0] = 3;
264:         for (i=0;i<2;i++) remain[i] = m1remain2[i];
265:       }
266:     }
267:     if (m==0) {
268:       r = q = 0;
269:       if (k==3) {
270:         for (i=0;i<4;i++) remain[i] = remain3[i];
271:       } else if (k==2) {
272:         for (i=0;i<3;i++) remain[i] = remain2[i];
273:       }
274:     }
275:   }
276:   return(0);
277: }

279: /*
280:  * Product form coefficients.
281:  * If query, the function returns the size necessary to store the coefficients.
282:  */
283: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
284: {
285:   PetscInt i;
286:   const PetscComplex /* m == k+1 */
287:   p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
288:              -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
289:              -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
290:              -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
291:   p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
292:               3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
293:               6.286704751729261e+00                                ,
294:               5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
295:               5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
296:   p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
297:              -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
298:              -5.648485971016893e+00                                },
299:   p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
300:               3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
301:               4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
302:               4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
303:   p1p2[2] = {-4.00000000000000e+00  + 2.000000000000000e+00*PETSC_i,
304:              -4.00000000000000e+00  - 2.000000000000000e+00*PETSC_i},
305:   p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
306:               2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
307:               3.637834252744491e+00                               },
308:   p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
309:               2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
310:   const PetscComplex /* m == k-1 */
311:   m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
312:              -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
313:              -6.286704751729261e+00                                ,
314:              -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
315:              -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
316:   m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
317:               5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
318:               6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
319:               6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
320:   m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
321:              -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
322:              -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
323:              -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
324:   m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
325:               4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
326:               5.648485971016893e+00                                },
327:   m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
328:              -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
329:              -3.637834252744491e+00                                },
330:   m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
331:               4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
332:   m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
333:              -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};

336:   if (query) {
337:     if (m == k+1) {
338:       *mult = 1;
339:       *p = k;
340:       *q = k+1;
341:       return(0);
342:     }
343:     if (m==k-1) {
344:       *mult = 1;
345:       *p = k;
346:       *q = k-1;
347:     }
348:   } else {
349:     if (m == k+1) {
350:       *mult = PetscPowInt(-1,m);
351:       *mult *= m;
352:       if (k==4) {
353:         for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
354:         q[4] = p1q4[4];
355:       } else if (k==3) {
356:         for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
357:         q[3] = p1q3[3];
358:       } else if (k==2) {
359:         for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
360:         q[2] = p1q2[2];
361:       } else if (k==1) {
362:         p[0] = -3;
363:         for (i=0;i<2;i++) q[i] = p1q1[i];
364:       }
365:       return(0);
366:     }
367:     if (m==k-1) {
368:       *mult = PetscPowInt(-1,m);
369:       *mult /= k;
370:       if (k==5) {
371:         for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
372:         p[4] = m1p5[4];
373:       } else if (k==4) {
374:         for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
375:         p[3] = m1p4[3];
376:       } else if (k==3) {
377:         for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
378:         p[2] = m1p3[2];
379:       } else if (k==2) {
380:         for (i=0;i<2;i++) p[i] = m1p2[i];
381:         q[0] = 3;
382:       }
383:     }
384:   }
385:   return(0);
386: }
387: #endif /* PETSC_HAVE_COMPLEX */

389: #if defined(PETSC_USE_COMPLEX)
390: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
391: {
392:   PetscInt i;

395:   *result=PETSC_TRUE;
396:   for (i=0;i<n&&*result;i++) {
397:     if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
398:   }
399:   return(0);
400: }
401: #endif

403: /*
404:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
405:  * and Yuji Nakatsukasa
406:  *
407:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
408:  *     Approximation for the Matrix Exponential",
409:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
410:  *     https://doi.org/10.1137/15M1027553
411:  */
412: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
413: {
414: #if !defined(PETSC_HAVE_COMPLEX)
416:   SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
417: #else
418:   PetscInt          i,j,n_,s,k,m,mod;
419:   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
420:   PetscReal         nrm,shift=0.0;
421: #if defined(PETSC_USE_COMPLEX) || defined(PETSC_HAVE_ESSL)
422:   PetscReal         *rwork=NULL;
423: #endif
424:   PetscComplex      *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
425:   PetscScalar       *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
426:   const PetscScalar *Aa;
427:   PetscErrorCode    ierr;
428:   PetscBool         isreal,flg;
429: #if defined(PETSC_HAVE_ESSL)
430:   PetscScalar       sdummy,*wri;
431:   PetscBLASInt      idummy,io=0;
432: #else
433:   PetscBLASInt      query=-1;
434:   PetscScalar       work1,*work;
435: #endif

438:   MatGetSize(A,&n_,NULL);
439:   PetscBLASIntCast(n_,&n);
440:   MatDenseGetArrayRead(A,&Aa);
441:   MatDenseGetArray(B,&Ba);
442:   Ba2 = Ba;
443:   PetscBLASIntCast(n*n,&n2);

445:   PetscMalloc2(n2,&sMaux,n2,&Maux);
446:   Maux2 = Maux;
447:   PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
448:   if (!flg) {
449:     PetscMalloc2(n,&wr,n,&wi);
450:     PetscArraycpy(sMaux,Aa,n2);
451:     /* estimate rightmost eigenvalue and shift A with it */
452: #if !defined(PETSC_HAVE_ESSL)
453: #if !defined(PETSC_USE_COMPLEX)
454:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
455:     SlepcCheckLapackInfo("geev",info);
456:     PetscBLASIntCast((PetscInt)work1,&lwork);
457:     PetscMalloc1(lwork,&work);
458:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
459:     PetscFree(work);
460: #else
461:     PetscArraycpy(Maux,Aa,n2);
462:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
463:     SlepcCheckLapackInfo("geev",info);
464:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
465:     PetscMalloc2(2*n,&rwork,lwork,&work);
466:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
467:     PetscFree2(rwork,work);
468: #endif
469:     SlepcCheckLapackInfo("geev",info);
470: #else /* defined(PETSC_HAVE_ESSL) */
471:     PetscBLASIntCast(4*n,&lwork);
472:     PetscMalloc2(lwork,&rwork,2*n,&wri);
473: #if !defined(PETSC_USE_COMPLEX)
474:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,sMaux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
475:     for (i=0;i<n;i++) {
476:       wr[i] = wri[2*i];
477:       wi[i] = wri[2*i+1];
478:     }
479: #else
480:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,Maux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
481:     for (i=0;i<n;i++) wr[i] = wri[i];
482: #endif
483:     PetscFree2(rwork,wri);
484: #endif
485:     PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

487:     shift = PetscRealPart(wr[0]);
488:     for (i=1;i<n;i++) {
489:       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
490:     }
491:     PetscFree2(wr,wi);
492:   }
493:   /* shift so that largest real part is (about) 0 */
494:   PetscArraycpy(sMaux,Aa,n2);
495:   if (shift) {
496:     for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
497:     PetscLogFlops(1.0*n);
498:   }
499: #if defined(PETSC_USE_COMPLEX)
500:   PetscArraycpy(Maux,Aa,n2);
501:   if (shift) {
502:     for (i=0;i<n;i++) Maux[i+i*n] -= shift;
503:     PetscLogFlops(1.0*n);
504:   }
505: #endif

507:   /* estimate norm(A) and select the scaling factor */
508:   nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
509:   PetscLogFlops(1.0*n*n);
510:   sexpm_params(nrm,&s,&k,&m);
511:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
512:     if (shift) expshift = PetscExpReal(shift);
513:     for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
514:     if (shift) {
515:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
516:       PetscLogFlops(1.0*(n+n2));
517:     } else {
518:       PetscLogFlops(1.0*n);
519:     }
520:     PetscArraycpy(Ba,sMaux,n2);
521:     PetscFree2(sMaux,Maux);
522:     MatDenseRestoreArrayRead(A,&Aa);
523:     MatDenseRestoreArray(B,&Ba);
524:     return(0); /* quick return */
525:   }

527:   PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
528:   expmA2 = expmA; RR2 = RR;
529:   /* scale matrix */
530: #if !defined(PETSC_USE_COMPLEX)
531:   for (i=0;i<n2;i++) {
532:     As[i] = sMaux[i];
533:   }
534: #else
535:   PetscArraycpy(As,sMaux,n2);
536: #endif
537:   scale = 1.0/PetscPowRealInt(2.0,s);
538:   PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
539:   SlepcLogFlopsComplex(1.0*n2);

541:   /* evaluate Pade approximant (partial fraction or product form) */
542:   if (fn->method==3 || !m) { /* partial fraction */
543:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
544:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
545:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
546:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
547:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
548:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

550:     PetscArrayzero(expmA,n2);
551: #if !defined(PETSC_USE_COMPLEX)
552:     isreal = PETSC_TRUE;
553: #else
554:     getisreal(n2,Maux,&isreal);
555: #endif
556:     if (isreal) {
557:       rsizediv2 = irsize/2;
558:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
559:         PetscArraycpy(Maux,As,n2);
560:         PetscArrayzero(RR,n2);
561:         for (j=0;j<n;j++) {
562:           Maux[j+j*n] -= p[2*i];
563:           RR[j+j*n] = r[2*i];
564:         }
565:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
566:         SlepcCheckLapackInfo("gesv",info);
567:         for (j=0;j<n2;j++) {
568:           expmA[j] += RR[j] + PetscConj(RR[j]);
569:         }
570:         /* loop(n) + gesv + loop(n2) */
571:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
572:       }

574:       mod = ipsize % 2;
575:       if (mod) {
576:         PetscArraycpy(Maux,As,n2);
577:         PetscArrayzero(RR,n2);
578:         for (j=0;j<n;j++) {
579:           Maux[j+j*n] -= p[ipsize-1];
580:           RR[j+j*n] = r[irsize-1];
581:         }
582:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
583:         SlepcCheckLapackInfo("gesv",info);
584:         for (j=0;j<n2;j++) {
585:           expmA[j] += RR[j];
586:         }
587:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
588:       }
589:     } else { /* complex */
590:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
591:         PetscArraycpy(Maux,As,n2);
592:         PetscArrayzero(RR,n2);
593:         for (j=0;j<n;j++) {
594:           Maux[j+j*n] -= p[i];
595:           RR[j+j*n] = r[i];
596:         }
597:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
598:         SlepcCheckLapackInfo("gesv",info);
599:         for (j=0;j<n2;j++) {
600:           expmA[j] += RR[j];
601:         }
602:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
603:       }
604:     }
605:     for (i=0;i<iremainsize;i++) {
606:       if (!i) {
607:         PetscArrayzero(RR,n2);
608:         for (j=0;j<n;j++) {
609:           RR[j+j*n] = remainterm[iremainsize-1];
610:         }
611:       } else {
612:         PetscArraycpy(RR,As,n2);
613:         for (j=1;j<i;j++) {
614:           PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
615:           SWAP(RR,Maux,aux);
616:           SlepcLogFlopsComplex(2.0*n*n*n);
617:         }
618:         PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
619:         SlepcLogFlopsComplex(1.0*n2);
620:       }
621:       for (j=0;j<n2;j++) {
622:         expmA[j] += RR[j];
623:       }
624:       SlepcLogFlopsComplex(1.0*n2);
625:     }
626:     PetscFree3(r,p,remainterm);
627:   } else { /* product form, default */
628:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
629:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
630:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
631:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
632:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

634:     PetscArrayzero(expmA,n2);
635:     for (i=0;i<n;i++) { /* initialize */
636:       expmA[i+i*n] = 1.0;
637:     }
638:     minlen = PetscMin(irsize,ipsize);
639:     for (i=0;i<minlen;i++) {
640:       PetscArraycpy(RR,As,n2);
641:       for (j=0;j<n;j++) {
642:         RR[j+j*n] -= rootp[i];
643:       }
644:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
645:       SWAP(expmA,Maux,aux);
646:       PetscArraycpy(RR,As,n2);
647:       for (j=0;j<n;j++) {
648:         RR[j+j*n] -= rootq[i];
649:       }
650:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
651:       SlepcCheckLapackInfo("gesv",info);
652:       /* loop(n) + gemm + loop(n) + gesv */
653:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
654:     }
655:     /* extra numerator */
656:     for (i=minlen;i<irsize;i++) {
657:       PetscArraycpy(RR,As,n2);
658:       for (j=0;j<n;j++) {
659:         RR[j+j*n] -= rootp[i];
660:       }
661:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
662:       SWAP(expmA,Maux,aux);
663:       SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
664:     }
665:     /* extra denominator */
666:     for (i=minlen;i<ipsize;i++) {
667:       PetscArraycpy(RR,As,n2);
668:       for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
669:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
670:       SlepcCheckLapackInfo("gesv",info);
671:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
672:     }
673:     PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
674:     SlepcLogFlopsComplex(1.0*n2);
675:     PetscFree2(rootp,rootq);
676:   }

678: #if !defined(PETSC_USE_COMPLEX)
679:   for (i=0;i<n2;i++) {
680:     Ba2[i] = PetscRealPartComplex(expmA[i]);
681:   }
682: #else
683:   PetscArraycpy(Ba2,expmA,n2);
684: #endif

686:   /* perform repeated squaring */
687:   for (i=0;i<s;i++) { /* final squaring */
688:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
689:     SWAP(Ba2,sMaux,saux);
690:     PetscLogFlops(2.0*n*n*n);
691:   }
692:   if (Ba2!=Ba) {
693:     PetscArraycpy(Ba,Ba2,n2);
694:     sMaux = Ba2;
695:   }
696:   if (shift) {
697:     expshift = PetscExpReal(shift);
698:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
699:     PetscLogFlops(1.0*n2);
700:   }

702:   /* restore pointers */
703:   Maux = Maux2; expmA = expmA2; RR = RR2;
704:   PetscFree2(sMaux,Maux);
705:   PetscFree4(expmA,As,RR,piv);
706:   MatDenseRestoreArrayRead(A,&Aa);
707:   MatDenseRestoreArray(B,&Ba);
708:   return(0);
709: #endif
710: }

712: #define SMALLN 100

714: /*
715:  * Function needed to compute optimal parameters (required workspace is 3*n*n)
716:  */
717: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
718: {
719:   PetscScalar    *Ascaled=work;
720:   PetscReal      nrm,alpha,beta,rwork[1];
721:   PetscInt       t;
722:   PetscBLASInt   i,j;

726:   beta = PetscPowReal(coeff,1.0/(2*m+1));
727:   for (i=0;i<n;i++)
728:     for (j=0;j<n;j++)
729:       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
730:   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
731:   PetscLogFlops(2.0*n*n);
732:   SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
733:   alpha /= nrm;
734:   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
735:   PetscFunctionReturn(t);
736: }

738: /*
739:  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
740:  */
741: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
742: {
743:   PetscErrorCode  ierr;
744:   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
745:   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
746:   PetscBLASInt    n_=0,n2,one=1;
747:   PetscRandom     rand;
748:   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
749:                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
750:   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
751:                                2.539398330063230e-001,    /* m = 5  */
752:                                9.504178996162932e-001,    /* m = 7  */
753:                                2.097847961257068e+000,    /* m = 9  */
754:                                5.371920351148152e+000 };  /* m = 13 */

757:   *s = 0;
758:   *m = 13;
759:   PetscBLASIntCast(n,&n_);
760:   PetscRandomCreate(PETSC_COMM_SELF,&rand);
761:   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
762:   if (d4==0.0) { /* safeguard for the case A = 0 */
763:     *m = 3;
764:     goto done;
765:   }
766:   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
767:   PetscLogFlops(2.0*n*n);
768:   eta1 = PetscMax(d4,d6);
769:   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
770:     *m = 3;
771:     goto done;
772:   }
773:   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
774:     *m = 5;
775:     goto done;
776:   }
777:   if (n<SMALLN) {
778:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
779:     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
780:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
781:   } else {
782:     SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
783:     d8 = PetscPowReal(d8,1.0/8.0);
784:   }
785:   eta3 = PetscMax(d6,d8);
786:   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
787:     *m = 7;
788:     goto done;
789:   }
790:   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
791:     *m = 9;
792:     goto done;
793:   }
794:   if (n<SMALLN) {
795:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
796:     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
797:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
798:   } else {
799:     SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
800:     d10 = PetscPowReal(d10,1.0/10.0);
801:   }
802:   eta4 = PetscMax(d8,d10);
803:   eta5 = PetscMin(eta3,eta4);
804:   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
805:   if (*s) {
806:     Ascaled = work+3*n*n;
807:     n2 = n_*n_;
808:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
809:     sfactor = PetscPowRealInt(2.0,-(*s));
810:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
811:     PetscLogFlops(1.0*n*n);
812:   } else Ascaled = A;
813:   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
814: done:
815:   PetscRandomDestroy(&rand);
816:   return(0);
817: }

819: /*
820:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
821:  *
822:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
823:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
824:  */
825: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
826: {
827:   PetscErrorCode    ierr;
828:   PetscBLASInt      n_=0,n2,*ipiv,info,one=1;
829:   PetscInt          n,m,j,s;
830:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
831:   PetscScalar       *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
832:   const PetscScalar *Aa,*c;
833:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
834:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
835:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
836:   const PetscScalar c9[10]  = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
837:                                 2162160, 110880, 3960, 90, 1 };
838:   const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
839:                                 1187353796428800.0,  129060195264000.0,   10559470521600.0,
840:                                 670442572800.0,      33522128640.0,       1323241920.0,
841:                                 40840800,          960960,            16380,  182,  1 };

844:   MatDenseGetArrayRead(A,&Aa);
845:   MatDenseGetArray(B,&Ba);
846:   MatGetSize(A,&n,NULL);
847:   PetscBLASIntCast(n,&n_);
848:   n2 = n_*n_;
849:   PetscMalloc2(8*n*n,&work,n,&ipiv);

851:   /* Matrix powers */
852:   Apowers[0] = work;                  /* Apowers[0] = A   */
853:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
854:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
855:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
856:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */

858:   PetscArraycpy(Apowers[0],Aa,n2);
859:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
860:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
861:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
862:   PetscLogFlops(6.0*n*n*n);

864:   /* Compute scaling parameter and order of Pade approximant */
865:   expm_params(n,Apowers,&s,&m,Apowers[4]);

867:   if (s) { /* rescale */
868:     for (j=0;j<4;j++) {
869:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
870:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
871:     }
872:     PetscLogFlops(4.0*n*n);
873:   }

875:   /* Evaluate the Pade approximant */
876:   switch (m) {
877:     case 3:  c = c3;  break;
878:     case 5:  c = c5;  break;
879:     case 7:  c = c7;  break;
880:     case 9:  c = c9;  break;
881:     case 13: c = c13; break;
882:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
883:   }
884:   P = Ba;
885:   Q = Apowers[4] + n*n;
886:   W = Q + n*n;
887:   switch (m) {
888:     case 3:
889:     case 5:
890:     case 7:
891:     case 9:
892:       if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
893:       PetscArrayzero(P,n2);
894:       PetscArrayzero(Q,n2);
895:       for (j=0;j<n;j++) {
896:         P[j+j*n] = c[1];
897:         Q[j+j*n] = c[0];
898:       }
899:       for (j=m;j>=3;j-=2) {
900:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
901:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
902:         PetscLogFlops(4.0*n*n);
903:       }
904:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
905:       PetscLogFlops(2.0*n*n*n);
906:       SWAP(P,W,aux);
907:       break;
908:     case 13:
909:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
910:               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
911:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
912:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
913:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
914:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
915:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
916:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
917:       PetscArrayzero(P,n2);
918:       for (j=0;j<n;j++) P[j+j*n] = c[1];
919:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
920:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
921:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
922:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
923:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
924:       PetscLogFlops(7.0*n*n+2.0*n*n*n);
925:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
926:               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
927:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
928:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
929:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
930:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
931:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
932:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
933:       PetscArrayzero(Q,n2);
934:       for (j=0;j<n;j++) Q[j+j*n] = c[0];
935:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
936:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
937:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
938:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
939:       PetscLogFlops(7.0*n*n);
940:       break;
941:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
942:   }
943:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
944:   PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
945:   SlepcCheckLapackInfo("gesv",info);
946:   PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
947:   for (j=0;j<n;j++) P[j+j*n] += 1.0;
948:   PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);

950:   /* Squaring */
951:   for (j=1;j<=s;j++) {
952:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
953:     SWAP(P,W,aux);
954:   }
955:   if (P!=Ba) { PetscArraycpy(Ba,P,n2); }
956:   PetscLogFlops(2.0*n*n*n*s);

958:   PetscFree2(work,ipiv);
959:   MatDenseRestoreArrayRead(A,&Aa);
960:   MatDenseRestoreArray(B,&Ba);
961:   return(0);
962: }

964: #if defined(PETSC_HAVE_CUDA)
965: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
966: #include <slepccublas.h>

968: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
969: {
971:   PetscBLASInt   n=0,ld,ld2,*d_ipiv,*d_info,info,one=1,zero=0;
972:   PetscInt       m,k,sexp;
973:   PetscBool      odd;
974:   const PetscInt p=MAX_PADE;
975:   PetscReal      c[MAX_PADE+1],s;
976:   PetscScalar    scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
977:   PetscScalar    *Aa,*Ba;
978:   PetscScalar    *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
979:   cublasHandle_t cublasv2handle;
980:   cublasStatus_t cberr;
981:   cudaError_t    cerr;

984:   PetscCUBLASGetHandle(&cublasv2handle);
985:   MatDenseGetArray(A,&Aa);
986:   MatDenseGetArray(B,&Ba);
987:   MatGetSize(A,&m,NULL);
988:   PetscBLASIntCast(m,&n);
989:   ld  = n;
990:   ld2 = ld*ld;

992:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
993:   cerr = cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
994:   cerr = cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
995:   cerr = cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
996:   cerr = cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
997:   cerr = cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld);CHKERRCUDA(cerr);
998:   cerr = cudaMalloc((void **)&d_info,sizeof(PetscBLASInt));CHKERRCUDA(cerr);
999:   cerr = cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*));CHKERRCUDA(cerr);
1000:   cerr = cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*));CHKERRCUDA(cerr);

1002:   PetscMalloc(sizeof(PetscScalar*),&ppP);
1003:   PetscMalloc(sizeof(PetscScalar*),&ppQ);

1005:   cerr = cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1006:   cerr = cudaMemcpy(d_Ba,Ba,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1007:   d_P = d_Ba;

1009:   /* Pade' coefficients */
1010:   c[0] = 1.0;
1011:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

1013:   /* Scaling */
1014:   cberr = cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);CHKERRCUBLAS(cberr);
1015:   if (s>0.5) {
1016:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1017:     scale = PetscPowRealInt(2.0,-sexp);
1018:     cberr = cublasXscal(cublasv2handle,ld2,&scale,d_As,one);CHKERRCUBLAS(cberr);
1019:     PetscLogFlops(1.0*n*n);
1020:   } else sexp = 0;

1022:   /* Horner evaluation */
1023:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);CHKERRCUBLAS(cberr);
1024:   PetscLogFlops(2.0*n*n*n);
1025:   cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1026:   cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1027:   set_diagonal(n,d_Q,ld,c[p]);CHKERRQ(cerr);
1028:   set_diagonal(n,d_P,ld,c[p-1]);CHKERRQ(cerr);

1030:   odd = PETSC_TRUE;
1031:   for (k=p-1;k>0;k--) {
1032:     if (odd) {
1033:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1034:       SWAP(d_Q,d_W,aux);
1035:       shift_diagonal(n,d_Q,ld,c[k-1]);CHKERRQ(cerr);
1036:       odd = PETSC_FALSE;
1037:     } else {
1038:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1039:       SWAP(d_P,d_W,aux);
1040:       shift_diagonal(n,d_P,ld,c[k-1]);CHKERRQ(cerr);
1041:       odd = PETSC_TRUE;
1042:     }
1043:     PetscLogFlops(2.0*n*n*n);
1044:   }
1045:   if (odd) {
1046:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1047:     SWAP(d_Q,d_W,aux);
1048:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);

1050:     ppQ[0] = d_Q;
1051:     ppP[0] = d_P;
1052:     cerr = cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1053:     cerr = cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

1055:     cberr = cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);CHKERRCUBLAS(cberr);
1056:     cerr = cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1057:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
1058:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
1059: #if defined (CUDA_VERSION) && CUDA_VERSION >= 5050
1060:     cberr = cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);CHKERRCUBLAS(cberr);
1061:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
1062:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
1063: #else
1064:     SETERRQ(communicator,PETSC_ERR_LIB,"cublasXgetrsBatched needs CUDA >= 7");
1065: #endif
1066:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1067:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1068:     cberr = cublasXscal(cublasv2handle,ld2,&smone,d_P,one);CHKERRCUBLAS(cberr);
1069:   } else {
1070:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1071:     SWAP(d_P,d_W,aux);
1072:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);

1074:     ppQ[0] = d_Q;
1075:     ppP[0] = d_P;
1076:     cerr = cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1077:     cerr = cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

1079:     cberr = cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);CHKERRCUBLAS(cberr);
1080:     cerr = cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1081:     if (info < 0) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
1082:     if (info > 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
1083: #if defined (CUDA_VERSION) && CUDA_VERSION >= 5050
1084:     cberr = cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);CHKERRCUBLAS(cberr);
1085:     if (info < 0) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
1086:     if (info > 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
1087: #else
1088:     SETERRQ(communicator,PETSC_ERR_LIB,"cublasXgetrsBatched needs CUDA >= 7");
1089: #endif
1090:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1091:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1092:   }
1093:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

1095:   for (k=1;k<=sexp;k++) {
1096:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1097:     cerr = cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1098:   }
1099:   if (d_P!=d_Ba) {
1100:     cerr = cudaMemcpy(Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1101:   } else {
1102:     cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1103:   }
1104:   PetscLogFlops(2.0*n*n*n*sexp);

1106:   cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1107:   cerr = cudaFree(d_Q);CHKERRCUDA(cerr);
1108:   cerr = cudaFree(d_W);CHKERRCUDA(cerr);
1109:   cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1110:   cerr = cudaFree(d_A2);CHKERRCUDA(cerr);
1111:   cerr = cudaFree(d_ipiv);CHKERRCUDA(cerr);
1112:   cerr = cudaFree(d_info);CHKERRCUDA(cerr);
1113:   cerr = cudaFree(d_ppP);CHKERRCUDA(cerr);
1114:   cerr = cudaFree(d_ppQ);CHKERRCUDA(cerr);

1116:   PetscFree(ppP);
1117:   PetscFree(ppQ);

1119:   MatDenseRestoreArray(A,&Aa);
1120:   MatDenseRestoreArray(B,&Ba);
1121:   return(0);
1122: }

1124: #if defined(PETSC_HAVE_MAGMA)
1125: #include <slepcmagma.h>

1127: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1128: {
1130:   PetscBLASInt   n=0,ld,ld2,*piv,info,one=1,zero=0;
1131:   PetscInt       m,k,sexp;
1132:   PetscBool      odd;
1133:   const PetscInt p=MAX_PADE;
1134:   PetscReal      c[MAX_PADE+1],s;
1135:   PetscScalar    scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1136:   PetscScalar    *Aa,*Ba;
1137:   PetscScalar    *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1138:   cublasHandle_t cublasv2handle;
1139:   cublasStatus_t cberr;
1140:   cudaError_t    cerr;
1141:   magma_int_t    mierr;

1144:   PetscCUBLASGetHandle(&cublasv2handle);
1145:   magma_init();
1146:   MatDenseGetArray(A,&Aa);
1147:   MatDenseGetArray(B,&Ba);
1148:   MatGetSize(A,&m,NULL);
1149:   PetscBLASIntCast(m,&n);
1150:   ld  = n;
1151:   ld2 = ld*ld;

1153:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1154:   cerr = cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1155:   cerr = cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1156:   cerr = cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1157:   cerr = cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);

1159:   PetscMalloc(sizeof(PetscInt)*n,&piv);

1161:   cerr = cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1162:   cerr = cudaMemcpy(d_Ba,Ba,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1163:   d_P = d_Ba;

1165:   /* Pade' coefficients */
1166:   c[0] = 1.0;
1167:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

1169:   /* Scaling */
1170:   cberr = cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);CHKERRCUBLAS(cberr);
1171:   PetscLogFlops(1.0*n*n);

1173:   if (s>0.5) {
1174:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1175:     scale = PetscPowRealInt(2.0,-sexp);
1176:     cberr = cublasXscal(cublasv2handle,ld2,&scale,d_As,one);CHKERRCUBLAS(cberr);
1177:     PetscLogFlops(1.0*n*n);
1178:   } else sexp = 0;

1180:   /* Horner evaluation */
1181:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);CHKERRCUBLAS(cberr);
1182:   PetscLogFlops(2.0*n*n*n);
1183:   cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1184:   cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1185:   set_diagonal(n,d_Q,ld,c[p]);CHKERRQ(cerr);
1186:   set_diagonal(n,d_P,ld,c[p-1]);CHKERRQ(cerr);

1188:   odd = PETSC_TRUE;
1189:   for (k=p-1;k>0;k--) {
1190:     if (odd) {
1191:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1192:       SWAP(d_Q,d_W,aux);
1193:       shift_diagonal(n,d_Q,ld,c[k-1]);CHKERRQ(cerr);
1194:       odd = PETSC_FALSE;
1195:     } else {
1196:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1197:       SWAP(d_P,d_W,aux);
1198:       shift_diagonal(n,d_P,ld,c[k-1]);CHKERRQ(cerr);
1199:       odd = PETSC_TRUE;
1200:     }
1201:     PetscLogFlops(2.0*n*n*n);
1202:   }
1203:   if (odd) {
1204:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1205:     SWAP(d_Q,d_W,aux);
1206:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1207:     mmagma_xgesv_gpu(n,n,d_Q,ld,piv,d_P,ld,&info);CHKERRMAGMA(mierr);
1208:     if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);
1209:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1210:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1211:     cberr = cublasXscal(cublasv2handle,ld2,&smone,d_P,one);CHKERRCUBLAS(cberr);
1212:   } else {
1213:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1214:     SWAP(d_P,d_W,aux);
1215:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1216:     mmagma_xgesv_gpu(n,n,d_Q,ld,piv,d_P,ld,&info);CHKERRMAGMA(mierr);
1217:     if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);
1218:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1219:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1220:   }
1221:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

1223:   for (k=1;k<=sexp;k++) {
1224:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1225:     cerr = cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1226:   }
1227:   if (d_P!=d_Ba) {
1228:     cerr = cudaMemcpy(Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1229:   } else {
1230:     cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1231:   }
1232:   PetscLogFlops(2.0*n*n*n*sexp);

1234:   cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1235:   cerr = cudaFree(d_Q);CHKERRCUDA(cerr);
1236:   cerr = cudaFree(d_W);CHKERRCUDA(cerr);
1237:   cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1238:   cerr = cudaFree(d_A2);CHKERRCUDA(cerr);
1239:   PetscFree(piv);

1241:   MatDenseRestoreArray(A,&Aa);
1242:   MatDenseRestoreArray(B,&Ba);
1243:   magma_finalize();
1244:   return(0);
1245: }

1247: /*
1248:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1249:  *
1250:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
1251:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1252:  */
1253: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1254: {
1255:   PetscErrorCode    ierr;
1256:   PetscBLASInt      n_=0,n2,*ipiv,info,one=1;
1257:   PetscInt          n,m,j,s,zero=0;
1258:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1259:   PetscScalar       *Aa,*Ba,*d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1260:   const PetscScalar *c;
1261:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
1262:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
1263:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1264:   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1265:     2162160, 110880, 3960, 90, 1 };
1266:   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1267:     1187353796428800,  129060195264000,   10559470521600,
1268:     670442572800,      33522128640,       1323241920,
1269:     40840800,          960960,            16380,  182,  1 };
1270:   cublasHandle_t    cublasv2handle;
1271:   cublasStatus_t    cberr;
1272:   cudaError_t       cerr;
1273:   magma_int_t       mierr;

1276:   PetscCUBLASGetHandle(&cublasv2handle);
1277:   magma_init();
1278:   MatDenseGetArray(A,&Aa);
1279:   MatDenseGetArray(B,&Ba);
1280:   MatGetSize(A,&n,NULL);
1281:   PetscBLASIntCast(n,&n_);
1282:   n2 = n_*n_;
1283:   PetscMalloc2(8*n*n,&work,n,&ipiv);
1284:   cudaMalloc((void**)&d_work,8*n*n*sizeof(PetscScalar));
1285:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*n*n);CHKERRCUDA(cerr);
1286:   cudaMemcpy(d_Ba,Ba,n2*sizeof(PetscScalar),cudaMemcpyHostToDevice);

1288:   /* Matrix powers */
1289:   Apowers[0] = work;                  /* Apowers[0] = A   */
1290:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
1291:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
1292:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
1293:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */
1294:   /* Matrix powers on device */
1295:   d_Apowers[0] = d_work;                /* d_Apowers[0] = A   */
1296:   d_Apowers[1] = d_Apowers[0] + n*n;    /* d_Apowers[1] = A^2 */
1297:   d_Apowers[2] = d_Apowers[1] + n*n;    /* d_Apowers[2] = A^4 */
1298:   d_Apowers[3] = d_Apowers[2] + n*n;    /* d_Apowers[3] = A^6 */
1299:   d_Apowers[4] = d_Apowers[3] + n*n;    /* d_Apowers[4] = A^8 */

1301:   cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyHostToDevice);
1302:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_);CHKERRCUBLAS(cberr);
1303:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_);CHKERRCUBLAS(cberr);
1304:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_);CHKERRCUBLAS(cberr);
1305:   PetscLogFlops(6.0*n*n*n);

1307:   cudaMemcpy(Apowers[0],d_Apowers[0],4*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1308:   /* Compute scaling parameter and order of Pade approximant */
1309:   expm_params(n,Apowers,&s,&m,Apowers[4]);

1311:   if (s) { /* rescale */
1312:     for (j=0;j<4;j++) {
1313:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1314:       cberr = cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one);CHKERRCUBLAS(cberr);
1315:     }
1316:     PetscLogFlops(4.0*n*n);
1317:   }

1319:   /* Evaluate the Pade approximant */
1320:   switch (m) {
1321:     case 3:  c = c3;  break;
1322:     case 5:  c = c5;  break;
1323:     case 7:  c = c7;  break;
1324:     case 9:  c = c9;  break;
1325:     case 13: c = c13; break;
1326:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1327:   }
1328:   d_P = d_Ba;
1329:   d_Q = d_Apowers[4] + n*n;
1330:   d_W = d_Q + n*n;
1331:   switch (m) {
1332:     case 3:
1333:     case 5:
1334:     case 7:
1335:     case 9:
1336:       if (m==9) {cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_);CHKERRCUBLAS(cberr);}
1337:       cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1338:       cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1339:       set_diagonal(n,d_P,n,c[1]);CHKERRQ(cerr);
1340:       set_diagonal(n,d_Q,n,c[0]);CHKERRQ(cerr);
1341:       for (j=m;j>=3;j-=2) {
1342:         cberr = cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one);CHKERRCUBLAS(cberr);
1343:         cberr = cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one);CHKERRCUBLAS(cberr);
1344:         PetscLogFlops(4.0*n*n);
1345:       }
1346:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1347:       PetscLogFlops(2.0*n*n*n);
1348:       SWAP(d_P,d_W,aux);
1349:       break;
1350:     case 13:
1351:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1352:           + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
1353:       cerr = cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1354:       cberr = cublasXscal(cublasv2handle,n2,&c[13],d_P,one);CHKERRCUBLAS(cberr);
1355:       cberr = cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one);CHKERRCUBLAS(cberr);
1356:       cberr = cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one);CHKERRCUBLAS(cberr);
1357:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1358:       PetscLogFlops(5.0*n*n+2.0*n*n*n);

1360:       cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1361:       set_diagonal(n,d_P,n,c[1]);CHKERRQ(cerr);
1362:       cberr = cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one);CHKERRCUBLAS(cberr);
1363:       cberr = cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one);CHKERRCUBLAS(cberr);
1364:       cberr = cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one);CHKERRCUBLAS(cberr);
1365:       cberr = cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one);CHKERRCUBLAS(cberr);
1366:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_);CHKERRCUBLAS(cberr);
1367:       PetscLogFlops(7.0*n*n+2.0*n*n*n);
1368:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1369:           + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
1370:       cerr = cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1371:       cberr = cublasXscal(cublasv2handle,n2,&c[12],d_Q,one);CHKERRCUBLAS(cberr);
1372:       cberr = cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one);CHKERRCUBLAS(cberr);
1373:       cberr = cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one);CHKERRCUBLAS(cberr);
1374:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1375:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
1376:       cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1377:       set_diagonal(n,d_Q,n,c[0]);CHKERRQ(cerr);
1378:       cberr = cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one);CHKERRCUBLAS(cberr);
1379:       cberr = cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one);CHKERRCUBLAS(cberr);
1380:       cberr = cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one);CHKERRCUBLAS(cberr);
1381:       cberr = cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one);CHKERRCUBLAS(cberr);
1382:       PetscLogFlops(7.0*n*n);
1383:       break;
1384:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1385:   }
1386:   cberr = cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);

1388:   mmagma_xgesv_gpu(n_,n_,d_Q,n_,ipiv,d_P,n_,&info);CHKERRMAGMA(mierr);
1389:   if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);

1391:   cberr = cublasXscal(cublasv2handle,n2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1392:   shift_diagonal(n,d_P,n,sone);
1393:   PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);

1395:   /* Squaring */
1396:   for (j=1;j<=s;j++) {
1397:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1398:     SWAP(d_P,d_W,aux);
1399:   }
1400:   if (d_P!=d_Ba) {
1401:     cerr = cudaMemcpy(Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1402:   } else {
1403:     cerr = cudaMemcpy(Ba,d_Ba,n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1404:   }
1405:   PetscLogFlops(2.0*n*n*n*s);

1407:   PetscFree2(work,ipiv);
1408:   MatDenseRestoreArray(A,&Aa);
1409:   MatDenseRestoreArray(B,&Ba);
1410:   magma_finalize();
1411:   return(0);
1412: }

1414: /*
1415:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1416:  * and Yuji Nakatsukasa
1417:  *
1418:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1419:  *     Approximation for the Matrix Exponential",
1420:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1421:  *     https://doi.org/10.1137/15M1027553
1422:  */
1423: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1424: {
1425:   PetscInt       i,j,n_,s,k,m,zero=0,mod;
1426:   PetscBLASInt   n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,info,*piv,minlen,lwork=0,one=1;
1427:   PetscReal      nrm,shift=0.0,rone=1.0,rzero=0.0;
1428: #if defined(PETSC_USE_COMPLEX)
1429:   PetscReal      *rwork=NULL;
1430: #endif
1431:   PetscComplex   *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1432:   PetscScalar    *Aa,*Ba,*d_Ba,*d_Ba2,*Maux,*sMaux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1434:   PetscBool      isreal,*d_isreal,flg;
1435:   cublasHandle_t cublasv2handle;
1436:   cudaError_t    cerr;
1437:   cublasStatus_t cberr;
1438:   magma_int_t    mierr;

1441:   PetscCUBLASGetHandle(&cublasv2handle);
1442:   magma_init();
1443:   MatGetSize(A,&n_,NULL);
1444:   PetscBLASIntCast(n_,&n);
1445:   MatDenseGetArray(A,&Aa);
1446:   MatDenseGetArray(B,&Ba);
1447:   PetscBLASIntCast(n*n,&n2);

1449:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1450:   cerr = cudaMemcpy(d_Ba,Ba,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1451:   d_Ba2 = d_Ba;

1453:   PetscMalloc2(n2,&sMaux,n2,&Maux);
1454:   cerr = cudaMalloc((void **)&d_isreal,sizeof(PetscBool));CHKERRCUDA(cerr);
1455:   cerr = cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1456:   cerr = cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);

1458:   cerr = cudaMemcpy(d_sMaux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1459:   d_Maux2 = d_Maux;
1460:   PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
1461:   if (!flg) {
1462:     PetscMalloc2(n,&wr,n,&wi);
1463:     PetscArraycpy(sMaux,Aa,n2);
1464:     /* estimate rightmost eigenvalue and shift A with it */
1465: #if !defined(PETSC_USE_COMPLEX)
1466:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,sMaux,n,wr,wi,NULL,n,NULL,n,&work1,query,&info);CHKERRMAGMA(mierr);
1467:     SlepcCheckLapackInfo("geev",info);
1468:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1469:     PetscMalloc1(lwork,&work);
1470:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,sMaux,n,wr,wi,NULL,n,NULL,n,work,lwork,&info);CHKERRMAGMA(mierr);
1471:     PetscFree(work);
1472: #else
1473:     PetscArraycpy(Maux,Aa,n2);
1474:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork,&info);CHKERRMAGMA(mierr);
1475:     SlepcCheckLapackInfo("geev",info);
1476:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1477:     PetscMalloc2(2*n,&rwork,lwork,&work);
1478:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork,&info);CHKERRMAGMA(mierr);
1479:     PetscFree2(rwork,work);
1480: #endif
1481:     SlepcCheckLapackInfo("geev",info);
1482:     PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

1484:     shift = PetscRealPart(wr[0]);
1485:     for (i=1;i<n;i++) {
1486:       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1487:     }
1488:     PetscFree2(wr,wi);
1489:   }
1490:   /* shift so that largest real part is (about) 0 */
1491:   cerr = cudaMemcpy(d_sMaux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1492:   if (shift) {
1493:     shift_diagonal(n,d_sMaux,n,-shift);
1494:     PetscLogFlops(1.0*n);
1495:   }
1496: #if defined(PETSC_USE_COMPLEX)
1497:   cerr = cudaMemcpy(d_Maux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1498:   if (shift) {
1499:     shift_diagonal(n,d_Maux,n,-shift);
1500:     PetscLogFlops(1.0*n);
1501:   }
1502: #endif

1504:   /* estimate norm(A) and select the scaling factor */
1505:   cberr = cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm);CHKERRCUBLAS(cberr);
1506:   PetscLogFlops(2.0*n*n);
1507:   sexpm_params(nrm,&s,&k,&m);
1508:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1509:     if (shift) expshift = PetscExpReal(shift);
1510:     shift_Cdiagonal(n,d_Maux,n,rone,rzero);
1511:     if (shift) {
1512:       cberr = cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one);CHKERRCUBLAS(cberr);
1513:       PetscLogFlops(1.0*(n+n2));
1514:     } else {
1515:       PetscLogFlops(1.0*n);
1516:     }
1517:     cerr = cudaMemcpy(Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1518:     cerr = cudaFree(d_sMaux);CHKERRCUDA(cerr);
1519:     cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1520:     MatDenseRestoreArray(A,&Aa);
1521:     MatDenseRestoreArray(B,&Ba);
1522:     return(0); /* quick return */
1523:   }

1525:   cerr = cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1526:   cerr = cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1527:   cerr = cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1528:   d_expmA2 = d_expmA; d_RR2 = d_RR;
1529:   PetscMalloc1(n,&piv);
1530:   /* scale matrix */
1531: #if !defined(PETSC_USE_COMPLEX)
1532:   copy_array2D_S2C(n,n,d_As,n,d_sMaux,n);
1533: #else
1534:   cerr = cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1535: #endif
1536:   scale = 1.0/PetscPowRealInt(2.0,s);
1537:   cberr = cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one);CHKERRCUBLAS(cberr);
1538:   SlepcLogFlopsComplex(1.0*n2);

1540:   /* evaluate Pade approximant (partial fraction or product form) */
1541:   if (fn->method==8 || !m) { /* partial fraction */
1542:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
1543:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1544:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1545:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
1546:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
1547:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

1549:     cerr = cudaMemset(d_expmA,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1550: #if !defined(PETSC_USE_COMPLEX)
1551:     isreal = PETSC_TRUE;
1552: #else
1553:     getisreal_array2D(n,n,d_Maux,n,d_isreal);
1554:     cerr = cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1555: #endif
1556:     if (isreal) {
1557:       rsizediv2 = irsize/2;
1558:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1559:         cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1560:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1561:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i]));
1562:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i]));
1563:         mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1564:         SlepcCheckLapackInfo("gesv",info);
1565:         add_array2D_Conj(n,n,d_RR,n);
1566:         cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1567:         /* shift(n) + gesv + axpy(n2) */
1568:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
1569:       }

1571:       mod = ipsize % 2;
1572:       if (mod) {
1573:         cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1574:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1575:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1]));
1576:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1]));
1577:         mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1578:         SlepcCheckLapackInfo("gesv",info);
1579:         cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1580:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1581:       }
1582:     } else { /* complex */
1583:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1584:         cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1585:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1586:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i]));
1587:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i]));
1588:         mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1589:         SlepcCheckLapackInfo("gesv",info);
1590:         cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1591:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1592:       }
1593:     }
1594:     for (i=0;i<iremainsize;i++) {
1595:       if (!i) {
1596:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1597:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1]));
1598:       } else {
1599:         cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1600:         for (j=1;j<i;j++) {
1601:           cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1602:           SWAP(d_RR,d_Maux,aux);
1603:           SlepcLogFlopsComplex(2.0*n*n*n);
1604:         }
1605:         cberr = cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one);CHKERRCUBLAS(cberr);
1606:         SlepcLogFlopsComplex(1.0*n2);
1607:       }
1608:       cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1609:       SlepcLogFlopsComplex(1.0*n2);
1610:     }
1611:     PetscFree3(r,p,remainterm);
1612:   } else { /* product form, default */
1613:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
1614:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1615:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1616:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
1617:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

1619:     cerr = cudaMemset(d_expmA,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1620:     set_Cdiagonal(n,d_expmA,n,rone,rzero); /* initialize */
1621:     minlen = PetscMin(irsize,ipsize);
1622:     for (i=0;i<minlen;i++) {
1623:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1624:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1625:       cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1626:       SWAP(d_expmA,d_Maux,aux);
1627:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1628:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1629:       mmagma_Cgesv_gpu(n,n,d_RR,n,piv,d_expmA,n,&info);CHKERRMAGMA(mierr);
1630:       SlepcCheckLapackInfo("gesv",info);
1631:       /* shift(n) + gemm + shift(n) + gesv */
1632:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1633:     }
1634:     /* extra enumerator */
1635:     for (i=minlen;i<irsize;i++) {
1636:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1637:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1638:       cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1639:       SWAP(d_expmA,d_Maux,aux);
1640:       SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
1641:     }
1642:     /* extra denominator */
1643:     for (i=minlen;i<ipsize;i++) {
1644:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1645:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1646:       mmagma_Cgesv_gpu(n,n,d_RR,n,piv,d_expmA,n,&info);CHKERRMAGMA(mierr);
1647:       SlepcCheckLapackInfo("gesv",info);
1648:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1649:     }
1650:     cberr = cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one);CHKERRCUBLAS(cberr);
1651:     SlepcLogFlopsComplex(1.0*n2);
1652:     PetscFree2(rootp,rootq);
1653:   }

1655: #if !defined(PETSC_USE_COMPLEX)
1656:   copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n);
1657: #else
1658:   cerr = cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1659: #endif

1661:   /* perform repeated squaring */
1662:   for (i=0;i<s;i++) { /* final squaring */
1663:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n);CHKERRCUBLAS(cberr);
1664:     SWAP(d_Ba2,d_sMaux,saux);
1665:     PetscLogFlops(2.0*n*n*n);
1666:   }
1667:   if (d_Ba2!=d_Ba) {
1668:     cerr = cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1669:     d_sMaux = d_Ba2;
1670:   }
1671:   if (shift) {
1672:     expshift = PetscExpReal(shift);
1673:     cberr = cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one);CHKERRCUBLAS(cberr);
1674:     PetscLogFlops(1.0*n2);
1675:   }

1677:   cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);

1679:   /* restore pointers */
1680:   d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1681:   cerr = cudaFree(d_isreal);CHKERRCUDA(cerr);
1682:   cerr = cudaFree(d_Maux);CHKERRCUDA(cerr);
1683:   cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1684:   cerr = cudaFree(d_RR);CHKERRCUDA(cerr);
1685:   cerr = cudaFree(d_expmA);CHKERRCUDA(cerr);
1686:   cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1687:   PetscFree(piv);
1688:   PetscFree2(sMaux,Maux);
1689:   MatDenseRestoreArray(A,&Aa);
1690:   MatDenseRestoreArray(B,&Ba);
1691:   magma_finalize();
1692:   return(0);
1693: }
1694: #endif /* PETSC_HAVE_MAGMA */
1695: #endif /* PETSC_HAVE_CUDA */

1697: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1698: {
1700:   PetscBool      isascii;
1701:   char           str[50];
1702:   const char     *methodname[] = {
1703:                   "scaling & squaring, [m/m] Pade approximant (Higham)",
1704:                   "scaling & squaring, [6/6] Pade approximant",
1705:                   "scaling & squaring, subdiagonal Pade approximant (product form)",
1706:                   "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1707: #if defined(PETSC_HAVE_CUDA)
1708:                  ,"scaling & squaring, [6/6] Pade approximant CUDA"
1709: #if defined(PETSC_HAVE_MAGMA)
1710:                  ,"scaling & squaring, [m/m] Pade approximant (Higham) CUDA/MAGMA",
1711:                   "scaling & squaring, [6/6] Pade approximant CUDA/MAGMA",
1712:                   "scaling & squaring, subdiagonal Pade approximant (product form) CUDA/MAGMA",
1713:                   "scaling & squaring, subdiagonal Pade approximant (partial fraction) CUDA/MAGMA",
1714: #endif
1715: #endif
1716:   };
1717:   const int      nmeth=sizeof(methodname)/sizeof(methodname[0]);

1720:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1721:   if (isascii) {
1722:     if (fn->beta==(PetscScalar)1.0) {
1723:       if (fn->alpha==(PetscScalar)1.0) {
1724:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(x)\n");
1725:       } else {
1726:         SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1727:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(%s*x)\n",str);
1728:       }
1729:     } else {
1730:       SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE);
1731:       if (fn->alpha==(PetscScalar)1.0) {
1732:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s*exp(x)\n",str);
1733:       } else {
1734:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s",str);
1735:         PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1736:         SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1737:         PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1738:         PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1739:       }
1740:     }
1741:     if (fn->method<nmeth) {
1742:       PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]);
1743:     }
1744:   }
1745:   return(0);
1746: }

1748: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1749: {
1751:   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
1752:   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
1753:   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1754:   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1755:   fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1756:   fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1757: #if defined(PETSC_HAVE_CUDA)
1758:   fn->ops->evaluatefunctionmat[4] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1759: #if defined(PETSC_HAVE_MAGMA)
1760:   fn->ops->evaluatefunctionmat[5] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1761:   fn->ops->evaluatefunctionmat[6] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1762:   fn->ops->evaluatefunctionmat[7] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1763:   fn->ops->evaluatefunctionmat[8] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1764: #endif
1765: #endif
1766:   fn->ops->view                   = FNView_Exp;
1767:   return(0);
1768: }