Actual source code: fnsqrt.c
slepc-3.15.2 2021-09-20
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Square root function sqrt(x)
12: */
14: #include <slepc/private/fnimpl.h>
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Sqrt(FN fn,PetscScalar x,PetscScalar *y)
18: {
20: #if !defined(PETSC_USE_COMPLEX)
21: if (x<0.0) SETERRQ(PETSC_COMM_SELF,1,"Function not defined in the requested value");
22: #endif
23: *y = PetscSqrtScalar(x);
24: return(0);
25: }
27: PetscErrorCode FNEvaluateDerivative_Sqrt(FN fn,PetscScalar x,PetscScalar *y)
28: {
30: if (x==0.0) SETERRQ(PETSC_COMM_SELF,1,"Derivative not defined in the requested value");
31: #if !defined(PETSC_USE_COMPLEX)
32: if (x<0.0) SETERRQ(PETSC_COMM_SELF,1,"Derivative not defined in the requested value");
33: #endif
34: *y = 1.0/(2.0*PetscSqrtScalar(x));
35: return(0);
36: }
38: PetscErrorCode FNEvaluateFunctionMat_Sqrt_Schur(FN fn,Mat A,Mat B)
39: {
41: PetscBLASInt n=0;
42: PetscScalar *T;
43: PetscInt m;
46: if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
47: MatDenseGetArray(B,&T);
48: MatGetSize(A,&m,NULL);
49: PetscBLASIntCast(m,&n);
50: FNSqrtmSchur(fn,n,T,n,PETSC_FALSE);
51: MatDenseRestoreArray(B,&T);
52: return(0);
53: }
55: PetscErrorCode FNEvaluateFunctionMatVec_Sqrt_Schur(FN fn,Mat A,Vec v)
56: {
58: PetscBLASInt n=0;
59: PetscScalar *T;
60: PetscInt m;
61: Mat B;
64: FN_AllocateWorkMat(fn,A,&B);
65: MatDenseGetArray(B,&T);
66: MatGetSize(A,&m,NULL);
67: PetscBLASIntCast(m,&n);
68: FNSqrtmSchur(fn,n,T,n,PETSC_TRUE);
69: MatDenseRestoreArray(B,&T);
70: MatGetColumnVector(B,v,0);
71: FN_FreeWorkMat(fn,&B);
72: return(0);
73: }
75: PetscErrorCode FNEvaluateFunctionMat_Sqrt_DBP(FN fn,Mat A,Mat B)
76: {
78: PetscBLASInt n=0;
79: PetscScalar *T;
80: PetscInt m;
83: if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
84: MatDenseGetArray(B,&T);
85: MatGetSize(A,&m,NULL);
86: PetscBLASIntCast(m,&n);
87: FNSqrtmDenmanBeavers(fn,n,T,n,PETSC_FALSE);
88: MatDenseRestoreArray(B,&T);
89: return(0);
90: }
92: PetscErrorCode FNEvaluateFunctionMat_Sqrt_NS(FN fn,Mat A,Mat B)
93: {
95: PetscBLASInt n=0;
96: PetscScalar *Ba;
97: PetscInt m;
100: if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
101: MatDenseGetArray(B,&Ba);
102: MatGetSize(A,&m,NULL);
103: PetscBLASIntCast(m,&n);
104: FNSqrtmNewtonSchulz(fn,n,Ba,n,PETSC_FALSE);
105: MatDenseRestoreArray(B,&Ba);
106: return(0);
107: }
109: #define MAXIT 50
111: /*
112: Computes the principal square root of the matrix A using the
113: Sadeghi iteration. A is overwritten with sqrtm(A).
114: */
115: PetscErrorCode FNSqrtmSadeghi(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld)
116: {
117: PetscScalar *M,*M2,*G,*X=A,*work,work1,sqrtnrm;
118: PetscScalar szero=0.0,sone=1.0,smfive=-5.0,s1d16=1.0/16.0;
119: PetscReal tol,Mres=0.0,nrm,rwork[1],done=1.0;
120: PetscBLASInt N,i,it,*piv=NULL,info,lwork=0,query=-1,one=1,zero=0;
121: PetscBool converged=PETSC_FALSE;
123: unsigned int ftz;
126: N = n*n;
127: tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
128: SlepcSetFlushToZero(&ftz);
130: /* query work size */
131: PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,A,&ld,piv,&work1,&query,&info));
132: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
134: PetscMalloc5(N,&M,N,&M2,N,&G,lwork,&work,n,&piv);
135: PetscArraycpy(M,A,N);
137: /* scale M */
138: nrm = LAPACKlange_("fro",&n,&n,M,&n,rwork);
139: if (nrm>1.0) {
140: sqrtnrm = PetscSqrtReal(nrm);
141: PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,M,&N,&info));
142: SlepcCheckLapackInfo("lascl",info);
143: tol *= nrm;
144: }
145: PetscInfo2(fn,"||A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
147: /* X = I */
148: PetscArrayzero(X,N);
149: for (i=0;i<n;i++) X[i+i*ld] = 1.0;
151: for (it=0;it<MAXIT && !converged;it++) {
153: /* G = (5/16)*I + (1/16)*M*(15*I-5*M+M*M) */
154: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,M,&ld,M,&ld,&szero,M2,&ld));
155: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smfive,M,&one,M2,&one));
156: for (i=0;i<n;i++) M2[i+i*ld] += 15.0;
157: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&s1d16,M,&ld,M2,&ld,&szero,G,&ld));
158: for (i=0;i<n;i++) G[i+i*ld] += 5.0/16.0;
160: /* X = X*G */
161: PetscArraycpy(M2,X,N);
162: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,M2,&ld,G,&ld,&szero,X,&ld));
164: /* M = M*inv(G*G) */
165: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,G,&ld,G,&ld,&szero,M2,&ld));
166: PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,M2,&ld,piv,&info));
167: SlepcCheckLapackInfo("getrf",info);
168: PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M2,&ld,piv,work,&lwork,&info));
169: SlepcCheckLapackInfo("getri",info);
171: PetscArraycpy(G,M,N);
172: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,G,&ld,M2,&ld,&szero,M,&ld));
174: /* check ||I-M|| */
175: PetscArraycpy(M2,M,N);
176: for (i=0;i<n;i++) M2[i+i*ld] -= 1.0;
177: Mres = LAPACKlange_("fro",&n,&n,M2,&n,rwork);
178: PetscIsNanReal(Mres);
179: if (Mres<=tol) converged = PETSC_TRUE;
180: PetscInfo2(fn,"it: %D res: %g\n",it,(double)Mres);
181: PetscLogFlops(8.0*n*n*n+2.0*n*n+2.0*n*n*n/3.0+4.0*n*n*n/3.0+2.0*n*n*n+2.0*n*n);
182: }
184: if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",MAXIT);
186: /* undo scaling */
187: if (nrm>1.0) PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sqrtnrm,A,&one));
189: PetscFree5(M,M2,G,work,piv);
190: SlepcResetFlushToZero(&ftz);
191: return(0);
192: }
194: #if defined(PETSC_HAVE_CUDA)
195: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
196: #include <slepccublas.h>
198: #if defined(PETSC_HAVE_MAGMA)
199: #include <slepcmagma.h>
201: /*
202: * Matrix square root by Sadeghi iteration. CUDA version.
203: * Computes the principal square root of the matrix T using the
204: * Sadeghi iteration. T is overwritten with sqrtm(T).
205: */
206: PetscErrorCode FNSqrtmSadeghi_CUDAm(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld)
207: {
208: PetscScalar *d_X,*d_M,*d_M2,*d_G,*d_work,alpha;
209: const PetscScalar szero=0.0,sone=1.0,smfive=-5.0,s15=15.0,s1d16=1.0/16.0;
210: PetscReal tol,Mres=0.0,nrm,sqrtnrm;
211: PetscInt it,nb,lwork;
212: PetscBLASInt info,*piv,N;
213: const PetscBLASInt one=1,zero=0;
214: PetscBool converged=PETSC_FALSE;
215: cublasHandle_t cublasv2handle;
216: PetscErrorCode ierr;
217: cublasStatus_t cberr;
218: cudaError_t cerr;
219: magma_int_t mierr;
222: PetscCUBLASGetHandle(&cublasv2handle);
223: magma_init();
224: N = n*n;
225: tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
227: PetscMalloc1(n,&piv);
228: cerr = cudaMalloc((void **)&d_X,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
229: cerr = cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
230: cerr = cudaMalloc((void **)&d_M2,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
231: cerr = cudaMalloc((void **)&d_G,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
233: nb = magma_get_xgetri_nb(n);
234: lwork = nb*n;
235: cerr = cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork);CHKERRCUDA(cerr);
237: /* M = A */
238: cerr = cudaMemcpy(d_M,A,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
240: /* scale M */
241: cberr = cublasXnrm2(cublasv2handle,N,d_M,one,&nrm);CHKERRCUBLAS(cberr);
242: if (nrm>1.0) {
243: sqrtnrm = PetscSqrtReal(nrm);
244: alpha = 1.0/nrm;
245: cberr = cublasXscal(cublasv2handle,N,&alpha,d_M,one);CHKERRCUBLAS(cberr);
246: tol *= nrm;
247: }
248: PetscInfo2(fn,"||A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
250: /* X = I */
251: cerr = cudaMemset(d_X,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
252: set_diagonal(n,d_X,ld,sone);CHKERRQ(cerr);
254: for (it=0;it<MAXIT && !converged;it++) {
256: /* G = (5/16)*I + (1/16)*M*(15*I-5*M+M*M) */
257: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_M,ld,d_M,ld,&szero,d_M2,ld);CHKERRCUBLAS(cberr);
258: cberr = cublasXaxpy(cublasv2handle,N,&smfive,d_M,one,d_M2,one);CHKERRCUBLAS(cberr);
259: shift_diagonal(n,d_M2,ld,s15);CHKERRQ(cerr);
260: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&s1d16,d_M,ld,d_M2,ld,&szero,d_G,ld);CHKERRCUBLAS(cberr);
261: shift_diagonal(n,d_G,ld,5.0/16.0);CHKERRQ(cerr);
263: /* X = X*G */
264: cerr = cudaMemcpy(d_M2,d_X,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
265: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_M2,ld,d_G,ld,&szero,d_X,ld);CHKERRCUBLAS(cberr);
267: /* M = M*inv(G*G) */
268: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_G,ld,d_G,ld,&szero,d_M2,ld);CHKERRCUBLAS(cberr);
269: /* magma */
270: mmagma_xgetrf_gpu(n,n,d_M2,ld,piv,&info);CHKERRMAGMA(mierr);
271: if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
272: if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
273: mmagma_xgetri_gpu(n,d_M2,ld,piv,d_work,lwork,&info);CHKERRMAGMA(mierr);
274: if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
275: if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
276: /* magma */
277: cerr = cudaMemcpy(d_G,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
278: cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_G,ld,d_M2,ld,&szero,d_M,ld);CHKERRCUBLAS(cberr);
280: /* check ||I-M|| */
281: cerr = cudaMemcpy(d_M2,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
282: shift_diagonal(n,d_M2,ld,-1.0);CHKERRQ(cerr);
283: cberr = cublasXnrm2(cublasv2handle,N,d_M2,one,&Mres);CHKERRCUBLAS(cberr);
284: PetscIsNanReal(Mres);
285: if (Mres<=tol) converged = PETSC_TRUE;
286: PetscInfo2(fn,"it: %D res: %g\n",it,(double)Mres);
287: PetscLogFlops(8.0*n*n*n+2.0*n*n+2.0*n*n*n/3.0+4.0*n*n*n/3.0+2.0*n*n*n+2.0*n*n);
288: }
290: if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations", MAXIT);
292: if (nrm>1.0) {cberr = cublasXscal(cublasv2handle,N,&sqrtnrm,d_X,one);CHKERRCUBLAS(cberr);}
293: cerr = cudaMemcpy(A,d_X,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
295: cerr = cudaFree(d_X);CHKERRCUDA(cerr);
296: cerr = cudaFree(d_M);CHKERRCUDA(cerr);
297: cerr = cudaFree(d_M2);CHKERRCUDA(cerr);
298: cerr = cudaFree(d_G);CHKERRCUDA(cerr);
299: cerr = cudaFree(d_work);CHKERRCUDA(cerr);
300: PetscFree(piv);
302: magma_finalize();
303: return(0);
304: }
305: #endif /* PETSC_HAVE_MAGMA */
306: #endif /* PETSC_HAVE_CUDA */
308: PetscErrorCode FNEvaluateFunctionMat_Sqrt_Sadeghi(FN fn,Mat A,Mat B)
309: {
311: PetscBLASInt n=0;
312: PetscScalar *Ba;
313: PetscInt m;
316: if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
317: MatDenseGetArray(B,&Ba);
318: MatGetSize(A,&m,NULL);
319: PetscBLASIntCast(m,&n);
320: FNSqrtmSadeghi(fn,n,Ba,n);
321: MatDenseRestoreArray(B,&Ba);
322: return(0);
323: }
325: #if defined(PETSC_HAVE_CUDA)
326: PetscErrorCode FNEvaluateFunctionMat_Sqrt_NS_CUDA(FN fn,Mat A,Mat B)
327: {
329: PetscBLASInt n=0;
330: PetscScalar *Ba;
331: PetscInt m;
334: if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
335: MatDenseGetArray(B,&Ba);
336: MatGetSize(A,&m,NULL);
337: PetscBLASIntCast(m,&n);
338: FNSqrtmNewtonSchulz_CUDA(fn,n,Ba,n,PETSC_FALSE);
339: MatDenseRestoreArray(B,&Ba);
340: return(0);
341: }
343: #if defined(PETSC_HAVE_MAGMA)
344: PetscErrorCode FNEvaluateFunctionMat_Sqrt_DBP_CUDAm(FN fn,Mat A,Mat B)
345: {
347: PetscBLASInt n=0;
348: PetscScalar *T;
349: PetscInt m;
352: if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
353: MatDenseGetArray(B,&T);
354: MatGetSize(A,&m,NULL);
355: PetscBLASIntCast(m,&n);
356: FNSqrtmDenmanBeavers_CUDAm(fn,n,T,n,PETSC_FALSE);
357: MatDenseRestoreArray(B,&T);
358: return(0);
359: }
361: PetscErrorCode FNEvaluateFunctionMat_Sqrt_Sadeghi_CUDAm(FN fn,Mat A,Mat B)
362: {
364: PetscBLASInt n=0;
365: PetscScalar *Ba;
366: PetscInt m;
369: if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
370: MatDenseGetArray(B,&Ba);
371: MatGetSize(A,&m,NULL);
372: PetscBLASIntCast(m,&n);
373: FNSqrtmSadeghi_CUDAm(fn,n,Ba,n);
374: MatDenseRestoreArray(B,&Ba);
375: return(0);
376: }
377: #endif /* PETSC_HAVE_MAGMA */
378: #endif /* PETSC_HAVE_CUDA */
380: PetscErrorCode FNView_Sqrt(FN fn,PetscViewer viewer)
381: {
383: PetscBool isascii;
384: char str[50];
385: const char *methodname[] = {
386: "Schur method for the square root",
387: "Denman-Beavers (product form)",
388: "Newton-Schulz iteration",
389: "Sadeghi iteration"
390: #if defined(PETSC_HAVE_CUDA)
391: ,"Newton-Schulz iteration CUDA"
392: #if defined(PETSC_HAVE_MAGMA)
393: ,"Denman-Beavers (product form) CUDA/MAGMA",
394: "Sadeghi iteration CUDA/MAGMA"
395: #endif
396: #endif
397: };
398: const int nmeth=sizeof(methodname)/sizeof(methodname[0]);
401: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
402: if (isascii) {
403: if (fn->beta==(PetscScalar)1.0) {
404: if (fn->alpha==(PetscScalar)1.0) {
405: PetscViewerASCIIPrintf(viewer," Square root: sqrt(x)\n");
406: } else {
407: SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
408: PetscViewerASCIIPrintf(viewer," Square root: sqrt(%s*x)\n",str);
409: }
410: } else {
411: SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE);
412: if (fn->alpha==(PetscScalar)1.0) {
413: PetscViewerASCIIPrintf(viewer," Square root: %s*sqrt(x)\n",str);
414: } else {
415: PetscViewerASCIIPrintf(viewer," Square root: %s",str);
416: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
417: SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
418: PetscViewerASCIIPrintf(viewer,"*sqrt(%s*x)\n",str);
419: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
420: }
421: }
422: if (fn->method<nmeth) {
423: PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
424: }
425: }
426: return(0);
427: }
/*
   Constructor for the sqrt FN type: installs the callbacks that
   implement scalar and matrix evaluation of the square root.
   The evaluatefunctionmat[] slots are indexed by fn->method (the same
   index selects an entry of methodname[] in FNView_Sqrt); GPU variants
   are registered only when CUDA/MAGMA support is compiled in.
*/
SLEPC_EXTERN PetscErrorCode FNCreate_Sqrt(FN fn)
{
  /* scalar evaluation: sqrt(x) and its derivative 1/(2*sqrt(x)) */
  fn->ops->evaluatefunction = FNEvaluateFunction_Sqrt;
  fn->ops->evaluatederivative = FNEvaluateDerivative_Sqrt;
  /* matrix evaluation methods, in the order of FNView_Sqrt's table */
  fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Sqrt_Schur;
  fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Sqrt_DBP;
  fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Sqrt_NS;
  fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Sqrt_Sadeghi;
#if defined(PETSC_HAVE_CUDA)
  fn->ops->evaluatefunctionmat[4] = FNEvaluateFunctionMat_Sqrt_NS_CUDA;
#if defined(PETSC_HAVE_MAGMA)
  fn->ops->evaluatefunctionmat[5] = FNEvaluateFunctionMat_Sqrt_DBP_CUDAm;
  fn->ops->evaluatefunctionmat[6] = FNEvaluateFunctionMat_Sqrt_Sadeghi_CUDAm;
#endif /* PETSC_HAVE_MAGMA */
#endif /* PETSC_HAVE_CUDA */
  /* only the Schur method provides a dedicated matrix-vector variant */
  fn->ops->evaluatefunctionmatvec[0] = FNEvaluateFunctionMatVec_Sqrt_Schur;
  fn->ops->view = FNView_Sqrt;
  return(0);
}