Actual source code: sveccuda.cu
slepc-3.16.3 2022-04-11
1: /*
2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
4:    Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

6:    This file is part of SLEPc.
7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11:    BV implemented as a single Vec (CUDA version)
12: */

14: #include <slepc/private/bvimpl.h>
15: #include "../src/sys/classes/bv/impls/svec/svec.h"
16: #include <slepccublas.h>

18: #if defined(PETSC_USE_COMPLEX)
19: #include <thrust/device_ptr.h>
20: #endif
24: /*
25:    B := alpha*A + beta*B

27:    A,B are nxk (ld=n)
28: */
29: static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
30: {
31:   PetscErrorCode ierr;
32:   PetscCuBLASInt m=0,one=1;
33:   cublasStatus_t cberr;
34:   cublasHandle_t cublasv2handle;

36:   PetscFunctionBegin;
37:   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
38:   ierr = PetscCuBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
39:   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
40:   if (beta!=(PetscScalar)1.0) {
41:     cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
42:     ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
43:   }
44:   cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
45:   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
46:   ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
47:   PetscFunctionReturn(0);
48: }
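Since A and B share leading dimension n, the n x k blocks are contiguous and the whole update reduces to one scal plus one axpy on length n*k vectors. A minimal standalone sketch of the same pattern with raw cuBLAS (illustrative only, double precision, error checks omitted; not part of sveccuda.cu):

#include <cublas_v2.h>

/* B := alpha*A + beta*B on flattened n-by-k device buffers */
static void axpy_update(cublasHandle_t h,int n,int k,double alpha,
                        const double *d_A,double beta,double *d_B)
{
  int m = n*k;                                     /* treat the block as one vector */
  if (beta != 1.0) cublasDscal(h,m,&beta,d_B,1);   /* B := beta*B */
  cublasDaxpy(h,m,&alpha,d_A,1,d_B,1);             /* B := alpha*A + B */
}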
50: /*
51:    C := alpha*A*B + beta*C
52: */
53: PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
54: {
55:   PetscErrorCode    ierr;
56:   BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
57:   const PetscScalar *d_px,*d_A,*q;
58:   PetscScalar       *d_py,*d_q,*d_B,*d_C;
59:   PetscInt          ldq,mq;
60:   PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
61:   cublasStatus_t    cberr;
62:   cudaError_t       cerr;
63:   cublasHandle_t    cublasv2handle;
64:   PetscBool         matiscuda;

66:   PetscFunctionBegin;
67:   if (!Y->n) PetscFunctionReturn(0);
68:   ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
69:   if (beta==(PetscScalar)0.0) {
70:     ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
71:   } else {
72:     ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
73:   }
74:   d_A = d_px+(X->nc+X->l)*X->n;
75:   d_C = d_py+(Y->nc+Y->l)*Y->n;
76:   if (Q) {
77:     ierr = PetscCuBLASIntCast(Y->n,&m);CHKERRQ(ierr);
78:     ierr = PetscCuBLASIntCast(Y->k-Y->l,&n);CHKERRQ(ierr);
79:     ierr = PetscCuBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
80:     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
81:     ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
82:     ierr = PetscCuBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
83:     ierr = PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);CHKERRQ(ierr);
84:     if (matiscuda) {
85:       ierr = MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);CHKERRQ(ierr);
86:     } else {
87:       ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
88:       cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
89:       cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
90:       ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
91:     }
92:     d_B = d_q+Y->l*ldq+X->l;
93:     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
94:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
95:     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
96:     if (matiscuda) {
97:       ierr = MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);CHKERRQ(ierr);
98:     } else {
99:       ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
100:       cerr = cudaFree(d_q);CHKERRCUDA(cerr);
101:     }
102:     ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
103:   } else {
104:     ierr = BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);CHKERRQ(ierr);
105:   }
106:   ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
107:   ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
108:   PetscFunctionReturn(0);
109: }
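For context, a hedged usage sketch of the public entry point that dispatches here (illustrative fragment, not part of this file; assumes a CUDA build with -bv_type svec -vec_type cuda, and the sizes N and k are placeholders):

PetscErrorCode ierr;
PetscInt       N = 100,k = 8;
BV             X,Y;
Mat            Q;
ierr = BVCreate(PETSC_COMM_WORLD,&X);CHKERRQ(ierr);
ierr = BVSetSizes(X,PETSC_DECIDE,N,k);CHKERRQ(ierr);
ierr = BVSetFromOptions(X);CHKERRQ(ierr);
ierr = BVSetRandom(X);CHKERRQ(ierr);
ierr = BVDuplicate(X,&Y);CHKERRQ(ierr);
ierr = MatCreateSeqDense(PETSC_COMM_SELF,k,k,NULL,&Q);CHKERRQ(ierr);
/* ... fill Q via MatDenseGetArray()/MatDenseRestoreArray() ... */
ierr = BVMult(Y,2.0,-1.0,X,Q);CHKERRQ(ierr);      /* Y := 2*X*Q - Y */
ierr = MatDestroy(&Q);CHKERRQ(ierr);
ierr = BVDestroy(&Y);CHKERRQ(ierr);
ierr = BVDestroy(&X);CHKERRQ(ierr);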
111: /*
112:    y := alpha*A*x + beta*y
113: */
114: PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
115: {
116:   PetscErrorCode    ierr;
117:   BV_SVEC           *x = (BV_SVEC*)X->data;
118:   const PetscScalar *d_px,*d_A;
119:   PetscScalar       *d_py,*d_q,*d_x,*d_y;
120:   PetscCuBLASInt    n=0,k=0,one=1;
121:   cublasStatus_t    cberr;
122:   cublasHandle_t    cublasv2handle;
123:   cudaError_t       cerr;

125:   PetscFunctionBegin;
126:   ierr = PetscCuBLASIntCast(X->n,&n);CHKERRQ(ierr);
127:   ierr = PetscCuBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
128:   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
129:   ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
130:   if (beta==(PetscScalar)0.0) {
131:     ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
132:   } else {
133:     ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
134:   }
135:   if (!q) {
136:     ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
137:   } else {
138:     cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
139:     cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
140:     ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
141:   }
142:   d_A = d_px+(X->nc+X->l)*X->n;
143:   d_x = d_q;
144:   d_y = d_py;
145:   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
146:   cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
147:   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
148:   ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
149:   if (beta==(PetscScalar)0.0) {
150:     ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
151:   } else {
152:     ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
153:   }
154:   if (!q) {
155:     ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
156:   } else {
157:     cerr = cudaFree(d_q);CHKERRCUDA(cerr);
158:   }
159:   ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
160:   PetscFunctionReturn(0);
161: }
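The host coefficients q are staged through X->buffer (when q is NULL) or a temporary device copy before the gemv. An illustrative usage fragment of the public routine (not part of this file; assumes X is a set-up BV with three active columns):

Vec         y;
PetscScalar coeff[3] = {1.0,0.5,0.25};   /* length must equal X->k - X->l */
ierr = BVCreateVec(X,&y);CHKERRQ(ierr);  /* vector compatible with the columns of X */
ierr = BVMultVec(X,1.0,0.0,y,coeff);CHKERRQ(ierr);  /* y := X*coeff */
ierr = VecDestroy(&y);CHKERRQ(ierr);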
163: /*
164:    A(:,s:e-1) := A*B(:,s:e-1)
165: */
166: PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
167: {
168:   PetscErrorCode    ierr;
169:   BV_SVEC           *ctx = (BV_SVEC*)V->data;
170:   PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
171:   const PetscScalar *q;
172:   PetscInt          ldq,nq;
173:   PetscCuBLASInt    m=0,n=0,k=0,l=0,ldq_=0,bs=BLOCKSIZE;
174:   cublasStatus_t    cberr;
175:   size_t            freemem,totmem;
176:   cublasHandle_t    cublasv2handle;
177:   cudaError_t       cerr;
178:   PetscBool         matiscuda;

180:   PetscFunctionBegin;
181:   if (!V->n) PetscFunctionReturn(0);
182:   ierr = PetscCuBLASIntCast(V->n,&m);CHKERRQ(ierr);
183:   ierr = PetscCuBLASIntCast(e-s,&n);CHKERRQ(ierr);
184:   ierr = PetscCuBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
185:   ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
186:   ierr = PetscCuBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
187:   ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
188:   ierr = PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);CHKERRQ(ierr);
189:   if (matiscuda) {
190:     ierr = MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);CHKERRQ(ierr);
191:   } else {
192:     ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
193:     cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
194:     cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
195:     ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
196:   }
197:   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
198:   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
199:   /* try to allocate the whole matrix */
200:   cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
201:   if (freemem>=m*n*sizeof(PetscScalar)) {
202:     cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
203:     d_A = d_pv+(V->nc+V->l)*m;
204:     d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
205:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
206:     cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
207:   } else {
208:     ierr = PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs);CHKERRQ(ierr);
209:     cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
210:     ierr = PetscCuBLASIntCast(m % bs,&l);CHKERRQ(ierr);
211:     if (l) {
212:       d_A = d_pv+(V->nc+V->l)*m;
213:       d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
214:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
215:       cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
216:     }
217:     for (;l<m;l+=bs) {
218:       d_A = d_pv+(V->nc+V->l)*m+l;
219:       d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
220:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
221:       cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
222:     }
223:   }
224:   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
225:   if (matiscuda) {
226:     ierr = MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);CHKERRQ(ierr);
227:   } else {
228:     ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
229:     cerr = cudaFree(d_q);CHKERRCUDA(cerr);
230:   }
231:   cerr = cudaFree(d_work);CHKERRCUDA(cerr);
232:   ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
233:   ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
234:   PetscFunctionReturn(0);
235: }
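The blocked branch relies on cudaMemcpy2D to scatter a compact l-by-n workspace back into a column-major m-by-n block: width is in bytes, and "height" counts matrix columns, since each column is one row of the 2-D copy. A sketch of the parameter mapping (illustrative; d_dst, d_work, m, l, n assumed declared, real scalars):

cerr = cudaMemcpy2D(d_dst,                /* first element of the target block    */
                    m*sizeof(double),     /* pitch: stride between target columns */
                    d_work,               /* compact l-by-n workspace             */
                    l*sizeof(double),     /* pitch: stride between source columns */
                    l*sizeof(double),     /* width in bytes copied per column     */
                    n,                    /* height: number of columns            */
                    cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);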
237: /*
238:    A(:,s:e-1) := A*B(s:e-1,:)'
239: */
240: PetscErrorCode BVMultInPlaceHermitianTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
241: {
242:   PetscErrorCode    ierr;
243:   BV_SVEC           *ctx = (BV_SVEC*)V->data;
244:   PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
245:   const PetscScalar *q;
246:   PetscInt          ldq,nq;
247:   PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
248:   cublasStatus_t    cberr;
249:   cublasHandle_t    cublasv2handle;
250:   cudaError_t       cerr;
251:   PetscBool         matiscuda;

253:   PetscFunctionBegin;
254:   if (!V->n) PetscFunctionReturn(0);
255:   ierr = PetscCuBLASIntCast(V->n,&m);CHKERRQ(ierr);
256:   ierr = PetscCuBLASIntCast(e-s,&n);CHKERRQ(ierr);
257:   ierr = PetscCuBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
258:   ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
259:   ierr = PetscCuBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
260:   ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
261:   ierr = PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);CHKERRQ(ierr);
262:   if (matiscuda) {
263:     ierr = MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);CHKERRQ(ierr);
264:   } else {
265:     ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
266:     cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
267:     cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
268:     ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
269:   }
270:   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
271:   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
272:   cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
273:   d_A = d_pv+(V->nc+V->l)*m;
274:   d_B = d_q+V->l*ldq+s;
275:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
276:   cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
277:   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
278:   if (matiscuda) {
279:     ierr = MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);CHKERRQ(ierr);
280:   } else {
281:     ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
282:     cerr = cudaFree(d_q);CHKERRCUDA(cerr);
283:   }
284:   cerr = cudaFree(d_work);CHKERRCUDA(cerr);
285:   ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
286:   ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
287:   PetscFunctionReturn(0);
288: }
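The conjugate transpose of the Q block is never formed explicitly; the transb argument of gemm reads rows s..e-1 of Q in place, with the row offset folded into the pointer and the original leading dimension kept. A standalone sketch of the same call shape (illustrative; handle, sizes and device pointers are assumptions):

const cuDoubleComplex one  = make_cuDoubleComplex(1.0,0.0);
const cuDoubleComplex zero = make_cuDoubleComplex(0.0,0.0);
/* W (m x (e-s)) := V (m x k) * Q(s:e-1,:)^H, Q stored with leading dimension ldq */
cublasZgemm(handle,CUBLAS_OP_N,CUBLAS_OP_C,m,e-s,k,
            &one,d_V,m,d_Q+s,ldq,&zero,d_W,m);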
290: /*
291:    C := A'*B
292: */
293: PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
294: {
295:   PetscErrorCode    ierr;
296:   BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
297:   const PetscScalar *d_px,*d_py,*d_A,*d_B;
298:   PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
299:   PetscInt          j,ldm;
300:   PetscCuBLASInt    m=0,n=0,k=0,ldm_=0;
301:   PetscMPIInt       len;
302:   cublasStatus_t    cberr;
303:   cublasHandle_t    cublasv2handle;
304:   cudaError_t       cerr;

306:   PetscFunctionBegin;
307:   ierr = PetscCuBLASIntCast(Y->k-Y->l,&m);CHKERRQ(ierr);
308:   ierr = PetscCuBLASIntCast(X->k-X->l,&n);CHKERRQ(ierr);
309:   ierr = PetscCuBLASIntCast(X->n,&k);CHKERRQ(ierr);
310:   ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
311:   ierr = PetscCuBLASIntCast(ldm,&ldm_);CHKERRQ(ierr);
312:   ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
313:   ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
314:   ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
315:   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
316:   cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
317:   d_A = d_py+(Y->nc+Y->l)*Y->n;
318:   d_B = d_px+(X->nc+X->l)*X->n;
319:   C = pm+X->l*ldm+Y->l;
320:   if (x->mpi) {
321:     if (ldm==m) {
322:       ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
323:       if (k) {
324:         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
325:         cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
326:         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
327:         cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
328:         ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
329:       } else {
330:         ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
331:       }
332:       ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
333:       ierr = MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRMPI(ierr);
334:     } else {
335:       ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
336:       CC = X->work+m*n;
337:       if (k) {
338:         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
339:         cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
340:         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
341:         cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
342:         ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
343:       } else {
344:         ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
345:       }
346:       ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
347:       ierr = MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRMPI(ierr);
348:       for (j=0;j<n;j++) {
349:         ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
350:       }
351:     }
352:   } else {
353:     if (k) {
354:       ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
355:       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
356:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
357:       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
358:       cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
359:       ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
360:       for (j=0;j<n;j++) {
361:         ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
362:       }
363:     }
364:   }
365:   cerr = cudaFree(d_work);CHKERRCUDA(cerr);
366:   ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
367:   ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
368:   ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
369:   ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
370:   PetscFunctionReturn(0);
371: }
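In parallel, the Gram block M = Y'*X is the sum of per-rank contributions: each rank multiplies its local row slices on the GPU, copies the small m-by-n result to the host, and an allreduce completes the sum. A condensed sketch of that pattern (illustrative; handle, buffers and klocal are assumptions, real scalars):

cublasDgemm(handle,CUBLAS_OP_T,CUBLAS_OP_N,m,n,klocal,
            &one,d_Y,klocal,d_X,klocal,&zero,d_work,m);        /* local Y'*X */
cudaMemcpy(h_work,d_work,m*n*sizeof(double),cudaMemcpyDeviceToHost);
MPI_Allreduce(h_work,h_M,m*n,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); /* global sum */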
373: #if defined(PETSC_USE_COMPLEX)
374: struct conjugate
375: {
376:   __host__ __device__
377:   PetscScalar operator()(PetscScalar x)
378:   {
379:     return PetscConj(x);
380:   }
381: };

383: PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
384: {
385:   thrust::device_ptr<PetscScalar> ptr;

387:   PetscFunctionBegin;
388:   try {
389:     ptr = thrust::device_pointer_cast(a);
390:     thrust::transform(ptr,ptr+n,ptr,conjugate());
391:   } catch (char *ex) {
392:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
393:   }
394:   PetscFunctionReturn(0);
395: }
396: #endif
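The functor-plus-thrust::transform idiom above generalizes to any elementwise device operation. A self-contained sketch of the same conjugation with Thrust containers (illustrative, not part of this file):

#include <thrust/complex.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct conj_op {
  __host__ __device__
  thrust::complex<double> operator()(thrust::complex<double> x) const
  { return thrust::conj(x); }
};

/* conjugate every entry of a device vector in place */
void conjugate_in_place(thrust::device_vector< thrust::complex<double> > &v)
{
  thrust::transform(v.begin(),v.end(),v.begin(),conj_op());
}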
398: /*
399:    y := A'*x computed as y' := x'*A
400: */
401: PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
402: {
403:   PetscErrorCode    ierr;
404:   BV_SVEC           *x = (BV_SVEC*)X->data;
405:   const PetscScalar *d_A,*d_x,*d_px,*d_py;
406:   PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
407:   PetscCuBLASInt    n=0,k=0,one=1;
408:   PetscMPIInt       len;
409:   Vec               z = y;
410:   cublasStatus_t    cberr;
411:   cublasHandle_t    cublasv2handle;
412:   cudaError_t       cerr;

414:   PetscFunctionBegin;
415:   ierr = PetscCuBLASIntCast(X->n,&n);CHKERRQ(ierr);
416:   ierr = PetscCuBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
417:   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
418:   if (X->matrix) {
419:     ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
420:     z = X->Bx;
421:   }
422:   ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
423:   ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
424:   if (!q) {
425:     ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
426:   } else {
427:     cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
428:   }
429:   d_A = d_px+(X->nc+X->l)*X->n;
430:   d_x = d_py;
431:   if (x->mpi) {
432:     ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
433:     if (n) {
434:       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
435: #if defined(PETSC_USE_COMPLEX)
436:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
437:       ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
438: #else
439:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
440: #endif
441:       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
442:       cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
443:       ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
444:     } else {
445:       ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
446:     }
447:     if (!q) {
448:       ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
449:       ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
450:     } else {
451:       cerr = cudaFree(d_work);CHKERRCUDA(cerr);
452:     }
453:     ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
454:     ierr = MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRMPI(ierr);
455:     if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
456:   } else {
457:     if (n) {
458:       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
459: #if defined(PETSC_USE_COMPLEX)
460:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
461:       ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
462: #else
463:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
464: #endif
465:       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
466:     }
467:     if (!q) {
468:       ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
469:     } else {
470:       cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
471:       ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
472:       cerr = cudaFree(d_work);CHKERRCUDA(cerr);
473:     }
474:   }
475:   ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
476:   ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
477:   ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
478:   PetscFunctionReturn(0);
479: }
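The trick named in the header comment: the k inner products y_j = A(:,j)'*x are fused into a single 1-by-k GEMM, q' := x'*A, instead of k separate dot calls. The call shape in isolation (illustrative, real arithmetic; handle and device pointers are assumptions):

cublasDgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,
            1,k,n,              /* result q' is a 1 x k row vector  */
            &one,d_x,1,         /* x read as a 1 x n matrix (lda=1) */
            d_A,n,              /* A is n x k with leading dim n    */
            &zero,d_q,1);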
481: /*
482:    y := A'*x computed as y' := x'*A
483: */
484: PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
485: {
486:   PetscErrorCode    ierr;
487:   BV_SVEC           *x = (BV_SVEC*)X->data;
488:   const PetscScalar *d_A,*d_x,*d_px,*d_py;
489:   PetscScalar       *d_y,szero=0.0,sone=1.0;
490:   PetscCuBLASInt    n=0,k=0,one=1;
491:   Vec               z = y;
492:   cublasStatus_t    cberr;
493:   cublasHandle_t    cublasv2handle;
494:   cudaError_t       cerr;

496:   PetscFunctionBegin;
497:   ierr = PetscCuBLASIntCast(X->n,&n);CHKERRQ(ierr);
498:   ierr = PetscCuBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
499:   if (X->matrix) {
500:     ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
501:     z = X->Bx;
502:   }
503:   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
504:   ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
505:   ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
506:   d_A = d_px+(X->nc+X->l)*X->n;
507:   d_x = d_py;
508:   if (n) {
509:     cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
510:     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
511: #if defined(PETSC_USE_COMPLEX)
512:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
513:     ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
514: #else
515:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
516: #endif
517:     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
518:     cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
519:     ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
520:     cerr = cudaFree(d_y);CHKERRCUDA(cerr);
521:   }
522:   ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
523:   ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
524:   ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
525:   PetscFunctionReturn(0);
526: }
528: /*
529:    Scale n scalars
530: */
531: PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
532: {
533:   PetscErrorCode ierr;
534:   BV_SVEC        *ctx = (BV_SVEC*)bv->data;
535:   PetscScalar    *d_array, *d_A;
536:   PetscCuBLASInt n=0,one=1;
537:   cublasStatus_t cberr;
538:   cublasHandle_t cublasv2handle;
539:   cudaError_t    cerr;

541:   PetscFunctionBegin;
542:   ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
543:   if (j<0) {
544:     d_A = d_array+(bv->nc+bv->l)*bv->n;
545:     ierr = PetscCuBLASIntCast((bv->k-bv->l)*bv->n,&n);CHKERRQ(ierr);
546:   } else {
547:     d_A = d_array+(bv->nc+j)*bv->n;
548:     ierr = PetscCuBLASIntCast(bv->n,&n);CHKERRQ(ierr);
549:   }
550:   if (alpha == (PetscScalar)0.0) {
551:     cerr = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
552:   } else if (alpha != (PetscScalar)1.0) {
553:     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
554:     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
555:     cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
556:     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
557:     ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
558:   }
559:   ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
560:   PetscFunctionReturn(0);
561: }
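Usage sketch (illustrative; bv is an assumed set-up BV): the j<0 path backs BVScale on all active columns, the j>=0 path backs BVScaleColumn.

ierr = BVScale(bv,0.5);CHKERRQ(ierr);          /* all active columns: the j<0 path     */
ierr = BVScaleColumn(bv,2,0.0);CHKERRQ(ierr);  /* single column: hits the memset path  */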
563: PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
564: {
565:   PetscErrorCode    ierr;
566:   BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
567:   Mat               Vmat,Wmat;
568:   const PetscScalar *d_pv;
569:   PetscScalar       *d_pw;
570:   PetscInt          j;

572:   PetscFunctionBegin;
573:   if (V->vmm) {
574:     ierr = BVGetMat(V,&Vmat);CHKERRQ(ierr);
575:     ierr = BVGetMat(W,&Wmat);CHKERRQ(ierr);
576:     ierr = MatProductCreateWithMat(A,Vmat,NULL,Wmat);CHKERRQ(ierr);
577:     ierr = MatProductSetType(Wmat,MATPRODUCT_AB);CHKERRQ(ierr);
578:     ierr = MatProductSetFromOptions(Wmat);CHKERRQ(ierr);
579:     ierr = MatProductSymbolic(Wmat);CHKERRQ(ierr);
580:     ierr = MatProductNumeric(Wmat);CHKERRQ(ierr);
581:     ierr = MatProductClear(Wmat);CHKERRQ(ierr);
582:     ierr = BVRestoreMat(V,&Vmat);CHKERRQ(ierr);
583:     ierr = BVRestoreMat(W,&Wmat);CHKERRQ(ierr);
584:   } else {
585:     ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
586:     ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
587:     for (j=0;j<V->k-V->l;j++) {
588:       ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
589:       ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
590:       ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
591:       ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
592:       ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
593:     }
594:     ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
595:     ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
596:   }
597:   PetscFunctionReturn(0);
598: }
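Usage sketch (illustrative; V and A are assumed to exist): W := A*V over the active columns. Whether the single MatProduct path or the column-by-column loop above is taken depends on V->vmm (cf. BVSetMatMultMethod).

BV W;
ierr = BVDuplicate(V,&W);CHKERRQ(ierr);
ierr = BVMatMult(V,A,W);CHKERRQ(ierr);   /* W(:,j) := A*V(:,j) */
ierr = BVDestroy(&W);CHKERRQ(ierr);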
600: PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
601: {
602:   PetscErrorCode    ierr;
603:   BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
604:   const PetscScalar *d_pv,*d_pvc;
605:   PetscScalar       *d_pw,*d_pwc;
606:   cudaError_t       cerr;

608:   PetscFunctionBegin;
609:   ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
610:   ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
611:   d_pvc = d_pv+(V->nc+V->l)*V->n;
612:   d_pwc = d_pw+(W->nc+W->l)*W->n;
613:   cerr = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
614:   ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
615:   ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
616:   PetscFunctionReturn(0);
617: }
619: PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
620: {
621:   PetscErrorCode ierr;
622:   BV_SVEC        *v = (BV_SVEC*)V->data;
623:   PetscScalar    *d_pv;
624:   cudaError_t    cerr;

626:   PetscFunctionBegin;
627:   ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
628:   cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
629:   ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
630:   PetscFunctionReturn(0);
631: }
633: PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
634: {
635:   PetscErrorCode    ierr;
636:   BV_SVEC           *ctx = (BV_SVEC*)bv->data;
637:   const PetscScalar *d_pv;
638:   PetscScalar       *d_pnew;
639:   PetscInt          bs;
640:   Vec               vnew;
641:   char              str[50];
642:   cudaError_t       cerr;

644:   PetscFunctionBegin;
645:   ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
646:   ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
647:   ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
648:   ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
649:   ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
650:   ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
651:   if (((PetscObject)bv)->name) {
652:     ierr = PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
653:     ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
654:   }
655:   if (copy) {
656:     ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
657:     ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
658:     cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
659:     ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
660:     ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
661:   }
662:   ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
663:   ctx->v = vnew;
664:   PetscFunctionReturn(0);
665: }
667: PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
668: {
669:   PetscErrorCode ierr;
670:   BV_SVEC        *ctx = (BV_SVEC*)bv->data;
671:   PetscScalar    *d_pv;
672:   PetscInt       l;

674:   PetscFunctionBegin;
675:   l = BVAvailableVec;
676:   ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
677:   ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
678:   PetscFunctionReturn(0);
679: }
681: PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
682: {
683:   PetscErrorCode ierr;
684:   BV_SVEC        *ctx = (BV_SVEC*)bv->data;
685:   PetscInt       l;

687:   PetscFunctionBegin;
688:   l = (j==bv->ci[0])? 0: 1;
689:   ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
690:   ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
691:   PetscFunctionReturn(0);
692: }
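Column access follows a strict get/restore pairing; the returned Vec is a placed view into the contiguous storage, so no data is copied. Usage sketch (illustrative; bv and j assumed):

Vec v;
ierr = BVGetColumn(bv,j,&v);CHKERRQ(ierr);
ierr = VecScale(v,2.0);CHKERRQ(ierr);      /* operate on column j through the Vec API */
ierr = BVRestoreColumn(bv,j,&v);CHKERRQ(ierr);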
694: PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
695: {
696:   PetscErrorCode    ierr;
697:   Vec               v;
698:   const PetscScalar *d_pv;
699:   PetscObjectState  lstate,rstate;
700:   PetscBool         change=PETSC_FALSE;

702:   PetscFunctionBegin;
703:   /* force sync flag to PETSC_CUDA_BOTH */
704:   if (L) {
705:     ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
706:     if (lstate != bv->lstate) {
707:       v = ((BV_SVEC*)bv->L->data)->v;
708:       ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
709:       ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
710:       change = PETSC_TRUE;
711:     }
712:   }
713:   if (R) {
714:     ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
715:     if (rstate != bv->rstate) {
716:       v = ((BV_SVEC*)bv->R->data)->v;
717:       ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
718:       ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
719:       change = PETSC_TRUE;
720:     }
721:   }
722:   if (change) {
723:     v = ((BV_SVEC*)bv->data)->v;
724:     ierr = VecCUDAGetArray(v,(PetscScalar **)&d_pv);CHKERRQ(ierr);
725:     ierr = VecCUDARestoreArray(v,(PetscScalar **)&d_pv);CHKERRQ(ierr);
726:   }
727:   PetscFunctionReturn(0);
728: }
730: PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
731: {
732:   PetscErrorCode ierr;
733:   BV_SVEC        *ctx = (BV_SVEC*)bv->data;
734:   PetscScalar    *vv,*aa;
735:   PetscBool      create=PETSC_FALSE;
736:   PetscInt       m,cols;

738:   PetscFunctionBegin;
739:   m = bv->k-bv->l;
740:   if (!bv->Aget) create=PETSC_TRUE;
741:   else {
742:     ierr = MatDenseCUDAGetArray(bv->Aget,&aa);CHKERRQ(ierr);
743:     if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
744:     ierr = MatGetSize(bv->Aget,NULL,&cols);CHKERRQ(ierr);
745:     if (cols!=m) {
746:       ierr = MatDestroy(&bv->Aget);CHKERRQ(ierr);
747:       create=PETSC_TRUE;
748:     }
749:   }
750:   ierr = VecCUDAGetArray(ctx->v,&vv);CHKERRQ(ierr);
751:   if (create) {
752:     ierr = MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget);CHKERRQ(ierr); /* pass a pointer to avoid allocation of storage */
753:     ierr = MatDenseCUDAReplaceArray(bv->Aget,NULL);CHKERRQ(ierr); /* replace with a null pointer, the value after BVRestoreMat */
754:     ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)bv->Aget);CHKERRQ(ierr);
755:   }
756:   ierr = MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);CHKERRQ(ierr); /* set the actual pointer */
757:   *A = bv->Aget;
758:   PetscFunctionReturn(0);
759: }
761: PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
762: {
763:   PetscErrorCode ierr;
764:   BV_SVEC        *ctx = (BV_SVEC*)bv->data;
765:   PetscScalar    *vv,*aa;

767:   PetscFunctionBegin;
768:   ierr = MatDenseCUDAGetArray(bv->Aget,&aa);CHKERRQ(ierr);
769:   vv = aa-(bv->nc+bv->l)*bv->n;
770:   ierr = MatDenseCUDAResetArray(bv->Aget);CHKERRQ(ierr);
771:   ierr = VecCUDARestoreArray(ctx->v,&vv);CHKERRQ(ierr);
772:   *A = NULL;
773:   PetscFunctionReturn(0);
774: }
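BVGetMat exposes the active columns as a dense CUDA Mat that shares the BV storage (no copy), and every call must be matched by BVRestoreMat. Usage sketch (illustrative; bv assumed):

Mat A;
ierr = BVGetMat(bv,&A);CHKERRQ(ierr);
/* ... use A with Mat routines; A aliases the BV data on the GPU ... */
ierr = BVRestoreMat(bv,&A);CHKERRQ(ierr);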