Actual source code: sveccuda.cu (slepc-3.15.1, 2021-05-28)

/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <slepccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#endif

#define BLOCKSIZE 64

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
*/
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
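
/*
    Equivalent scalar loop (an illustrative sketch, not part of the original
    file): since ld=n, the two n x k blocks are contiguous and can be treated
    as flat vectors of length m = n*k,

      for (i=0; i<n_*k_; i++) d_B[i] = alpha*d_A[i] + beta*d_B[i];

    which the implementation realizes as one cublasXscal (B := beta*B,
    skipped when beta==1) followed by one cublasXaxpy (B := B + alpha*A),
    with no gemm-style leading-dimension bookkeeping.
*/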

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A,*q;
  PetscScalar       *d_py,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m=0,n=0,k=0,ldq_=0;
  cublasStatus_t    cberr;
  cudaError_t       cerr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = PetscBLASIntCast(Y->n,&m);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(Y->k-Y->l,&n);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
    ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
    d_B = d_q+Y->l*ldq+X->l;
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {  /* restore must match the corresponding get */
    ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y->v,&d_py);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
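
/*
    Usage sketch (illustrative, not part of the original file): this kernel is
    reached through the public interface when Y and X are BVs of type BVSVEC
    backed by CUDA vectors; Q is a sequential dense Mat replicated on all
    processes:

      BVMult(Y,alpha,beta,X,Q);     // active cols of Y := beta*Y + alpha*X*Q
      BVMult(Y,alpha,beta,X,NULL);  // Q=NULL falls back to BVAXPY_BLAS_CUDA

    Only the active columns l..k-1 participate, hence the pointer offsets
    (X->nc+X->l)*X->n and (Y->nc+Y->l)*Y->n above.
*/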

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n=0,k=0,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
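
/*
    Usage sketch (illustrative, not part of the original file): the
    coefficients q live on the host, so they are copied to the device before
    the gemv; when q is NULL the internal buffer X->buffer (already a CUDA
    Vec) is used instead and the host-to-device copy is avoided:

      PetscScalar coeffs[k];             // k = X->k - X->l entries
      BVMultVec(X,alpha,beta,y,coeffs);  // y := beta*y + alpha*X(:,l:k)*coeffs
*/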

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          j,ldq,nq;
  PetscBLASInt      m=0,n=0,k=0,l,ldq_=0,bs=BLOCKSIZE;
  cublasStatus_t    cberr;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* try to allocate the whole matrix */
  cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  } else {
    /* not enough free memory: blocked multiplication over panels of bs rows */
    bs = freemem/(m*sizeof(PetscScalar));
    cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
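
/*
    Sketch of the blocking strategy (illustrative, not part of the original
    file): when the m x n result does not fit in free device memory, the
    product is formed by row panels of height bs,

      for each panel p of rows [p, p+bs):
        W := A(p:p+bs, l:k) * B   // gemm into the bs x n workspace
        A(p:p+bs, s:e-1) := W     // column-by-column device-to-device copy

    with a remainder panel of height m % bs handled first.  Each panel writes
    a disjoint row range, so overwriting columns s..e-1 in place is safe.
*/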

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          j,ldq,nq;
  PetscBLASInt      m=0,n=0,k=0,ldq_=0;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(Y->k-Y->l,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->n,&k);CHKERRQ(ierr);
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldm,&ldm_);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
      }
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
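
/*
    Note on the parallel case (illustrative, not part of the original file):
    each process computes its local contribution Y_p'*X_p (k local rows) on
    the GPU, copies the m x n block to the host, and an Allreduce sums the
    partial products, conceptually

      C = sum over processes p of Y_p' * X_p

    When the leading dimension of M equals m, the reduction writes straight
    into M; otherwise a contiguous scratch block CC is reduced first and then
    copied column by column into M.
*/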

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  cudaError_t                     cerr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s",ex);
  }
  PetscFunctionReturn(0);
}
#endif

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      /* x'*A yields the conjugate of the desired A'*x; fixed up below */
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
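
/*
    Why the gemm-as-gemv trick (illustrative note, not part of the original
    file): cuBLAS cannot conjugate the result of a row-vector product, so in
    complex arithmetic y := A'*x is obtained from a 1 x k x n gemm,

      w := x'*A   (CUBLAS_OP_C on x),   w_j = sum_i conj(x_i)*a_ij = conj((A'*x)_j)

    followed by ConjugateCudaArray(w,k), which flips w into y = A'*x.  In
    real arithmetic the entries are already correct, so a plain gemm with
    CUBLAS_OP_N suffices.
*/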

/*
    y := A'*x computed as y' := x'*A (local part only, no reduction)
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n=0,k=0,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    cerr = cudaFree(d_y);CHKERRCUDA(cerr);
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    /* j<0: scale all active columns */
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    ierr = PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);CHKERRQ(ierr);
  } else {
    /* otherwise scale only column j */
    d_A = d_array+(bv->nc+j)*bv->n;
    ierr = PetscBLASIntCast(bv->n,&n);CHKERRQ(ierr);
  }
  if (alpha == (PetscScalar)0.0) {
    cerr = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
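
/*
    Usage sketch (illustrative, not part of the original file): the two cases
    correspond to the public calls

      BVScale(bv,alpha);           // scales all active columns (j = -1 here)
      BVScaleColumn(bv,j,alpha);   // scales a single column j

    alpha==0 is special-cased with cudaMemset and alpha==1 is a no-op, so
    cublasXscal is invoked only for a genuine scaling.
*/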

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  if (V->vmm) {
    /* multiply the whole active block with a single mat-mat product */
    ierr = BVGetMat(V,&Vmat);CHKERRQ(ierr);
    ierr = BVGetMat(W,&Wmat);CHKERRQ(ierr);
    ierr = MatProductCreateWithMat(A,Vmat,NULL,Wmat);CHKERRQ(ierr);
    ierr = MatProductSetType(Wmat,MATPRODUCT_AB);CHKERRQ(ierr);
    ierr = MatProductSetFromOptions(Wmat);CHKERRQ(ierr);
    ierr = MatProductSymbolic(Wmat);CHKERRQ(ierr);
    ierr = MatProductNumeric(Wmat);CHKERRQ(ierr);
    ierr = MatProductClear(Wmat);CHKERRQ(ierr);
    ierr = BVRestoreMat(V,&Vmat);CHKERRQ(ierr);
    ierr = BVRestoreMat(W,&Wmat);CHKERRQ(ierr);
  } else {
    /* fall back to one MatMult per active column */
    ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
    for (j=0;j<V->k-V->l;j++) {
      ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
      ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
      ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
      ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
      ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
    }
    ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
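
/*
    Design note (illustrative, not part of the original file): which branch
    runs is governed by V->vmm, the method selected with BVSetMatMultMethod().
    The MatProduct branch does one mat-mat multiply on the whole active
    block; the fallback does one MatMult() per column, borrowing storage by
    pointing the work vectors cv[1] at successive columns of the contiguous
    device array (place/reset, no copies):

      BVSetMatMultMethod(V,BV_MATMULT_MAT);  // take the MatProduct branch
      BVMatMult(V,A,W);                      // active cols of W := A*V
*/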

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  cerr = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  PetscErrorCode ierr;
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
  cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
  ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
  ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
  ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
  ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
  if (((PetscObject)bv)->name) {
    ierr = PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
    ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
  }
  if (copy) {
    ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
  }
  ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
  ctx->v = vnew;
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_OFFLOAD_BOTH */
  if (L) {
    ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    ierr = VecCUDAGetArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;
  PetscBool      create=PETSC_FALSE;
  PetscInt       m,cols;

  PetscFunctionBegin;
  m = bv->k-bv->l;
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    ierr = MatDenseCUDAGetArray(bv->Aget,&aa);CHKERRQ(ierr);
    if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    ierr = MatGetSize(bv->Aget,NULL,&cols);CHKERRQ(ierr);
    if (cols!=m) {
      ierr = MatDestroy(&bv->Aget);CHKERRQ(ierr);
      create=PETSC_TRUE;
    }
  }
  ierr = VecCUDAGetArray(ctx->v,&vv);CHKERRQ(ierr);
  if (create) {
    ierr = MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget);CHKERRQ(ierr); /* pass a pointer to avoid allocation of storage */
    ierr = MatDenseCUDAReplaceArray(bv->Aget,NULL);CHKERRQ(ierr); /* replace with a null pointer, the value after BVRestoreMat */
    ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)bv->Aget);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);CHKERRQ(ierr); /* set the actual pointer */
  *A = bv->Aget;
  PetscFunctionReturn(0);
}
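
/*
    Design note (illustrative, not part of the original file): bv->Aget is a
    dense CUDA Mat created once without its own storage; each BVGetMat points
    it at the active part of the BV's single device array and each
    BVRestoreMat detaches it again, so the Mat view costs no copies:

      Mat A;
      BVGetMat(bv,&A);      // A wraps columns l..k-1 of the device array
      /* ... use A, e.g. in a MatProduct ... */
      BVRestoreMat(bv,&A);  // resets the array pointer; A becomes invalid
*/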

PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;

  PetscFunctionBegin;
  ierr = MatDenseCUDAGetArray(bv->Aget,&aa);CHKERRQ(ierr);
  vv = aa-(bv->nc+bv->l)*bv->n;
  ierr = MatDenseCUDAResetArray(bv->Aget);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,&vv);CHKERRQ(ierr);
  *A = NULL;
  PetscFunctionReturn(0);
}