Actual source code: sveccuda.cu

slepc-3.15.1 2021-05-28
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <slepccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#endif

#define BLOCKSIZE 64

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
 */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
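
/*
   Usage sketch (illustrative only, not part of the library): since A and B
   are stored contiguously with leading dimension equal to the row count, the
   n x k update collapses to level-1 BLAS on vectors of length m = n*k:

     if (beta != 1.0) B <- beta*B      one cublasXscal over m entries
     B <- B + alpha*A                  one cublasXaxpy over m entries

   so no GEMM is needed in the Q == NULL case of BVMult_Svec_CUDA below.
*/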

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A,*q;
  PetscScalar       *d_py,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m=0,n=0,k=0,ldq_=0;
  cublasStatus_t    cberr;
  cudaError_t       cerr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = PetscBLASIntCast(Y->n,&m);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(Y->k-Y->l,&n);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
    ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
    d_B = d_q+Y->l*ldq+X->l;
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y->v,&d_py);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
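
/*
   Pointer arithmetic sketch (illustrative): columns of the single storage Vec
   are contiguous, so active column c of X starts at offset (X->nc+X->l+c)*X->n.
   The GEMM above therefore acts on the active blocks only:

     d_A = d_px + (X->nc+X->l)*X->n;   first active column of X
     d_B = d_q  + Y->l*ldq + X->l;     entry Q(X->l,Y->l), leading dimension ldq
     d_C = d_py + (Y->nc+Y->l)*Y->n;   first active column of Y
*/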

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n=0,k=0,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          j,ldq,nq;
  PetscBLASInt      m=0,n=0,k=0,l,ldq_=0,bs=BLOCKSIZE;
  cublasStatus_t    cberr;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* try to allocate the whole matrix */
  cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  } else {
    bs = freemem/(m*sizeof(PetscScalar));
    cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
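
/*
   Blocking sketch (illustrative): when the full m x n work matrix does not
   fit in free device memory, the rows are processed in strips so that only a
   bs x n workspace is needed:

     l = m % bs;                     leading strip of l rows (if nonzero)
     for (r = l; r < m; r += bs)     then full strips of bs rows

   Each strip is computed into d_work with its own leading dimension (l or bs)
   and copied back into the active columns of V one column at a time.
*/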

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          j,ldq,nq;
  PetscBLASInt      m=0,n=0,k=0,ldq_=0;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArrayRead(Q,&q);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArrayRead(Q,&q);CHKERRQ(ierr);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(Y->k-Y->l,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->n,&k);CHKERRQ(ierr);
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldm,&ldm_);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
      }
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
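
/*
   Reduction sketch (illustrative): in the parallel case each process forms
   its local m x n block of A'*B on the GPU, copies it to the host work array,
   and the global result is obtained with MPIU_Allreduce. When the leading
   dimension of M differs from m, the reduced block is staged in CC and copied
   into M one column at a time with PetscArraycpy.
*/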

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  cudaError_t                     cerr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(0);
}
#endif
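
/*
   Illustrative note: cuBLAS can conjugate-transpose an operand but cannot
   conjugate the output of a GEMM, so the complex code paths below compute
   y' := x'*A with CUBLAS_OP_C and then fix up the k results in place with a
   single thrust::transform pass over the conjugate functor defined above.
*/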

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
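
/*
   Row-vector trick (illustrative): the k inner products y_i = A(:,i)'*x are
   obtained from a single 1 x k GEMM instead of k separate dot-product kernels:

     real:    y(1,k) = x(1,n) * A(n,k)                  both operands CUBLAS_OP_N
     complex: r(1,k) = x(n,1)^H * A(n,k), y = conj(r)   fix-up via ConjugateCudaArray

   with the result reduced over the communicator in the parallel case.
*/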

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n=0,k=0,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    cerr = cudaFree(d_y);CHKERRCUDA(cerr);
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array, *d_A;
  PetscBLASInt   n=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    ierr = PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);CHKERRQ(ierr);
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    ierr = PetscBLASIntCast(bv->n,&n);CHKERRQ(ierr);
  }
  if (alpha == (PetscScalar)0.0) {
    cerr = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  if (V->vmm) {
    ierr = BVGetMat(V,&Vmat);CHKERRQ(ierr);
    ierr = BVGetMat(W,&Wmat);CHKERRQ(ierr);
    ierr = MatProductCreateWithMat(A,Vmat,NULL,Wmat);CHKERRQ(ierr);
    ierr = MatProductSetType(Wmat,MATPRODUCT_AB);CHKERRQ(ierr);
    ierr = MatProductSetFromOptions(Wmat);CHKERRQ(ierr);
    ierr = MatProductSymbolic(Wmat);CHKERRQ(ierr);
    ierr = MatProductNumeric(Wmat);CHKERRQ(ierr);
    ierr = MatProductClear(Wmat);CHKERRQ(ierr);
    ierr = BVRestoreMat(V,&Vmat);CHKERRQ(ierr);
    ierr = BVRestoreMat(W,&Wmat);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
    for (j=0;j<V->k-V->l;j++) {
      ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
      ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
      ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
      ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
      ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
    }
    ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
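
/*
   Illustrative note: the V->vmm branch maps W = A*V onto a single MatProduct
   of type MATPRODUCT_AB using the dense Mat wrappers from BVGetMat, which can
   be much faster than the fallback loop that places each active column into
   the work vectors V->cv[1]/W->cv[1] with VecCUDAPlaceArray and applies
   MatMult once per column.
*/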

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  cerr = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  PetscErrorCode ierr;
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
  cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];
  cudaError_t       cerr;

  PetscFunctionBegin;
  ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
  ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
  ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
  ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
  ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
  if (((PetscObject)bv)->name) {
    ierr = PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
    ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
  }
  if (copy) {
    ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
  }
  ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
  ctx->v = vnew;
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    ierr = VecCUDAGetArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;
  PetscBool      create=PETSC_FALSE;
  PetscInt       m,cols;

  PetscFunctionBegin;
  m = bv->k-bv->l;
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    ierr = MatDenseCUDAGetArray(bv->Aget,&aa);CHKERRQ(ierr);
    if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    ierr = MatGetSize(bv->Aget,NULL,&cols);CHKERRQ(ierr);
    if (cols!=m) {
      ierr = MatDestroy(&bv->Aget);CHKERRQ(ierr);
      create=PETSC_TRUE;
    }
  }
  ierr = VecCUDAGetArray(ctx->v,&vv);CHKERRQ(ierr);
  if (create) {
    ierr = MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget);CHKERRQ(ierr); /* pass a pointer to avoid allocation of storage */
    ierr = MatDenseCUDAReplaceArray(bv->Aget,NULL);CHKERRQ(ierr);  /* replace with a null pointer, the value after BVRestoreMat */
    ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)bv->Aget);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);CHKERRQ(ierr);  /* set the actual pointer */
  *A = bv->Aget;
  PetscFunctionReturn(0);
}
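
/*
   Lifetime sketch (illustrative): bv->Aget is created once around a
   user-provided pointer so MatCreateDenseCUDA allocates no storage; that
   pointer is immediately replaced by NULL. Each BVGetMat call then repoints
   the matrix at the first active column with MatDenseCUDAPlaceArray, and
   BVRestoreMat_Svec_CUDA below undoes it with MatDenseCUDAResetArray and
   returns the base pointer to the storage Vec.
*/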

PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;

  PetscFunctionBegin;
  ierr = MatDenseCUDAGetArray(bv->Aget,&aa);CHKERRQ(ierr);
  vv = aa-(bv->nc+bv->l)*bv->n;
  ierr = MatDenseCUDAResetArray(bv->Aget);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,&vv);CHKERRQ(ierr);
  *A = NULL;
  PetscFunctionReturn(0);
}