/*
   Actual source code: sveccuda.cu  (slepc-3.16.3, 2022-04-11)
*/
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV implemented as a single Vec (CUDA version)
 12: */

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <slepccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <exception>
#include <thrust/device_ptr.h>
#endif

 22: #define BLOCKSIZE 64

 24: /*
 25:     B := alpha*A + beta*B

 27:     A,B are nxk (ld=n)
 28:  */
 29: static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
 30: {
 32:   PetscCuBLASInt m=0,one=1;
 33:   cublasStatus_t cberr;
 34:   cublasHandle_t cublasv2handle;

 37:   PetscCUBLASGetHandle(&cublasv2handle);
 38:   PetscCuBLASIntCast(n_*k_,&m);
 39:   PetscLogGpuTimeBegin();
 40:   if (beta!=(PetscScalar)1.0) {
 41:     cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
 42:     PetscLogGpuFlops(1.0*m);
 43:   }
 44:   cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
 45:   PetscLogGpuTimeEnd();
 46:   PetscLogGpuFlops(2.0*m);
 47:   return(0);
 48: }

 50: /*
 51:     C := alpha*A*B + beta*C
 52: */
 53: PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
 54: {
 55:   PetscErrorCode    ierr;
 56:   BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
 57:   const PetscScalar *d_px,*d_A,*q;
 58:   PetscScalar       *d_py,*d_q,*d_B,*d_C;
 59:   PetscInt          ldq,mq;
 60:   PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
 61:   cublasStatus_t    cberr;
 62:   cudaError_t       cerr;
 63:   cublasHandle_t    cublasv2handle;
 64:   PetscBool         matiscuda;

 67:   if (!Y->n) return(0);
 68:   VecCUDAGetArrayRead(x->v,&d_px);
 69:   if (beta==(PetscScalar)0.0) {
 70:     VecCUDAGetArrayWrite(y->v,&d_py);
 71:   } else {
 72:     VecCUDAGetArray(y->v,&d_py);
 73:   }
 74:   d_A = d_px+(X->nc+X->l)*X->n;
 75:   d_C = d_py+(Y->nc+Y->l)*Y->n;
 76:   if (Q) {
 77:     PetscCuBLASIntCast(Y->n,&m);
 78:     PetscCuBLASIntCast(Y->k-Y->l,&n);
 79:     PetscCuBLASIntCast(X->k-X->l,&k);
 80:     PetscCUBLASGetHandle(&cublasv2handle);
 81:     MatGetSize(Q,&ldq,&mq);
 82:     PetscCuBLASIntCast(ldq,&ldq_);
 83:     PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
 84:     if (matiscuda) {
 85:       MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
 86:     } else {
 87:       MatDenseGetArrayRead(Q,&q);
 88:       cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
 89:       cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 90:       PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));
 91:     }
 92:     d_B = d_q+Y->l*ldq+X->l;
 93:     PetscLogGpuTimeBegin();
 94:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
 95:     PetscLogGpuTimeEnd();
 96:     if (matiscuda) {
 97:       MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
 98:     } else {
 99:       MatDenseRestoreArrayRead(Q,&q);
100:       cerr = cudaFree(d_q);CHKERRCUDA(cerr);
101:     }
102:     PetscLogGpuFlops(2.0*m*n*k);
103:   } else {
104:     BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);
105:   }
106:   VecCUDARestoreArrayRead(x->v,&d_px);
107:   VecCUDARestoreArrayWrite(y->v,&d_py);
108:   return(0);
109: }

/*
    y := alpha*A*x + beta*y

    A is the active part of X (columns X->l to X->k-1). The k coefficients
    are given in the host array q or, if q is NULL, taken from X->buffer
    (already on the GPU).
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscCuBLASInt    n=0,k=0,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscCuBLASIntCast(X->n,&n);        /* local rows */
  PetscCuBLASIntCast(X->k-X->l,&k);   /* number of active columns */
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  /* write-only access when beta=0, since previous contents of y are discarded */
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y,&d_py);
  } else {
    VecCUDAGetArray(y,&d_py);
  }
  if (!q) {
    /* coefficients already reside in the internal GPU buffer */
    VecCUDAGetArray(X->buffer,&d_q);
  } else {
    /* copy the host coefficients to a temporary device array */
    cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;   /* skip the nc constraint and l leading columns */
  d_x = d_q;
  d_y = d_py;
  PetscLogGpuTimeBegin();
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(x->v,&d_px);
  /* restore with the access mode matching the Get call above */
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y,&d_py);
  } else {
    VecCUDARestoreArray(y,&d_py);
  }
  if (!q) {
    VecCUDARestoreArray(X->buffer,&d_q);
  } else {
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)

    The result overwrites part of the input, so it is first computed into a
    workspace d_work and then copied back. If an m-by-n workspace does not
    fit in free GPU memory, the product is computed by blocks of rows.
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,l=0,ldq_=0,bs=BLOCKSIZE;
  cublasStatus_t    cberr;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;
  PetscBool         matiscuda;

  if (!V->n) return(0);
  PetscCuBLASIntCast(V->n,&m);        /* local rows */
  PetscCuBLASIntCast(e-s,&n);         /* number of columns being updated */
  PetscCuBLASIntCast(V->k-V->l,&k);   /* active columns entering the product */
  MatGetSize(Q,&ldq,&nq);
  PetscCuBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
  if (matiscuda) {
    MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
  } else {
    /* host matrix: copy it to a temporary device buffer */
    MatDenseGetArrayRead(Q,&q);
    cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  /* try to allocate the whole matrix */
  cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    /* enough free memory: single GEMM into d_work, then copy back in place */
    cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    d_A = d_pv+(V->nc+V->l)*m;                /* first active column of V */
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;     /* entry (V->l,s) of Q, ld=ldq */
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  } else {
    /* blocked path: bs rows at a time.
       NOTE(review): bs is derived as freemem/(m*sizeof(PetscScalar)) while the
       workspace allocated is bs*n scalars -- confirm the divisor is intended
       (bs*n*sizeof <= freemem would suggest dividing by n) */
    PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs);
    cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    PetscCuBLASIntCast(m % bs,&l);   /* remainder rows are processed first */
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;   /* advance l rows down the column block */
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  }
  PetscLogGpuTimeEnd();
  if (matiscuda) {
    MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
  } else {
    MatDenseRestoreArrayRead(Q,&q);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}

/*
    A(:,s:e-1) := A*B'(:,s:e-1)

    Same as BVMultInPlace_Svec_CUDA but applying the conjugate transpose
    of Q (CUBLAS_OP_C); the whole workspace is allocated at once here.
*/
PetscErrorCode BVMultInPlaceHermitianTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;
  PetscBool         matiscuda;

  if (!V->n) return(0);
  PetscCuBLASIntCast(V->n,&m);        /* local rows */
  PetscCuBLASIntCast(e-s,&n);         /* number of columns being updated */
  PetscCuBLASIntCast(V->k-V->l,&k);   /* active columns entering the product */
  MatGetSize(Q,&ldq,&nq);
  PetscCuBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
  if (matiscuda) {
    MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
  } else {
    /* host matrix: copy it to a temporary device buffer */
    MatDenseGetArrayRead(Q,&q);
    cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_pv+(V->nc+V->l)*m;   /* first active column of V */
  d_B = d_q+V->l*ldq+s;        /* entry (s,V->l) of Q; rows/cols swap under OP_C */
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  /* copy the result back over columns s..e-1 of V */
  cerr = cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  if (matiscuda) {
    MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
  } else {
    MatDenseRestoreArrayRead(Q,&q);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}

/*
    C := A'*B   (A = active columns of Y, B = active columns of X)

    The m-by-n result is stored into the dense host Mat M (ld=ldm), at the
    block starting at row Y->l, column X->l. In the parallel case the
    local products are reduced with an allreduce.
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscCuBLASInt    m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscCuBLASIntCast(Y->k-Y->l,&m);   /* result rows = active columns of Y */
  PetscCuBLASIntCast(X->k-X->l,&n);   /* result columns = active columns of X */
  PetscCuBLASIntCast(X->n,&k);        /* local length of the vectors */
  MatGetSize(M,&ldm,NULL);
  PetscCuBLASIntCast(ldm,&ldm_);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArray(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;   /* host address of entry (Y->l,X->l) of M */
  if (x->mpi) {
    if (ldm==m) {
      /* the target block of M is contiguous: reduce straight into it */
      BVAllocateWork_Private(X,m*n);
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        /* no local rows: contribute zeros to the reduction */
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      /* reduce into a packed buffer CC, then scatter into the strided block of M */
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      /* copy column by column, since ldm != m */
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,CC+j*m,m);
      }
    }
  } else {
    /* sequential case: no reduction, copy local result directly into M */
    if (k) {
      BVAllocateWork_Private(X,m*n);
      PetscLogGpuTimeBegin();
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,X->work+j*m,m);
      }
    }
  }
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  MatDenseRestoreArray(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}

373: #if defined(PETSC_USE_COMPLEX)
/* Unary functor for thrust::transform: returns the complex conjugate of x */
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

/*
   Conjugate the n entries of the device array a, in place, using Thrust.
*/
PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  thrust::device_ptr<PetscScalar> ptr;

  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
  } catch (const std::exception& ex) {
    /* Thrust reports failures as exceptions derived from std::exception
       (thrust::system_error); a catch(char*) would never fire */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex.what());
  }
  return(0);
}
396: #endif

/*
    y := A'*x computed as y' := x'*A

    A is the active part of X; x is the given Vec (or B*x when X carries an
    inner-product matrix). The k results go to the host array q or, if q is
    NULL, into X->buffer.
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscCuBLASInt    n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscCuBLASIntCast(X->n,&n);        /* local rows */
  PetscCuBLASIntCast(X->k-X->l,&k);   /* number of dot products */
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {
    /* non-standard inner product: use z = B*y instead of y */
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) {
    VecCUDAGetArrayWrite(X->buffer,&d_work);
  } else {
    cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;   /* skip the nc constraint and l leading columns */
  d_x = d_py;
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      /* GEMM with OP_C gives conj(x')*A; conjugate afterwards to obtain A'*x */
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
    } else {
      /* no local rows: contribute zeros to the reduction */
      PetscArrayzero(X->work,k);
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);   /* reduce into the host view of the buffer */
    } else {
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
    PetscMPIIntCast(k,&len);
    MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) { VecRestoreArray(X->buffer,&qq); }
  } else {
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
    } else {
      /* NOTE(review): when n==0 this copies d_work without it having been
         written -- presumably unreachable in the sequential case; confirm */
      cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
    y := A'*x computed as y' := x'*A  (local part only, no MPI reduction)

    Results are written to the host array m.
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscCuBLASInt    n=0,k=0,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscCuBLASIntCast(X->n,&n);        /* local rows */
  PetscCuBLASIntCast(X->k-X->l,&k);   /* number of dot products */
  if (X->matrix) {
    /* non-standard inner product: use z = B*y instead of y */
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;   /* skip the nc constraint and l leading columns */
  d_x = d_py;
  /* NOTE(review): if n==0 the output array m is left untouched -- confirm
     callers zero it or never call with zero local rows */
  if (n) {
    cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
    /* GEMM with OP_C gives conj(x')*A; conjugate afterwards to obtain A'*x */
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ConjugateCudaArray(d_y,k);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    PetscLogGpuTimeEnd();
    cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    PetscLogGpuToCpu(k*sizeof(PetscScalar));
    cerr = cudaFree(d_y);CHKERRCUDA(cerr);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
    Scale by alpha either a single column (j>=0) or all the active
    columns (j<0) of the BV, on the GPU.
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv,*d_target;
  PetscCuBLASInt len=0,inc=1;
  cublasStatus_t cstat;
  cublasHandle_t handle;
  cudaError_t    custat;

  VecCUDAGetArray(ctx->v,&d_pv);
  if (j<0) {
    /* all active columns, from l to k-1, as one contiguous span */
    d_target = d_pv+(bv->nc+bv->l)*bv->n;
    PetscCuBLASIntCast((bv->k-bv->l)*bv->n,&len);
  } else {
    /* just column j (offset past the nc constraint columns) */
    d_target = d_pv+(bv->nc+j)*bv->n;
    PetscCuBLASIntCast(bv->n,&len);
  }
  if (alpha == (PetscScalar)0.0) {
    /* zeroing: a memset is enough, no BLAS call needed */
    custat = cudaMemset(d_target,0,len*sizeof(PetscScalar));CHKERRCUDA(custat);
  } else if (alpha != (PetscScalar)1.0) {
    /* alpha==1 is a no-op; otherwise scale in place */
    PetscCUBLASGetHandle(&handle);
    PetscLogGpuTimeBegin();
    cstat = cublasXscal(handle,len,&alpha,d_target,inc);CHKERRCUBLAS(cstat);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*len);
  }
  VecCUDARestoreArray(ctx->v,&d_pv);
  return(0);
}

/*
   W := A*V for the active columns, either as one MatProduct operation
   (when V->vmm is enabled) or column by column with MatMult.
*/
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  if (V->vmm) {
    /* matrix-matrix path: wrap V and W as dense matrices and use MatProduct */
    BVGetMat(V,&Vmat);
    BVGetMat(W,&Wmat);
    MatProductCreateWithMat(A,Vmat,NULL,Wmat);
    MatProductSetType(Wmat,MATPRODUCT_AB);
    MatProductSetFromOptions(Wmat);
    MatProductSymbolic(Wmat);
    MatProductNumeric(Wmat);
    MatProductClear(Wmat);
    BVRestoreMat(V,&Vmat);
    BVRestoreMat(W,&Wmat);
  } else {
    /* column-by-column path: reuse the cached work Vec cv[1] of each BV */
    VecCUDAGetArrayRead(v->v,&d_pv);
    VecCUDAGetArrayWrite(w->v,&d_pw);
    for (j=0;j<V->k-V->l;j++) {
      /* the cast drops const: the placed array is only read via MatMult */
      VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n);
      VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
      MatMult(A,V->cv[1],W->cv[1]);
      VecCUDAResetArray(V->cv[1]);
      VecCUDAResetArray(W->cv[1]);
    }
    VecCUDARestoreArrayRead(v->v,&d_pv);
    VecCUDARestoreArrayWrite(w->v,&d_pw);
  }
  return(0);
}

/*
   Copy the active columns of V onto the corresponding columns of W,
   entirely on the GPU.
*/
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_src;
  PetscScalar       *d_pw,*d_dst;
  cudaError_t       cstat;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  /* first active column of each BV (past nc constraint + l leading columns) */
  d_src = d_pv+(V->nc+V->l)*V->n;
  d_dst = d_pw+(W->nc+W->l)*W->n;
  cstat = cudaMemcpy(d_dst,d_src,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cstat);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}

/*
   Copy column j onto column i within the same BV storage (device-to-device)
*/
PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    cerr;

  VecCUDAGetArray(v->v,&d_pv);
  /* columns are contiguous blocks of length V->n, offset past the nc constraint columns */
  cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  VecCUDARestoreArray(v->v,&d_pv);
  return(0);
}

/*
   Resize the BV storage to hold m columns: create a new backing Vec of the
   same type as the template bv->t and, if requested, copy the old contents.
*/
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];
  cudaError_t       cerr;

  VecGetBlockSize(bv->t,&bs);
  /* new storage vector with room for m columns of local length bv->n */
  VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
  VecSetType(vnew,((PetscObject)bv->t)->type_name);
  VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
  VecSetBlockSize(vnew,bs);
  PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
  if (((PetscObject)bv)->name) {
    /* keep the naming convention <bvname>_0 for the internal vector */
    PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);
    PetscObjectSetName((PetscObject)vnew,str);
  }
  if (copy) {
    /* device-to-device copy of min(m,bv->m) columns into the new storage */
    VecCUDAGetArrayRead(ctx->v,&d_pv);
    VecCUDAGetArrayWrite(vnew,&d_pnew);
    cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    VecCUDARestoreArrayRead(ctx->v,&d_pv);
    VecCUDARestoreArrayWrite(vnew,&d_pnew);
  }
  VecDestroy(&ctx->v);   /* discard the old storage and install the new one */
  ctx->v = vnew;
  return(0);
}

/*
   Expose column j of the BV as a Vec by placing a pointer into the
   contiguous GPU storage. The VecCUDAGetArray() done here is deliberately
   left open; it is balanced in BVRestoreColumn_Svec_CUDA().
*/
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  l = BVAvailableVec;   /* index (0 or 1) of a free cached column Vec */
  VecCUDAGetArray(ctx->v,&d_pv);
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  return(0);
}

/*
   Counterpart of BVGetColumn_Svec_CUDA: detach the placed array from the
   cached column Vec and balance the pending VecCUDAGetArray().
*/
PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  l = (j==bv->ci[0])? 0: 1;   /* which of the two cached column Vecs holds j */
  VecCUDAResetArray(bv->cv[l]);
  /* NULL is passed since the pointer value is not needed to balance the Get */
  VecCUDARestoreArray(ctx->v,NULL);
  return(0);
}

/*
   Called when the user is done with the split views L (leading) and R
   (rest). If either child was modified, touch the GPU arrays so that the
   offload state of parent and children ends up consistent.
*/
PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {   /* the left part was modified */
      v = ((BV_SVEC*)bv->L->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {   /* the right part was modified */
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    /* a child changed the shared data: touch the parent's array as well */
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArray(v,(PetscScalar **)&d_pv);
    VecCUDARestoreArray(v,(PetscScalar **)&d_pv);
  }
  return(0);
}

/*
   Expose the active columns of the BV as a dense CUDA Mat (bv->Aget),
   without copying: the Mat's array is made to point into ctx->v.
   The matching cleanup is done in BVRestoreMat_Svec_CUDA().
*/
PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;
  PetscBool      create=PETSC_FALSE;
  PetscInt       m,cols;

  m = bv->k-bv->l;   /* number of active columns exposed in the Mat */
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    MatDenseCUDAGetArray(bv->Aget,&aa);
    /* a non-NULL array means the cached Mat is still checked out */
    if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    MatGetSize(bv->Aget,NULL,&cols);
    if (cols!=m) {   /* cached Mat has the wrong number of columns: rebuild */
      MatDestroy(&bv->Aget);
      create=PETSC_TRUE;
    }
  }
  VecCUDAGetArray(ctx->v,&vv);
  if (create) {
    MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget); /* pass a pointer to avoid allocation of storage */
    MatDenseCUDAReplaceArray(bv->Aget,NULL);  /* replace with a null pointer, the value after BVRestoreMat */
    PetscLogObjectParent((PetscObject)bv,(PetscObject)bv->Aget);
  }
  MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);  /* set the actual pointer */
  *A = bv->Aget;
  return(0);
}

/*
   Counterpart of BVGetMat_Svec_CUDA: detach the placed array from the
   cached Mat and restore the underlying Vec array.
*/
PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;

  MatDenseCUDAGetArray(bv->Aget,&aa);
  vv = aa-(bv->nc+bv->l)*bv->n;   /* undo the column offset applied in BVGetMat */
  MatDenseCUDAResetArray(bv->Aget);
  VecCUDARestoreArray(ctx->v,&vv);
  *A = NULL;
  return(0);
}