Actual source code: sveccuda.cu

slepc-3.18.3 2023-03-24
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV implemented as a single Vec (CUDA version)
 12: */

 14: #include <slepc/private/bvimpl.h>
 15: #include "../src/sys/classes/bv/impls/svec/svec.h"
 16: #include <slepccublas.h>

 18: #if defined(PETSC_USE_COMPLEX)
 19: #include <thrust/device_ptr.h>
 20: #endif

 22: #define BLOCKSIZE 64

 24: /*
 25:     B := alpha*A + beta*B

 27:     A,B are nxk (ld=n)
 28:  */
 29: static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
 30: {
 31:   PetscCuBLASInt m=0,one=1;
 32:   cublasHandle_t cublasv2handle;

 34:   PetscCUBLASGetHandle(&cublasv2handle);
 35:   PetscCuBLASIntCast(n_*k_,&m);
 36:   PetscLogGpuTimeBegin();
 37:   if (beta!=(PetscScalar)1.0) {
 38:     cublasXscal(cublasv2handle,m,&beta,d_B,one);
 39:     PetscLogGpuFlops(1.0*m);
 40:   }
 41:   cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);
 42:   PetscLogGpuTimeEnd();
 43:   PetscLogGpuFlops(2.0*m);
 44:   return 0;
 45: }

 47: /*
 48:     C := alpha*A*Q + beta*C
 49: */
 50: PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
 51: {
 52:   BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
 53:   const PetscScalar *d_px,*d_A,*q;
 54:   PetscScalar       *d_py,*d_q,*d_B,*d_C;
 55:   PetscInt          ldq,mq;
 56:   PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
 57:   cublasHandle_t    cublasv2handle;
 58:   PetscBool         matiscuda;

 60:   if (!Y->n) return 0;
 61:   VecCUDAGetArrayRead(x->v,&d_px);
 62:   if (beta==(PetscScalar)0.0) VecCUDAGetArrayWrite(y->v,&d_py);
 63:   else VecCUDAGetArray(y->v,&d_py);
 64:   d_A = d_px+(X->nc+X->l)*X->n;
 65:   d_C = d_py+(Y->nc+Y->l)*Y->n;
 66:   if (Q) {
 67:     PetscCuBLASIntCast(Y->n,&m);
 68:     PetscCuBLASIntCast(Y->k-Y->l,&n);
 69:     PetscCuBLASIntCast(X->k-X->l,&k);
 70:     PetscCUBLASGetHandle(&cublasv2handle);
 71:     MatGetSize(Q,NULL,&mq);
 72:     MatDenseGetLDA(Q,&ldq);
 73:     PetscCuBLASIntCast(ldq,&ldq_);
 74:     PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
 75:     if (matiscuda) MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
 76:     else {
 77:       MatDenseGetArrayRead(Q,&q);
 78:       cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));
 79:       cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
 80:       PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));
 81:     }
 82:     d_B = d_q+Y->l*ldq+X->l;
 83:     PetscLogGpuTimeBegin();
 84:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);
 85:     PetscLogGpuTimeEnd();
 86:     if (matiscuda) MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
 87:     else {
 88:       MatDenseRestoreArrayRead(Q,&q);
 89:       cudaFree(d_q);
 90:     }
 91:     PetscLogGpuFlops(2.0*m*n*k);
 92:   } else BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);
 93:   VecCUDARestoreArrayRead(x->v,&d_px);
 94:   VecCUDARestoreArrayWrite(y->v,&d_py);
 95:   return 0;
 96: }

 98: /*
 99:     y := alpha*A*x + beta*y
100: */
101: PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
102: {
103:   BV_SVEC           *x = (BV_SVEC*)X->data;
104:   const PetscScalar *d_px,*d_A;
105:   PetscScalar       *d_py,*d_q,*d_x,*d_y;
106:   PetscCuBLASInt    n=0,k=0,one=1;
107:   cublasHandle_t    cublasv2handle;

109:   PetscCuBLASIntCast(X->n,&n);
110:   PetscCuBLASIntCast(X->k-X->l,&k);
111:   PetscCUBLASGetHandle(&cublasv2handle);
112:   VecCUDAGetArrayRead(x->v,&d_px);
113:   if (beta==(PetscScalar)0.0) VecCUDAGetArrayWrite(y,&d_py);
114:   else VecCUDAGetArray(y,&d_py);
115:   if (!q) VecCUDAGetArray(X->buffer,&d_q);
116:   else {
117:     cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));
118:     cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);
119:     PetscLogCpuToGpu(k*sizeof(PetscScalar));
120:   }
121:   d_A = d_px+(X->nc+X->l)*X->n;
122:   d_x = d_q;
123:   d_y = d_py;
124:   PetscLogGpuTimeBegin();
125:   cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);
126:   PetscLogGpuTimeEnd();
127:   VecCUDARestoreArrayRead(x->v,&d_px);
128:   if (beta==(PetscScalar)0.0) VecCUDARestoreArrayWrite(y,&d_py);
129:   else VecCUDARestoreArray(y,&d_py);
130:   if (!q) VecCUDARestoreArray(X->buffer,&d_q);
131:   else cudaFree(d_q);
132:   PetscLogGpuFlops(2.0*n*k);
133:   return 0;
134: }

/*
    V(:,s:e-1) := V*Q(:,s:e-1)

    The update cannot be done in place column by column, so the product is
    formed in a device workspace and copied back.  If a full m x n workspace
    does not fit in free GPU memory, the rows are processed in chunks.
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,l=0,ldq_=0,bs=BLOCKSIZE;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  PetscBool         matiscuda;

  if (!V->n) return 0;
  PetscCuBLASIntCast(V->n,&m);        /* local rows */
  PetscCuBLASIntCast(e-s,&n);         /* columns being updated */
  PetscCuBLASIntCast(V->k-V->l,&k);   /* active columns of V */
  MatGetSize(Q,NULL,&nq);
  MatDenseGetLDA(Q,&ldq);
  PetscCuBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
  if (matiscuda) MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
  else {   /* host matrix: upload a temporary copy */
    MatDenseGetArrayRead(Q,&q);
    cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
    PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  /* try to allocate the whole workspace at once */
  cudaMemGetInfo(&freemem,&totmem);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);
    cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
  } else {
    /* not enough memory: process a block of rows at a time */
    PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs);
    cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));
    PetscCuBLASIntCast(m % bs,&l);   /* leftover rows handled first */
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);
      cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
    }
    for (;l<m;l+=bs) {   /* then full chunks of bs rows */
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);
      cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
    }
  }
  PetscLogGpuTimeEnd();
  if (matiscuda) MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
  else {
    MatDenseRestoreArrayRead(Q,&q);
    cudaFree(d_q);
  }
  cudaFree(d_work);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return 0;
}

/*
    V(:,s:e-1) := V*Q'(:,s:e-1)

    Same as BVMultInPlace_Svec_CUDA but with the (conjugate-)transposed Q;
    here the workspace is always allocated in full.
*/
PetscErrorCode BVMultInPlaceHermitianTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
  cublasHandle_t    cublasv2handle;
  PetscBool         matiscuda;

  if (!V->n) return 0;
  PetscCuBLASIntCast(V->n,&m);        /* local rows */
  PetscCuBLASIntCast(e-s,&n);         /* columns being updated */
  PetscCuBLASIntCast(V->k-V->l,&k);   /* active columns of V */
  MatGetSize(Q,NULL,&nq);
  MatDenseGetLDA(Q,&ldq);
  PetscCuBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
  if (matiscuda) MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
  else {   /* host matrix: upload a temporary copy */
    MatDenseGetArrayRead(Q,&q);
    cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
    PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;   /* rows s..e-1 of Q, accessed via CUBLAS_OP_C */
  cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);
  cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
  PetscLogGpuTimeEnd();
  if (matiscuda) MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
  else {
    MatDenseRestoreArrayRead(Q,&q);
    cudaFree(d_q);
  }
  cudaFree(d_work);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return 0;
}

/*
    M := Y'*X

    The m x n result block is computed on the GPU, copied back to the host
    and, in the parallel case, summed over the communicator, before being
    stored in M at row/column offsets Y->l, X->l.
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscCuBLASInt    m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasHandle_t    cublasv2handle;

  PetscCuBLASIntCast(Y->k-Y->l,&m);   /* rows of the result */
  PetscCuBLASIntCast(X->k-X->l,&n);   /* columns of the result */
  PetscCuBLASIntCast(X->n,&k);        /* local vector length */
  MatDenseGetLDA(M,&ldm);
  PetscCuBLASIntCast(ldm,&ldm_);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArrayWrite(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;   /* destination block within M */
  if (x->mpi) {
    if (ldm==m) {
      /* result is contiguous: reduce straight into M */
      BVAllocateWork_Private(X,m*n);
      if (k) {
        PetscLogGpuTimeBegin();
        cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);
        PetscLogGpuTimeEnd();
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else PetscArrayzero(X->work,m*n);   /* no local rows: contribute zeros */
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      /* result not contiguous in M: reduce into scratch, then copy by columns */
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        PetscLogGpuTimeBegin();
        cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);
        PetscLogGpuTimeEnd();
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else PetscArrayzero(X->work,m*n);
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) PetscArraycpy(C+j*ldm,CC+j*m,m);
    }
  } else {
    /* sequential: download and copy column by column to honor ldm */
    if (k) {
      BVAllocateWork_Private(X,m*n);
      PetscLogGpuTimeBegin();
      cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);
      PetscLogGpuTimeEnd();
      cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      for (j=0;j<n;j++) PetscArraycpy(C+j*ldm,X->work+j*m,m);
    }
  }
  cudaFree(d_work);
  MatDenseRestoreArrayWrite(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogGpuFlops(2.0*m*n*k);
  return 0;
}

#if defined(PETSC_USE_COMPLEX)
/* Unary functor returning the complex conjugate, for use with thrust */
struct conjugate
{
  __host__ __device__ PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

/*
   Conjugate in place the first n entries of the device array a.
*/
PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  thrust::device_ptr<PetscScalar> first;

  try {
    first = thrust::device_pointer_cast(a);
    thrust::transform(first,first+n,first,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  return 0;
}
#endif

/*
    y := A'*x computed as y' := x'*A

    In the complex case the gemm produces the conjugate of the wanted
    values, so the result is conjugated afterwards on the device.
    The result goes either into the internal buffer of X (q==NULL) or
    into the host array q.
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscCuBLASInt    n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasHandle_t    cublasv2handle;

  PetscCuBLASIntCast(X->n,&n);
  PetscCuBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {   /* non-standard inner product: dot against B*y instead */
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) VecCUDAGetArrayWrite(X->buffer,&d_work);
  else cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));
  d_A = d_px+(X->nc+X->l)*X->n;   /* active columns start here */
  d_x = d_py;
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);
      ConjugateCudaArray(d_work,k);   /* gemm gave conj of the result */
#else
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);
#endif
      PetscLogGpuTimeEnd();
      cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
    } else PetscArrayzero(X->work,k);   /* no local rows: contribute zeros */
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);   /* reduce into the host view of the buffer */
    } else cudaFree(d_work);
    PetscMPIIntCast(k,&len);
    MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) VecRestoreArray(X->buffer,&qq);
  } else {
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);
      ConjugateCudaArray(d_work,k);   /* gemm gave conj of the result */
#else
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);
#endif
      PetscLogGpuTimeEnd();
    }
    if (!q) VecCUDARestoreArrayWrite(X->buffer,&d_work);
    else {   /* caller wants the values on the host */
      cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
      cudaFree(d_work);
    }
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return 0;
}

/*
    y := A'*x computed as y' := x'*A

    Local (non-reduced) version: each process keeps only the dot products
    of its own rows, returned in the host array m.
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscCuBLASInt    n=0,k=0,one=1;
  Vec               z = y;
  cublasHandle_t    cublasv2handle;

  PetscCuBLASIntCast(X->n,&n);
  PetscCuBLASIntCast(X->k-X->l,&k);
  if (X->matrix) {   /* non-standard inner product: dot against B*y instead */
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;   /* active columns start here */
  d_x = d_py;
  if (n) {
    cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));
    PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
    cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);
    ConjugateCudaArray(d_y,k);   /* gemm gave conj of the result */
#else
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);
#endif
    PetscLogGpuTimeEnd();
    cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    PetscLogGpuToCpu(k*sizeof(PetscScalar));
    cudaFree(d_y);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return 0;
}

/*
    Scale column j of the BV by alpha (all active columns when j<0)
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array, *d_A;
  PetscCuBLASInt n=0,one=1;
  cublasHandle_t cublasv2handle;

  VecCUDAGetArray(ctx->v,&d_array);
  if (j<0) {   /* the whole active block */
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    PetscCuBLASIntCast((bv->k-bv->l)*bv->n,&n);
  } else {     /* a single column */
    d_A = d_array+(bv->nc+j)*bv->n;
    PetscCuBLASIntCast(bv->n,&n);
  }
  if (alpha == (PetscScalar)0.0) cudaMemset(d_A,0,n*sizeof(PetscScalar));   /* zeroing needs no BLAS call */
  else if (alpha != (PetscScalar)1.0) {   /* scaling by one is a no-op */
    PetscCUBLASGetHandle(&cublasv2handle);
    PetscLogGpuTimeBegin();
    cublasXscal(cublasv2handle,n,&alpha,d_A,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*n);
  }
  VecCUDARestoreArray(ctx->v,&d_array);
  return 0;
}

/*
   W := A*V for the active columns, either as a single mat-mat product
   (when V->vmm is set) or column by column through the work vectors.
*/
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  if (V->vmm) {
    /* one mat-mat product through the MatProduct machinery */
    BVGetMat(V,&Vmat);
    BVGetMat(W,&Wmat);
    MatProductCreateWithMat(A,Vmat,NULL,Wmat);
    MatProductSetType(Wmat,MATPRODUCT_AB);
    MatProductSetFromOptions(Wmat);
    MatProductSymbolic(Wmat);
    MatProductNumeric(Wmat);
    MatProductClear(Wmat);
    BVRestoreMat(V,&Vmat);
    BVRestoreMat(W,&Wmat);
  } else {
    /* one MatMult per active column, placing the column data in cv[1] */
    VecCUDAGetArrayRead(v->v,&d_pv);
    VecCUDAGetArrayWrite(w->v,&d_pw);
    for (j=0;j<V->k-V->l;j++) {
      /* the cast drops const: the array is only read through V->cv[1] */
      VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n);
      VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
      MatMult(A,V->cv[1],W->cv[1]);
      VecCUDAResetArray(V->cv[1]);
      VecCUDAResetArray(W->cv[1]);
    }
    VecCUDARestoreArrayRead(v->v,&d_pv);
    VecCUDARestoreArrayWrite(w->v,&d_pw);
  }
  return 0;
}

/*
   Copy the active columns of V into the corresponding columns of W,
   entirely on the device.
*/
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  d_pvc = d_pv+(V->nc+V->l)*V->n;   /* first active column of V */
  d_pwc = d_pw+(W->nc+W->l)*W->n;   /* first active column of W */
  cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return 0;
}

/*
   Copy column j of V into column i, on the device.
*/
PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;

  /* guard: cudaMemcpy with overlapping (here identical) source and
     destination ranges is undefined, and there is nothing to do anyway */
  if (j==i) return 0;
  VecCUDAGetArray(v->v,&d_pv);
  cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  VecCUDARestoreArray(v->v,&d_pv);
  return 0;
}

/*
   Resize the BV to m columns, replacing the backing vector by a new one
   of the appropriate size; optionally preserve the existing columns.
*/
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];

  VecGetBlockSize(bv->t,&bs);
  VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
  VecSetType(vnew,((PetscObject)bv->t)->type_name);   /* same type as the template vector */
  VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
  VecSetBlockSize(vnew,bs);
  if (((PetscObject)bv)->name) {   /* propagate the BV name to the new vector */
    PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);
    PetscObjectSetName((PetscObject)vnew,str);
  }
  if (copy) {   /* keep the first min(m,old m) columns */
    VecCUDAGetArrayRead(ctx->v,&d_pv);
    VecCUDAGetArrayWrite(vnew,&d_pnew);
    cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
    VecCUDARestoreArrayRead(ctx->v,&d_pv);
    VecCUDARestoreArrayWrite(vnew,&d_pnew);
  }
  VecDestroy(&ctx->v);
  ctx->v = vnew;
  return 0;
}

/*
   Expose column j as a Vec, by placing the column's device data in one
   of the two work vectors.
*/
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  l = BVAvailableVec;   /* index of the currently free work vector */
  VecCUDAGetArray(ctx->v,&d_pv);
  /* the array is restored later, in BVRestoreColumn_Svec_CUDA */
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  return 0;
}

/*
   Counterpart of BVGetColumn_Svec_CUDA: detach the column data from the
   work vector and release the access taken on the backing vector.
*/
PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  l = (j==bv->ci[0])? 0: 1;   /* which work vector holds column j */
  VecCUDAResetArray(bv->cv[l]);
  VecCUDARestoreArray(ctx->v,NULL);   /* matches the get in BVGetColumn */
  return 0;
}

/*
   After the split parts have been used, make sure the parent's data is
   marked as synchronized on the GPU if either part was modified there.
*/
PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {   /* the left part was modified */
      v = ((BV_SVEC*)bv->L->data)->v;
      /* a read get/restore pair updates the device copy of v */
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {   /* the right part was modified */
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    /* a write get/restore pair on the parent marks its data as changed */
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArray(v,(PetscScalar **)&d_pv);
    VecCUDARestoreArray(v,(PetscScalar **)&d_pv);
  }
  return 0;
}

/*
   Expose the active columns as a dense CUDA Mat (bv->Aget), placing the
   BV's device array in it.  The Mat is cached and recreated only when
   the number of active columns changes.
*/
PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;
  PetscBool      create=PETSC_FALSE;
  PetscInt       m,cols;

  m = bv->k-bv->l;
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    MatDenseCUDAGetArray(bv->Aget,&aa);
    /* between get/restore the cached Mat holds a NULL array; a non-NULL
       value means the previous BVGetMat was never matched by BVRestoreMat,
       and placing a new array now would clobber the previous one */
    if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ORDER,"BVGetMat already called on this BV");
    MatGetSize(bv->Aget,NULL,&cols);
    if (cols!=m) {   /* number of active columns changed: rebuild the Mat */
      MatDestroy(&bv->Aget);
      create=PETSC_TRUE;
    }
  }
  VecCUDAGetArray(ctx->v,&vv);
  if (create) {
    MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget); /* pass a pointer to avoid allocation of storage */
    MatDenseCUDAReplaceArray(bv->Aget,NULL);  /* replace with a null pointer, the value after BVRestoreMat */
  }
  MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);  /* set the actual pointer */
  *A = bv->Aget;
  return 0;
}

/*
   Counterpart of BVGetMat_Svec_CUDA: detach the placed array from the
   cached Mat and release the access taken on the backing vector.
*/
PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;

  MatDenseCUDAGetArray(bv->Aget,&aa);
  vv = aa-(bv->nc+bv->l)*bv->n;   /* recover the base pointer placed in BVGetMat */
  MatDenseCUDAResetArray(bv->Aget);
  VecCUDARestoreArray(ctx->v,&vv);
  *A = NULL;
  return 0;
}