Actual source code: sveccuda.cu
slepc-3.12.0 2019-09-30
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>   /* thrust::transform is used in ConjugateCudaArray below */
#endif

#define BLOCKSIZE 64

/*
   B := alpha*A + beta*B

   A,B are nxk (ld=n)
*/
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
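
/*
   Rationale: since ld=n, the n x k blocks are contiguous in memory, so the
   matrix update collapses to two BLAS-1 calls on vectors of length m = n*k,
   i.e. (sketch):

     cublasXscal(handle,m,&beta,d_B,1);          // B := beta*B, skipped if beta==1
     cublasXaxpy(handle,m,&alpha,d_A,1,d_B,1);   // B := alpha*A + B

   which matches the m + 2*m flops logged above.
*/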

/*
   C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          m,n,k,ldq,mq;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  m = Y->n;
  n = Y->k-Y->l;
  k = X->k-X->l;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
    d_B = d_q+Y->l*ldq+X->l;
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
    err = cudaFree(d_q);CHKERRCUDA(err);
    ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,m,n,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y->v,&d_py);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
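
/*
   Usage sketch (illustrative, not part of this file): this kernel is reached
   through the public BVMult interface, with Q a sequential dense matrix, or
   NULL for the pure AXPY case. Assuming kx and ky active columns in X and Y:

     Mat Q;
     ierr = MatCreateSeqDense(PETSC_COMM_SELF,kx,ky,NULL,&Q);CHKERRQ(ierr);
     // ... fill Q with MatDenseGetArray/MatDenseRestoreArray ...
     ierr = BVMult(Y,1.0,0.0,X,Q);CHKERRQ(ierr);   // Y := X*Q
*/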

/*
   y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaFree(d_q);CHKERRCUDA(err);
  }
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
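
/*
   Usage sketch (illustrative): the coefficients q form a host array of
   length k = X->k - X->l; when q is NULL the internal X->buffer is used
   instead of allocating and copying. For example, with 3 active columns:

     PetscScalar q[3] = {1.0,2.0,3.0};
     ierr = BVMultVec(X,1.0,0.0,y,q);CHKERRQ(ierr);   // y := X*q
*/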

/*
   A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,l,ldq,nq,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  cudaError_t    err;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* try to allocate the whole matrix */
  err = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(err);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    }
  } else {
    /* blocked computation: process bs rows of the result at a time */
    bs = freemem/(m*sizeof(PetscScalar));
    err = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(err);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
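
/*
   Note on the blocked path: when the m x n workspace does not fit in free GPU
   memory, the result is computed bs = freemem/(m*sizeof(PetscScalar)) rows at
   a time, after a remainder chunk of l = m % bs rows. As a hypothetical
   example with 8-byte scalars, m = 10^7 local rows and n = 16 columns would
   need 10^7*16*8 bytes = 1.28 GB of workspace, triggering the fallback.
   The public entry point is:

     ierr = BVMultInPlace(V,Q,s,e);CHKERRQ(ierr);   // V(:,s:e-1) := V*Q(:,s:e-1)
*/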

/*
   A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,ldq,nq;
  cublasStatus_t cberr;
  cudaError_t    err;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          ldm,m,n,k,j;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  m = Y->k-Y->l;
  n = X->k-X->l;
  k = X->n;
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
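
/*
   Usage sketch (illustrative): BVDot fills M := Y'*X for the active columns.
   In parallel, each process computes its local GEMM, copies it to the host,
   and the pieces are combined with MPI_Allreduce, so M ends up replicated on
   all processes. M must be a sequential dense matrix; with ky and kx the
   assumed numbers of active columns of Y and X:

     Mat M;
     ierr = MatCreateSeqDense(PETSC_COMM_SELF,ky,kx,NULL,&M);CHKERRQ(ierr);
     ierr = BVDot(X,Y,M);CHKERRQ(ierr);
*/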

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a,PetscInt n)
{
  PetscErrorCode                  ierr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s",ex);
  }
  PetscFunctionReturn(0);
}
#endif
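
/*
   ConjugateCudaArray conjugates the result of the row-vector GEMMs below:
   they compute x'*A, whose conjugate is the desired A'*x. A minimal raw-CUDA
   equivalent of the Thrust call (hypothetical sketch, not used here):

     __global__ void ConjKernel(PetscScalar *a,PetscInt n)
     {
       PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
       if (i<n) a[i] = PetscConj(a[i]);
     }

     // launched as ConjKernel<<<(n+255)/256,256>>>(a,n);
*/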

/*
   y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      err = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      err = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
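
/*
   Usage sketch (illustrative): BVDotVec computes m := X'*y, applying the
   inner-product matrix X->matrix first when one is set; with q == NULL the
   result is kept in X->buffer instead of a user array:

     PetscScalar m[KMAX];   // KMAX >= number of active columns (hypothetical bound)
     ierr = BVDotVec(X,y,m);CHKERRQ(ierr);
*/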

/*
   y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    err = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(err);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    err = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    err = cudaFree(d_y);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cudaError_t    err;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    n = (bv->k-bv->l)*bv->n;
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    n = bv->n;
  }
  if (alpha == (PetscScalar)0.0) {
    err = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
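
/*
   Usage sketch (illustrative): j < 0 scales all active columns at once
   (a single contiguous block), j >= 0 scales one column. Both cases are
   reached through the public interface:

     ierr = BVScale(bv,alpha);CHKERRQ(ierr);           // all active columns
     ierr = BVScaleColumn(bv,j,alpha);CHKERRQ(ierr);   // column j only
*/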

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  for (j=0;j<V->k-V->l;j++) {
    ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
    ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
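
/*
   Usage sketch (illustrative): the public BVMatMult applies A to every active
   column, and this implementation avoids copies by placing each column's
   device pointer directly into the work vectors V->cv[1] and W->cv[1]:

     ierr = BVMatMult(V,A,W);CHKERRQ(ierr);   // W(:,j) := A*V(:,j) for each active j
*/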

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  err = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  PetscErrorCode ierr;
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
  err = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
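
/*
   Usage sketch (illustrative): both routines stay entirely on the device
   (cudaMemcpyDeviceToDevice), so no host transfer is involved:

     ierr = BVCopy(V,W);CHKERRQ(ierr);           // active columns of V into W
     ierr = BVCopyColumn(V,j,i);CHKERRQ(ierr);   // column j of V into column i
*/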

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       err;
  BV                parent;

  PetscFunctionBegin;
  if (bv->issplit==2) {
    /* split BV: re-point into the parent's storage instead of reallocating */
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    ierr = VecCUDAResetArray(ctx->v);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(vpar,&d_ptr);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(vpar,&d_ptr);CHKERRQ(ierr);
  } else if (!bv->issplit) {
    ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
    ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
    ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
    ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
    if (((PetscObject)bv)->name) {
      ierr = PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
      ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
    }
    if (copy) {
      ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
      err = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    }
    ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
    ctx->v = vnew;
  }
  PetscFunctionReturn(0);
}
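
/*
   Usage sketch (illustrative): resizing allocates a new backing Vec of
   m*bv->n local entries, optionally copying the old columns; for a BV
   obtained from a split (issplit==2 above) it only re-points into the
   parent's storage:

     ierr = BVResize(bv,mnew,PETSC_TRUE);CHKERRQ(ierr);   // mnew: new number of columns
*/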

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    ierr = VecCUDAGetArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}