Actual source code: sveccuda.cu
slepc-3.17.0 2022-03-31
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <slepccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#endif

#define BLOCKSIZE 64

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
*/
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscCuBLASInt m=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscCuBLASIntCast(n_*k_,&m);
  PetscLogGpuTimeBegin();
  if (beta!=(PetscScalar)1.0) {
    cublasXscal(cublasv2handle,m,&beta,d_B,one);
    PetscLogGpuFlops(1.0*m);
  }
  cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*m);
  PetscFunctionReturn(0);
}

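The scal/axpy pair above is the standard level-1 BLAS formulation of B := alpha*A + beta*B on contiguous storage. For reference, here is a minimal self-contained sketch of the same pattern with raw cuBLAS, assuming double precision, tiny hypothetical sizes, and no error checking; it is an illustration, not SLEPc code.

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>

int main(void)
{
  const int      n = 8;                /* total entries, i.e. n_*k_ above */
  double         hA[8],hB[8],alpha = 2.0,beta = 3.0;
  double         *dA,*dB;
  cublasHandle_t handle;
  int            i;

  for (i=0;i<n;i++) { hA[i] = 1.0; hB[i] = 1.0; }
  cudaMalloc((void**)&dA,n*sizeof(double));
  cudaMalloc((void**)&dB,n*sizeof(double));
  cudaMemcpy(dA,hA,n*sizeof(double),cudaMemcpyHostToDevice);
  cudaMemcpy(dB,hB,n*sizeof(double),cudaMemcpyHostToDevice);
  cublasCreate(&handle);
  /* B := beta*B, then B := alpha*A + B, i.e. B := alpha*A + beta*B */
  cublasDscal(handle,n,&beta,dB,1);
  cublasDaxpy(handle,n,&alpha,dA,1,dB,1);
  cudaMemcpy(hB,dB,n*sizeof(double),cudaMemcpyDeviceToHost);
  printf("B[0] = %g (expected %g)\n",hB[0],alpha*1.0+beta*1.0);
  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dB);
  return 0;
}
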
/*
    C := alpha*A*Q + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A,*q;
  PetscScalar       *d_py,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
  cublasHandle_t    cublasv2handle;
  PetscBool         matiscuda;

  PetscFunctionBegin;
  if (!Y->n) PetscFunctionReturn(0);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) VecCUDAGetArrayWrite(y->v,&d_py);
  else VecCUDAGetArray(y->v,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    PetscCuBLASIntCast(Y->n,&m);
    PetscCuBLASIntCast(Y->k-Y->l,&n);
    PetscCuBLASIntCast(X->k-X->l,&k);
    PetscCUBLASGetHandle(&cublasv2handle);
    MatGetSize(Q,NULL,&mq);
    MatDenseGetLDA(Q,&ldq);
    PetscCuBLASIntCast(ldq,&ldq_);
    PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
    if (matiscuda) MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
    else {  /* host matrix: copy coefficients to the device */
      MatDenseGetArrayRead(Q,&q);
      cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));
      cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
      PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));
    }
    d_B = d_q+Y->l*ldq+X->l;
    PetscLogGpuTimeBegin();
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);
    PetscLogGpuTimeEnd();
    if (matiscuda) MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
    else {
      MatDenseRestoreArrayRead(Q,&q);
      cudaFree(d_q);
    }
    PetscLogGpuFlops(2.0*m*n*k);
  } else BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);
  VecCUDARestoreArrayRead(x->v,&d_px);
  /* the restore must match the get call selected by beta above */
  if (beta==(PetscScalar)0.0) VecCUDARestoreArrayWrite(y->v,&d_py);
  else VecCUDARestoreArray(y->v,&d_py);
  PetscFunctionReturn(0);
}

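For context, this kernel is normally reached through the public BVMult() interface. The driver below is a hypothetical minimal sketch, assuming the default BVSVEC type; run it with -vec_type cuda so that the CUDA implementation above is dispatched. Sizes and coefficients are arbitrary.

#include <slepcbv.h>

int main(int argc,char **argv)
{
  BV       X,Y;
  Mat      Q;
  PetscInt i,j;

  PetscCall(SlepcInitialize(&argc,&argv,NULL,NULL));
  PetscCall(BVCreate(PETSC_COMM_WORLD,&X));
  PetscCall(BVSetSizes(X,PETSC_DECIDE,32,4));   /* 4 columns of global length 32 */
  PetscCall(BVSetFromOptions(X));
  PetscCall(BVSetRandom(X));
  PetscCall(BVDuplicateResize(X,3,&Y));         /* same layout, 3 columns */
  PetscCall(BVSetRandom(Y));
  /* Q is a 4x3 coefficient matrix, replicated on every rank */
  PetscCall(MatCreateSeqDense(PETSC_COMM_SELF,4,3,NULL,&Q));
  for (i=0;i<4;i++) for (j=0;j<3;j++) PetscCall(MatSetValue(Q,i,j,(PetscScalar)(i==j),INSERT_VALUES));
  PetscCall(MatAssemblyBegin(Q,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(Q,MAT_FINAL_ASSEMBLY));
  PetscCall(BVMult(Y,2.0,1.0,X,Q));             /* Y := 2*X*Q + Y */
  PetscCall(MatDestroy(&Q));
  PetscCall(BVDestroy(&Y));
  PetscCall(BVDestroy(&X));
  PetscCall(SlepcFinalize());
  return 0;
}
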
/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscCuBLASInt    n=0,k=0,one=1;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCuBLASIntCast(X->n,&n);
  PetscCuBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) VecCUDAGetArrayWrite(y,&d_py);
  else VecCUDAGetArray(y,&d_py);
  if (!q) VecCUDAGetArray(X->buffer,&d_q);
  else {  /* host coefficients: copy them to the device */
    cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);
    PetscLogCpuToGpu(k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  PetscLogGpuTimeBegin();
  cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) VecCUDARestoreArrayWrite(y,&d_py);
  else VecCUDARestoreArray(y,&d_py);
  if (!q) VecCUDARestoreArray(X->buffer,&d_q);
  else cudaFree(d_q);
  PetscLogGpuFlops(2.0*n*k);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,l=0,ldq_=0,bs=BLOCKSIZE;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  PetscBool         matiscuda;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  PetscCuBLASIntCast(V->n,&m);
  PetscCuBLASIntCast(e-s,&n);
  PetscCuBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,NULL,&nq);
  MatDenseGetLDA(Q,&ldq);
  PetscCuBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
  if (matiscuda) MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
  else {
    MatDenseGetArrayRead(Q,&q);
    cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
    PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  /* try to allocate the whole matrix */
  cudaMemGetInfo(&freemem,&totmem);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);
    cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
  } else {
    /* blocked computation: split the m rows into blocks that fit in free memory */
    PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs);
    cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));
    PetscCuBLASIntCast(m % bs,&l);
    if (l) {  /* remainder rows first, so the loop below uses full bs-row blocks */
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);
      cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);
      cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
    }
  }
  PetscLogGpuTimeEnd();
  if (matiscuda) MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
  else {
    MatDenseRestoreArrayRead(Q,&q);
    cudaFree(d_q);
  }
  cudaFree(d_work);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  PetscFunctionReturn(0);
}

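When the m-by-n workspace does not fit in free device memory, the routine above processes the m rows in blocks: the remainder m % bs is handled first, so that every subsequent block has exactly bs rows and the loop terminates exactly at m. The plain-C sketch below (hypothetical sizes) shows just that partitioning arithmetic.

#include <stdio.h>

int main(void)
{
  int m = 1000;   /* rows of the result */
  int bs = 64;    /* block size, cf. BLOCKSIZE in the listing */
  int l = m % bs; /* remainder handled first, as in the l-row gemm above */

  if (l) printf("first block: rows [0,%d)\n",l);
  for (;l<m;l+=bs) printf("block: rows [%d,%d)\n",l,l+bs);
  return 0;
}
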
/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceHermitianTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
  cublasHandle_t    cublasv2handle;
  PetscBool         matiscuda;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  PetscCuBLASIntCast(V->n,&m);
  PetscCuBLASIntCast(e-s,&n);
  PetscCuBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,NULL,&nq);
  MatDenseGetLDA(Q,&ldq);
  PetscCuBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda);
  if (matiscuda) MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q);
  else {
    MatDenseGetArrayRead(Q,&q);
    cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
    PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);
  cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
  PetscLogGpuTimeEnd();
  if (matiscuda) MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q);
  else {
    MatDenseRestoreArrayRead(Q,&q);
    cudaFree(d_q);
  }
  cudaFree(d_work);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscCuBLASInt    m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCuBLASIntCast(Y->k-Y->l,&m);
  PetscCuBLASIntCast(X->k-X->l,&n);
  PetscCuBLASIntCast(X->n,&k);
  MatDenseGetLDA(M,&ldm);
  PetscCuBLASIntCast(ldm,&ldm_);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArrayWrite(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {  /* result is contiguous: reduce directly into M */
      BVAllocateWork_Private(X,m*n);
      if (k) {
        PetscLogGpuTimeBegin();
        cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);
        PetscLogGpuTimeEnd();
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else PetscArrayzero(X->work,m*n);
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {  /* reduce into a packed buffer, then copy column by column */
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        PetscLogGpuTimeBegin();
        cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);
        PetscLogGpuTimeEnd();
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else PetscArrayzero(X->work,m*n);
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) PetscArraycpy(C+j*ldm,CC+j*m,m);
    }
  } else {
    if (k) {
      BVAllocateWork_Private(X,m*n);
      PetscLogGpuTimeBegin();
      cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);
      PetscLogGpuTimeEnd();
      cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      for (j=0;j<n;j++) PetscArraycpy(C+j*ldm,X->work+j*m,m);
    }
  }
  cudaFree(d_work);
  MatDenseRestoreArrayWrite(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogGpuFlops(2.0*m*n*k);
  PetscFunctionReturn(0);
}

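The public entry point for this kernel is BVDot(), which fills a pre-allocated sequential dense matrix with Y'*X and, in parallel runs, reduces the result over all ranks as above. A hypothetical minimal driver, assuming the default BVSVEC type (add -vec_type cuda for the GPU path):

#include <slepcbv.h>

int main(int argc,char **argv)
{
  BV  X,Y;
  Mat M;

  PetscCall(SlepcInitialize(&argc,&argv,NULL,NULL));
  PetscCall(BVCreate(PETSC_COMM_WORLD,&X));
  PetscCall(BVSetSizes(X,PETSC_DECIDE,32,4));
  PetscCall(BVSetFromOptions(X));
  PetscCall(BVSetRandom(X));
  PetscCall(BVDuplicateResize(X,3,&Y));
  PetscCall(BVSetRandom(Y));
  /* M is sequential dense, with at least 3 rows (columns of Y) and 4 columns (columns of X) */
  PetscCall(MatCreateSeqDense(PETSC_COMM_SELF,3,4,NULL,&M));
  PetscCall(BVDot(X,Y,M));          /* M = Y'*X, reduced over all ranks */
  PetscCall(MatDestroy(&M));
  PetscCall(BVDestroy(&Y));
  PetscCall(BVDestroy(&X));
  PetscCall(SlepcFinalize());
  return 0;
}
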
#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(0);
}
#endif

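ConjugateCudaArray() applies the functor in place through thrust::transform over a raw device pointer. The standalone sketch below shows the same pattern, with thrust::complex<double> standing in for PetscScalar (an assumption made for self-containment); compile with nvcc.

// conj_demo.cu: negate the imaginary parts of a device array in place
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/complex.h>
#include <cstdio>

struct conjugate {
  __host__ __device__
  thrust::complex<double> operator()(thrust::complex<double> x) {
    return thrust::conj(x);
  }
};

int main(void)
{
  thrust::device_vector<thrust::complex<double> > v(4,thrust::complex<double>(1.0,2.0));
  thrust::transform(v.begin(),v.end(),v.begin(),conjugate());
  thrust::complex<double> first = v[0];
  std::printf("v[0] = (%g,%g)\n",first.real(),first.imag()); /* prints (1,-2) */
  return 0;
}
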
/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscCuBLASInt    n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCuBLASIntCast(X->n,&n);
  PetscCuBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {  /* non-standard inner product: dot against B*x */
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) VecCUDAGetArrayWrite(X->buffer,&d_work);
  else cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);
      ConjugateCudaArray(d_work,k);
#else
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);
#endif
      PetscLogGpuTimeEnd();
      cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
    } else PetscArrayzero(X->work,k);
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);
    } else cudaFree(d_work);
    PetscMPIIntCast(k,&len);
    MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) VecRestoreArray(X->buffer,&qq);
  } else {
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);
      ConjugateCudaArray(d_work,k);
#else
      cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);
#endif
      PetscLogGpuTimeEnd();
    }
    if (!q) VecCUDARestoreArrayWrite(X->buffer,&d_work);
    else {
      cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
      cudaFree(d_work);
    }
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  PetscFunctionReturn(0);
}

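The trick stated in the comment, computing y' := x'*A instead of y := A'*x, turns the operation into a gemm with a single result row. A self-contained real-arithmetic sketch of that call mapping (hypothetical sizes; in complex arithmetic the listing conjugates the result afterwards, as in ConjugateCudaArray):

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>

int main(void)
{
  const int      n = 4,k = 3;  /* A is n-by-k, x has n entries */
  double         hA[12],hx[4],hy[3],one = 1.0,zero = 0.0;
  double         *dA,*dx,*dy;
  cublasHandle_t handle;
  int            i;

  for (i=0;i<n*k;i++) hA[i] = 1.0;
  for (i=0;i<n;i++)   hx[i] = 2.0;
  cudaMalloc((void**)&dA,n*k*sizeof(double));
  cudaMalloc((void**)&dx,n*sizeof(double));
  cudaMalloc((void**)&dy,k*sizeof(double));
  cudaMemcpy(dA,hA,n*k*sizeof(double),cudaMemcpyHostToDevice);
  cudaMemcpy(dx,hx,n*sizeof(double),cudaMemcpyHostToDevice);
  cublasCreate(&handle);
  /* y' := x'*A as a (1 x n)*(n x k) gemm; the 1-row operand x has lda=1 */
  cublasDgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,1,k,n,&one,dx,1,dA,n,&zero,dy,1);
  cudaMemcpy(hy,dy,k*sizeof(double),cudaMemcpyDeviceToHost);
  for (i=0;i<k;i++) printf("y[%d] = %g (expected %g)\n",i,hy[i],2.0*n);
  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dx); cudaFree(dy);
  return 0;
}
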
/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscCuBLASInt    n=0,k=0,one=1;
  Vec               z = y;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCuBLASIntCast(X->n,&n);
  PetscCuBLASIntCast(X->k-X->l,&k);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));
    PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
    cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);
    ConjugateCudaArray(d_y,k);
#else
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);
#endif
    PetscLogGpuTimeEnd();
    cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    PetscLogGpuToCpu(k*sizeof(PetscScalar));
    cudaFree(d_y);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscCuBLASInt n=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  VecCUDAGetArray(ctx->v,&d_array);
  if (j<0) {  /* scale all active columns */
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    PetscCuBLASIntCast((bv->k-bv->l)*bv->n,&n);
  } else {    /* scale column j only */
    d_A = d_array+(bv->nc+j)*bv->n;
    PetscCuBLASIntCast(bv->n,&n);
  }
  if (alpha == (PetscScalar)0.0) cudaMemset(d_A,0,n*sizeof(PetscScalar));
  else if (alpha != (PetscScalar)1.0) {
    PetscCUBLASGetHandle(&cublasv2handle);
    PetscLogGpuTimeBegin();
    cublasXscal(cublasv2handle,n,&alpha,d_A,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*n);
  }
  VecCUDARestoreArray(ctx->v,&d_array);
  PetscFunctionReturn(0);
}

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  if (V->vmm) {  /* compute W = A*V as a single Mat-Mat product */
    BVGetMat(V,&Vmat);
    BVGetMat(W,&Wmat);
    MatProductCreateWithMat(A,Vmat,NULL,Wmat);
    MatProductSetType(Wmat,MATPRODUCT_AB);
    MatProductSetFromOptions(Wmat);
    MatProductSymbolic(Wmat);
    MatProductNumeric(Wmat);
    MatProductClear(Wmat);
    BVRestoreMat(V,&Vmat);
    BVRestoreMat(W,&Wmat);
  } else {  /* fall back to one MatMult per active column */
    VecCUDAGetArrayRead(v->v,&d_pv);
    VecCUDAGetArrayWrite(w->v,&d_pw);
    for (j=0;j<V->k-V->l;j++) {
      VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);
      VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
      MatMult(A,V->cv[1],W->cv[1]);
      VecCUDAResetArray(V->cv[1]);
      VecCUDAResetArray(W->cv[1]);
    }
    VecCUDARestoreArrayRead(v->v,&d_pv);
    VecCUDARestoreArrayWrite(w->v,&d_pw);
  }
  PetscFunctionReturn(0);
}

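User code reaches this routine via BVMatMult(). The sketch below is a hypothetical driver: A is assembled as the identity just to have a well-defined product, and in a CUDA build the options -mat_type aijcusparse -vec_type cuda keep the whole product on the GPU.

#include <slepcbv.h>

int main(int argc,char **argv)
{
  BV       V,W;
  Mat      A;
  PetscInt n=32;

  PetscCall(SlepcInitialize(&argc,&argv,NULL,NULL));
  PetscCall(MatCreate(PETSC_COMM_WORLD,&A));
  PetscCall(MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,n,n));
  PetscCall(MatSetFromOptions(A));
  PetscCall(MatSetUp(A));
  PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
  PetscCall(MatShift(A,1.0));       /* A = I */
  PetscCall(BVCreate(PETSC_COMM_WORLD,&V));
  PetscCall(BVSetSizes(V,PETSC_DECIDE,n,4));
  PetscCall(BVSetFromOptions(V));
  PetscCall(BVSetRandom(V));
  PetscCall(BVDuplicate(V,&W));
  PetscCall(BVMatMult(V,A,W));      /* W(:,j) = A*V(:,j) for the active columns */
  PetscCall(BVDestroy(&W));
  PetscCall(BVDestroy(&V));
  PetscCall(MatDestroy(&A));
  PetscCall(SlepcFinalize());
  return 0;
}
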
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;

  PetscFunctionBegin;
  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC     *v = (BV_SVEC*)V->data;
  PetscScalar *d_pv;

  PetscFunctionBegin;
  VecCUDAGetArray(v->v,&d_pv);
  cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  VecCUDARestoreArray(v->v,&d_pv);
  PetscFunctionReturn(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];

  PetscFunctionBegin;
  VecGetBlockSize(bv->t,&bs);
  VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
  VecSetType(vnew,((PetscObject)bv->t)->type_name);
  VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
  VecSetBlockSize(vnew,bs);
  PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
  if (((PetscObject)bv)->name) {
    PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);
    PetscObjectSetName((PetscObject)vnew,str);
  }
  if (copy) {
    VecCUDAGetArrayRead(ctx->v,&d_pv);
    VecCUDAGetArrayWrite(vnew,&d_pnew);
    cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
    VecCUDARestoreArrayRead(ctx->v,&d_pv);
    VecCUDARestoreArrayWrite(vnew,&d_pnew);
  }
  VecDestroy(&ctx->v);
  ctx->v = vnew;
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *d_pv;
  PetscInt    l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  VecCUDAGetArray(ctx->v,&d_pv);
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC  *ctx = (BV_SVEC*)bv->data;
  PetscInt l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  VecCUDAResetArray(bv->cv[l]);
  VecCUDARestoreArray(ctx->v,NULL);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArray(v,(PetscScalar**)&d_pv);
    VecCUDARestoreArray(v,(PetscScalar**)&d_pv);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *vv,*aa;
  PetscBool   create=PETSC_FALSE;
  PetscInt    m,cols;

  PetscFunctionBegin;
  m = bv->k-bv->l;
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    MatDenseCUDAGetArray(bv->Aget,&aa);
    /* a BV can have at most one exposed Mat at a time */
    if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    MatGetSize(bv->Aget,NULL,&cols);
    if (cols!=m) {
      MatDestroy(&bv->Aget);
      create=PETSC_TRUE;
    }
  }
  VecCUDAGetArray(ctx->v,&vv);
  if (create) {
    MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget);  /* pass a pointer to avoid allocation of storage */
    MatDenseCUDAReplaceArray(bv->Aget,NULL);  /* replace with a null pointer, the value after BVRestoreMat */
    PetscLogObjectParent((PetscObject)bv,(PetscObject)bv->Aget);
  }
  MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);  /* set the actual pointer */
  *A = bv->Aget;
  PetscFunctionReturn(0);
}

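The intended usage of this pair, sketched below under the assumption of the default BVSVEC type: BVGetMat() exposes the active columns as a dense matrix that shares the BV storage (no copy), and BVRestoreMat() must be called before the BV is used again.

#include <slepcbv.h>

int main(int argc,char **argv)
{
  BV  X;
  Mat A;

  PetscCall(SlepcInitialize(&argc,&argv,NULL,NULL));
  PetscCall(BVCreate(PETSC_COMM_WORLD,&X));
  PetscCall(BVSetSizes(X,PETSC_DECIDE,32,4));
  PetscCall(BVSetFromOptions(X));
  PetscCall(BVSetRandom(X));
  PetscCall(BVGetMat(X,&A));        /* dense view of the active columns, no copy */
  PetscCall(MatScale(A,2.0));       /* operate on the columns as a regular Mat */
  PetscCall(BVRestoreMat(X,&A));    /* must be matched before using X again */
  PetscCall(BVDestroy(&X));
  PetscCall(SlepcFinalize());
  return 0;
}
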
PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *vv,*aa;

  PetscFunctionBegin;
  MatDenseCUDAGetArray(bv->Aget,&aa);
  vv = aa-(bv->nc+bv->l)*bv->n;
  MatDenseCUDAResetArray(bv->Aget);
  VecCUDARestoreArray(ctx->v,&vv);
  *A = NULL;
  PetscFunctionReturn(0);
}