Actual source code: bvorthogcuda.cu
slepc-3.16.3 2022-04-11
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV orthogonalization routines (CUDA)
*/

#include <slepc/private/bvimpl.h>
#include <slepcblaslapack.h>
#include <slepccublas.h>
/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscErrorCode ierr;
  PetscScalar    *d_hh,*d_a;
  PetscInt       i;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&d_a);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    d_hh = d_a + j*(bv->nc+bv->m);
    cerr = cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_a);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(0);
}
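/*
   Illustrative sketch, not part of the SLEPc sources: the helpers in this file assume
   that bv->buffer stores one coefficient column of length bv->nc+bv->m per BV column,
   so column j starts at offset j*(bv->nc+bv->m) and row i (counted after the bv->nc
   constraint rows) of column k sits at k*(bv->nc+bv->m)+bv->nc+i. The hypothetical
   helper below just restates that layout.
*/
static inline PetscInt BVBufferEntry_Sketch(BV bv,PetscInt i,PetscInt k)
{
  /* flat index of row i (after constraints) of column k in the coefficients buffer */
  return k*(bv->nc+bv->m) + bv->nc + i;
}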
/*
   BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
*/
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscCuBLASInt idx=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(bv->buffer,&d_c);CHKERRQ(ierr);
    d_h = d_c + j*(bv->nc+bv->m);
    ierr = PetscCuBLASIntCast(bv->nc+j,&idx);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_c);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
    ierr = PetscLogFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
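/*
   Illustrative sketch, not part of the SLEPc sources: with alpha=1 the cuBLAS axpy above
   is just an elementwise accumulation of the scratch column (column 0 of bv->buffer)
   into column j, i.e. the same update the host branch writes out as a loop. A
   hypothetical host-side equivalent:
*/
static inline void AccumulateScratch_Sketch(PetscInt n,const PetscScalar *c,PetscScalar *h)
{
  PetscInt i;
  for (i=0;i<n;i++) h[i] += c[i];   /* h += 1.0*c, the axpy performed on the GPU */
}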
/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*a;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&a);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&a);CHKERRQ(ierr);
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(0);
}
/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscCuBLASInt    idx=0,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = PetscCuBLASIntCast(bv->nc+j,&idx);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2.0*(bv->nc+j));CHKERRQ(ierr);
    *sum = PetscRealPart(dot);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
    ierr = PetscLogFlops(2.0*(bv->nc+j));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
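/*
   Illustrative sketch, not part of the SLEPc sources: cublasXdotc conjugates its first
   argument, so the device branch computes sum_i conj(h_i)*h_i = sum_i |h_i|^2, the same
   quantity as the host loop; the result is real up to rounding, which is why only
   PetscRealPart(dot) is kept. A hypothetical host-side equivalent:
*/
static inline PetscReal SquareSum_Sketch(PetscInt n,const PetscScalar *h)
{
  PetscInt  i;
  PetscReal s = 0.0;
  for (i=0;i<n;i++) s += PetscRealPart(h[i]*PetscConj(h[i]));   /* accumulate |h_i|^2 */
  return s;
}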
#define X_AXIS 0
#define BLOCK_SIZE_X 64
#define TILE_SIZE_X 16 /* work to be done by any thread on axis x */
/*
   Set the kernels grid dimensions
   xcount: number of kernel calls needed for the requested size
*/
PetscErrorCode SetGrid1D(PetscInt n,dim3 *dimGrid,dim3 *dimBlock,PetscInt *xcount)
{
  PetscInt              one=1;
  PetscBLASInt          card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  PetscFunctionBegin;
  *xcount = 1;
  if (n>BLOCK_SIZE_X) {
    dimBlock->x = BLOCK_SIZE_X;
    dimGrid->x  = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
    dimGrid->x  = one;
  }
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}
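/*
   Illustrative worked example, not part of the SLEPc sources: each block has
   BLOCK_SIZE_X=64 threads and every thread processes TILE_SIZE_X=16 entries, so one
   block covers 64*16=1024 entries. For n=100000 this gives dimBlock->x=64,
   dimGrid->x=(100000+1023)/1024=98 and xcount=1; only when the required number of
   blocks exceeds devprop.maxGridSize[X_AXIS] is the work split into xcount successive
   launches of the capped grid.
*/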
/* pointwise multiplication */
__global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = xcount*gridDim.x*blockDim.x*TILE_SIZE_X+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] *= PetscRealPart(b[i]);
  }
}
/* pointwise division */
__global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = xcount*gridDim.x*blockDim.x*TILE_SIZE_X+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] /= PetscRealPart(b[i]);
  }
}
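/*
   Illustrative usage sketch, not part of the SLEPc sources: a minimal driver showing how
   SetGrid1D and the kernels above fit together, assuming d_a and d_b are device arrays
   of length n. Each launch covers gridDim.x*blockDim.x*TILE_SIZE_X entries, so the
   launch index is passed to the kernel to offset into the arrays.
*/
static PetscErrorCode PointwiseMultOnDevice_Sketch(PetscScalar *d_a,const PetscScalar *d_b,PetscInt n)
{
  PetscErrorCode ierr;
  PetscInt       i,xcount;
  dim3           blocks3d,threads3d;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = SetGrid1D(n,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
  for (i=0;i<xcount;i++) {
    PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_a,d_b,n);
  }
  cerr = cudaGetLastError();CHKERRCUDA(cerr);   /* check for launch errors */
  PetscFunctionReturn(0);
}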
/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscErrorCode    ierr;
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d,threads3d;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(0);
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (inverse) {
      for (i=0;i<xcount;i++) {
        PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    } else {
      for (i=0;i<xcount;i++) {
        PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    }
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = VecGetArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    ierr = VecRestoreArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    ierr = PetscLogFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       hh;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = BV_SafeSqrt(bv,hh,beta);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = BV_SafeSqrt(bv,h[bv->nc+j],beta);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*
   BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h,*d_a;
  PetscInt          i;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    cerr = cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(0);
}
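/*
   Illustrative usage sketch, not part of the SLEPc sources: every helper in this file
   dispatches on whether a host coefficients array h is supplied. Passing h=NULL selects
   the device path, which operates directly on bv->buffer (and bv->omega where
   applicable); a non-NULL h keeps the whole computation on the host. The hypothetical
   caller below works in either mode.
*/
static PetscErrorCode CleanAndSetDiagonal_Sketch(BV bv,PetscInt j,PetscScalar *h,PetscScalar alpha)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = BV_CleanCoefficients_CUDA(bv,j,h);CHKERRQ(ierr);   /* zero the first nc+j coefficients of column j */
  ierr = BV_SetValue_CUDA(bv,j,j,h,alpha);CHKERRQ(ierr);    /* set row j (after constraints) of column j */
  PetscFunctionReturn(0);
}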