Actual source code: bvorthogcuda.cu

slepc-3.12.0 2019-09-30
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV orthogonalization routines (CUDA)
 12: */

 14: #include <slepc/private/bvimpl.h>          /*I   "slepcbv.h"   I*/
 15: #include <slepcblaslapack.h>
 16: #include <petsccublas.h>

 18: /*
 19:    BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
 20: */
 21: PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
 22: {
 24:   PetscScalar    *d_hh,*d_a;
 25:   PetscInt       i;

 28:   if (!h) {
 29:     VecCUDAGetArray(bv->buffer,&d_a);
 30:     d_hh = d_a + j*(bv->nc+bv->m);
 31:     cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));
 32:     WaitForGPU();CHKERRCUDA(ierr);
 33:     VecCUDARestoreArray(bv->buffer,&d_a);
 34:   } else { /* cpu memory */
 35:     for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
 36:   }
 37:   return(0);
 38: }

 40: /*
 41:    BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
 42:    into column j of the bv buffer
 43:  */
 44: PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
 45: {
 47:   PetscScalar    *d_h,*d_c,sone=1.0;
 48:   PetscInt       i;
 49:   PetscBLASInt   one=1;
 50:   cublasStatus_t cberr;
 51:   cublasHandle_t cublasv2handle;

 54:   if (!h) {
 55:     PetscCUBLASGetHandle(&cublasv2handle);
 56:     VecCUDAGetArray(bv->buffer,&d_c);
 57:     d_h = d_c + j*(bv->nc+bv->m);
 58:     PetscLogGpuTimeBegin();
 59:     cberr = cublasXaxpy(cublasv2handle,bv->nc+j,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);
 60:     PetscLogGpuTimeEnd();
 61:     PetscLogGpuFlops(1.0*bv->nc+j);
 62:     WaitForGPU();CHKERRCUDA(ierr);
 63:     VecCUDARestoreArray(bv->buffer,&d_c);
 64:   } else { /* cpu memory */
 65:     for (i=0;i<bv->nc+j;i++) h[i] += c[i];
 66:     PetscLogFlops(1.0*bv->nc+j);
 67:   }
 68:   return(0);
 69: }

 71: /*
 72:    BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
 73:    of the coefficients array
 74: */
 75: PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
 76: {
 78:   PetscScalar    *d_h,*a;
 79:   cudaError_t    cerr;

 82:   if (!h) {
 83:     VecCUDAGetArray(bv->buffer,&a);
 84:     d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
 85:     cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 86:     PetscLogCpuToGpu(sizeof(PetscScalar));
 87:     WaitForGPU();CHKERRCUDA(ierr);
 88:     VecCUDARestoreArray(bv->buffer,&a);
 89:   } else { /* cpu memory */
 90:     h[bv->nc+j] = value;
 91:   }
 92:   return(0);
 93: }

 95: /*
 96:    BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
 97:    coefficients array (up to position j)
 98: */
 99: PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
100: {
101:   PetscErrorCode    ierr;
102:   const PetscScalar *d_h;
103:   PetscScalar       dot;
104:   PetscInt          i;
105:   PetscBLASInt      one=1;
106:   cublasStatus_t    cberr;
107:   cublasHandle_t    cublasv2handle;

110:   if (!h) {
111:     PetscCUBLASGetHandle(&cublasv2handle);
112:     VecCUDAGetArrayRead(bv->buffer,&d_h);
113:     PetscLogGpuTimeBegin();
114:     cberr = cublasXdotc(cublasv2handle,bv->nc+j,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
115:     PetscLogGpuTimeEnd();
116:     PetscLogGpuFlops(2.0*bv->nc+j);
117:     WaitForGPU();CHKERRCUDA(ierr);
118:     *sum = PetscRealPart(dot);
119:     VecCUDARestoreArrayRead(bv->buffer,&d_h);
120:   } else { /* cpu memory */
121:     *sum = 0.0;
122:     for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
123:     PetscLogFlops(2.0*bv->nc+j);
124:   }
125:   return(0);
126: }

128: #define X_AXIS        0
129: #define BLOCK_SIZE_X 64
130: #define TILE_SIZE_X  16 /* work to be done by any thread on axis x */

132: /*
133:    Set the kernels grid dimensions
134:    xcount: number of kernel calls needed for the requested size
135:  */
136: PetscErrorCode SetGrid1D(PetscInt n, dim3 *dimGrid, dim3 *dimBlock,PetscInt *xcount)
137: {
138:   PetscInt              one=1,card;
139:   struct cudaDeviceProp devprop;
140:   cudaError_t           cerr;

143:   *xcount = 1;
144:   if (n>BLOCK_SIZE_X) {
145:     dimBlock->x = BLOCK_SIZE_X;
146:     dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/BLOCK_SIZE_X*TILE_SIZE_X;
147:   } else {
148:     dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
149:     dimGrid->x = one;
150:   }
151:   cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
152:   cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
153:   if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
154:     *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
155:     dimGrid->x = devprop.maxGridSize[X_AXIS];
156:   }
157:   return(0);
158: }

160: /* pointwise multiplication */
161: __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
162: {
163:   PetscInt i,x;

165:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
166:   for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
167:     a[i] *= PetscRealPart(b[i]);
168:   }
169: }

171: /* pointwise division */
172: __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
173: {
174:   PetscInt i,x;

176:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
177:   for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
178:     a[i] /= PetscRealPart(b[i]);
179:   }
180: }

182: /*
183:    BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
184:    the contents of the coefficients array (up to position j) and omega is the signature;
185:    if inverse=TRUE then the operation is h/omega
186: */
187: PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
188: {
189:   PetscErrorCode    ierr;
190:   PetscScalar       *d_h;
191:   const PetscScalar *d_omega,*omega;
192:   PetscInt          i,xcount;
193:   dim3              blocks3d, threads3d;
194:   cudaError_t       cerr;

197:   if (!(bv->nc+j)) return(0);
198:   if (!h) {
199:     VecCUDAGetArray(bv->buffer,&d_h);
200:     VecCUDAGetArrayRead(bv->omega,&d_omega);
201:     SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);
202:     PetscLogGpuTimeBegin();
203:     if (inverse) {
204:       for (i=0;i<xcount;i++) {
205:         PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
206:       }
207:     } else {
208:       for (i=0;i<xcount;i++) {
209:         PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
210:       }
211:     }
212:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
213:     PetscLogGpuTimeEnd();
214:     PetscLogGpuFlops(1.0*bv->nc+j);
215:     WaitForGPU();CHKERRCUDA(ierr);
216:     VecCUDARestoreArrayRead(bv->omega,&d_omega);
217:     VecCUDARestoreArray(bv->buffer,&d_h);
218:   } else {
219:     VecGetArrayRead(bv->omega,&omega);
220:     if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
221:     else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
222:     VecRestoreArrayRead(bv->omega,&omega);
223:     PetscLogFlops(1.0*bv->nc+j);
224:   }
225:   return(0);
226: }

228: /*
229:    BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
230:    of the coefficients array
231: */
232: PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
233: {
234:   PetscErrorCode    ierr;
235:   const PetscScalar *d_h;
236:   PetscScalar       hh;
237:   cudaError_t       cerr;

240:   if (!h) {
241:     VecCUDAGetArrayRead(bv->buffer,&d_h);
242:     cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
243:     PetscLogGpuToCpu(sizeof(PetscScalar));
244:     WaitForGPU();CHKERRCUDA(ierr);
245:     BV_SafeSqrt(bv,hh,beta);
246:     VecCUDARestoreArrayRead(bv->buffer,&d_h);
247:   } else {
248:     BV_SafeSqrt(bv,h[bv->nc+j],beta);
249:   }
250:   return(0);
251: }

253: /*
254:    BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
255:    provided by the caller (only values from l to j are copied)
256: */
257: PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
258: {
259:   PetscErrorCode    ierr;
260:   const PetscScalar *d_h,*d_a;
261:   PetscInt          i;
262:   cudaError_t       cerr;

265:   if (!h) {
266:     VecCUDAGetArrayRead(bv->buffer,&d_a);
267:     d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
268:     cerr = cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
269:     PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar));
270:     WaitForGPU();CHKERRCUDA(ierr);
271:     VecCUDARestoreArrayRead(bv->buffer,&d_a);
272:   } else {
273:     for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
274:   }
275:   return(0);
276: }