Actual source code: fnutilcuda.cu
slepc-3.15.1 2021-05-28
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several impls
*/

#include <petscsys.h>
#include "fnutilcuda.h"

/* Kernel: zero the off-diagonal entries of the n x n array d_pa (column-major,
   leading dimension ld) and scale each diagonal entry by v.  Each thread
   handles one row index x; the TILE_SIZE_X factor in the index computation
   assumes TILE_SIZE_X == 1, since no tile loop follows. */
__global__ void clean_offdiagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x,j;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;

  if (x<n) {
    for (j=0;j<n;j++) {
      if (j != x) d_pa[x+j*ld] = 0.0;
      else d_pa[x+j*ld] = d_pa[x+j*ld]*v;
    }
  }
}

/* Zero the off-diagonal part of d_pa on the GPU, launching the kernel once per
   grid chunk when n exceeds the device's maximum grid size */
__host__ PetscErrorCode clean_offdiagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  /* XXX use 2D TBD */
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    clean_offdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  return(0);
}
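
/* Usage sketch (illustrative, not part of the library; kept out of the build
   with #if 0).  The size n, leading dimension and scaling factor are
   assumptions chosen for the example. */
#if 0
static PetscErrorCode example_clean_offdiagonal(void)
{
  const PetscInt n = 8;
  PetscScalar    *d_pa;
  cudaError_t    cerr;

  /* allocate an n x n column-major array on the device and zero it */
  cerr = cudaMalloc((void**)&d_pa,sizeof(PetscScalar)*n*n);CHKERRCUDA(cerr);
  cerr = cudaMemset(d_pa,0,sizeof(PetscScalar)*n*n);CHKERRCUDA(cerr);
  /* ... fill d_pa with data ... */
  clean_offdiagonal(n,d_pa,n,2.0);  /* keep 2*diag(A), zero the rest */
  cerr = cudaFree(d_pa);CHKERRCUDA(cerr);
  return(0);
}
#endif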

/* Kernel: set every diagonal entry of d_pa to the scalar v */
__global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = v;
  }
}

/* Set the diagonal of d_pa to the scalar v on the GPU */
__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    set_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  return(0);
}

/* Kernel: set every diagonal entry of the complex array d_pa to vr+vi*i */
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = thrust::complex<PetscReal>(vr, vi);
  }
}

/* Set the diagonal of the complex array d_pa to vr+vi*i on the GPU; the real
   and imaginary parts are passed as separate PetscReal arguments */
__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    set_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  return(0);
}

/* Kernel: add the scalar v to every diagonal entry of d_pa */
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += v;
  }
}

/* Shift the diagonal of d_pa by the scalar v on the GPU */
__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  return(0);
}

/* Kernel: add vr+vi*i to every diagonal entry of the complex array d_pa */
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += thrust::complex<PetscReal>(vr, vi);
  }
}

/* Shift the diagonal of the complex array d_pa by vr+vi*i on the GPU */
__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  return(0);
}
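
/* Usage sketch (illustrative, not part of the library): the real and
   imaginary parts of a complex diagonal value travel as two PetscReal
   arguments, so setting the diagonal to 1+2i and then shifting it by -1
   looks as follows (d_pa is assumed to be a preallocated device array). */
#if 0
static PetscErrorCode example_complex_diagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld)
{
  set_Cdiagonal(n,d_pa,ld,1.0,2.0);    /* diag(A) = 1 + 2i       */
  shift_Cdiagonal(n,d_pa,ld,-1.0,0.0); /* diag(A) += -1  ->  2i  */
  return(0);
}
#endif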

/* Kernel: copy the m x n scalar array d_pb (leading dimension ldb) into the
   complex array d_pa (leading dimension lda); each thread copies a
   TILE_SIZE_X x TILE_SIZE_Y tile */
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] = d_pb[i+j*ldb];
    }
  }
}

/* Copy a scalar m x n array into a complex one on the GPU, tiling the
   launches in both dimensions if the grid limits are exceeded */
__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  return(0);
}

/* Kernel: copy the real parts of the m x n complex array d_pb into the scalar
   array d_pa; imaginary parts are discarded */
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);
    }
  }
}

/* Copy the real parts of a complex m x n array into a scalar one on the GPU */
__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  return(0);
}
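
/* Usage sketch (illustrative, not part of the library): in a real-scalar
   build the two copy routines form a real->complex->real round trip, with
   copy_array2D_C2S keeping only the real parts.  d_ps (scalar) and d_pc
   (complex) are assumed to be preallocated m x n device arrays with leading
   dimension m. */
#if 0
static PetscErrorCode example_roundtrip(PetscInt m,PetscInt n,PetscScalar *d_ps,PetscComplex *d_pc)
{
  copy_array2D_S2C(m,n,d_pc,m,d_ps,m);  /* widen to complex          */
  /* ... complex arithmetic on d_pc ... */
  copy_array2D_C2S(m,n,d_ps,m,d_pc,m);  /* take real parts back      */
  return(0);
}
#endif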

/* Kernel: A(i,j) += conj(A(i,j)), so each entry of d_pa becomes twice its
   real part */
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);
    }
  }
}

/* Add the elementwise conjugate of d_pa to itself on the GPU, leaving
   2*Re(A) */
__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  return(0);
}

/* Kernel: set *d_result to PETSC_FALSE if any entry of d_pa has a nonzero
   imaginary part.  Several threads may write PETSC_FALSE concurrently, but
   the race is benign since they all store the same value. */
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  if (*d_result) {
    for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
      for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
      }
    }
  }
}

/* Check on the GPU whether the m x n complex array d_pa is real; d_result
   must point to device memory and holds the answer after the call */
__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  PetscBool   result=PETSC_TRUE;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  cerr = cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  return(0);
}
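
/* Usage sketch (illustrative, not part of the library): the result flag lives
   in device memory and must be copied back to the host after the call. */
#if 0
static PetscErrorCode example_getisreal(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscBool   result,*d_result;
  cudaError_t cerr;

  cerr = cudaMalloc((void**)&d_result,sizeof(PetscBool));CHKERRCUDA(cerr);
  getisreal_array2D(m,n,d_pa,lda,d_result);
  cerr = cudaMemcpy(&result,d_result,sizeof(PetscBool),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = cudaFree(d_result);CHKERRCUDA(cerr);
  /* result == PETSC_TRUE iff all imaginary parts are exactly zero */
  return(0);
}
#endif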

/* Disabled draft of a diagonal-product kernel (a parallel reduction).  As
   written it would not compile or run correctly: dynamic shared memory must
   be declared as an array (extern __shared__ T shrdres[];), the initial load
   should index shared memory with threadIdx.x rather than x, the launch
   lacks a dynamic shared-memory size, and the template argument must be a
   compile-time constant, not threads3d.x. */
//template <class T, unsigned int bs>
//__global__ void mult_diagonal_kernel(T *d_pa,PetscInt n,PetscInt ld,T *d_v,PetscInt xcount)
//{
//  PetscInt x;
//  extern __shared__ T *shrdres;
//
//  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
//
//  if (x<n) {
//    shrdres[x] = d_pa[x+ld*x];
//    __syncthreads();
//
//    /* reduction */
//    if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
//    if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
//    if ((bs >= 128) && (threadIdx.x <  64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  64]; } __syncthreads();
//    if ((bs >=  64) && (threadIdx.x <  32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  32]; } __syncthreads();
//    if ((bs >=  32) && (threadIdx.x <  16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  16]; } __syncthreads();
//    if ((bs >=  16) && (threadIdx.x <   8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   8]; } __syncthreads();
//    if ((bs >=   8) && (threadIdx.x <   4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   4]; } __syncthreads();
//    if ((bs >=   4) && (threadIdx.x <   2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   2]; } __syncthreads();
//    if ((bs >=   2) && (threadIdx.x <   1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   1]; } __syncthreads();
//
//    if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
//  }
//}
//
//__host__ PetscErrorCode mult_diagonal(PetscScalar *d_pa,PetscInt n,PetscInt ld,PetscScalar *v)
//{
//  PetscInt    i,j,dimGrid_xcount;
//  PetscScalar *part,*d_part;
//  dim3        blocks3d,threads3d;
//  cudaError_t cerr;
//
//  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
//  cerr = cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x);CHKERRCUDA(cerr);
//  PetscMalloc1(blocks3d.x,&part);
//  for (i=0;i<dimGrid_xcount;i++) {
//    mult_diagonal_kernel<threads3d.x><<<blocks3d, threads3d>>>(d_pa,n,ld,d_part,i);
//    cerr = cudaGetLastError();CHKERRCUDA(cerr);
//
//    cerr = cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
//    if (i == 0) {
//      *v = part[0];
//      j = 1;
//    } else {
//      j = 0;
//    }
//    for (; j<blocks3d.x; j++) {
//      *v *= part[j];
//    }
//  }
//  cerr = cudaFree(d_part);CHKERRCUDA(cerr);
//  PetscFree(part);
//  return(0);
//}
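
/* Hedged sketch of the intended reduction (an assumption, not SLEPc code):
   a fixed, power-of-two block size avoids the runtime template argument,
   static shared memory avoids the extern declaration problem, and the
   reduction indexes shared memory with threadIdx.x.  Out-of-range threads
   contribute 1, the multiplicative identity.  Launch with blockDim.x ==
   MULT_DIAG_BS and multiply the per-block partial products on the host. */
#if 0
#define MULT_DIAG_BS 256
__global__ void mult_diagonal_sketch_kernel(PetscScalar *d_pa,PetscInt n,PetscInt ld,PetscScalar *d_v)
{
  __shared__ PetscScalar shrdres[MULT_DIAG_BS];
  PetscInt x = blockIdx.x*blockDim.x+threadIdx.x;

  shrdres[threadIdx.x] = (x<n)? d_pa[x+x*ld] : (PetscScalar)1.0;
  __syncthreads();
  /* tree reduction: halve the number of active threads each step */
  for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
    if (threadIdx.x<s) shrdres[threadIdx.x] *= shrdres[threadIdx.x+s];
    __syncthreads();
  }
  if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[0];  /* per-block partial product */
}
#endif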

/* Compute a 1D launch configuration covering `rows` elements: blocks of
   BLOCK_SIZE_X threads, each thread covering TILE_SIZE_X elements.  If the
   required grid exceeds the device's maxGridSize limit, dimGrid_xcount
   reports how many sequential launches are needed. */
__host__ PetscErrorCode get_params_1D(PetscInt rows,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount)
{
  int                   card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);

  *dimGrid_xcount = 1;

  // X axis
  dimGrid->x = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows>(BLOCK_SIZE_X*TILE_SIZE_X)) {
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  return(0);
}
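
/* Worked example (illustrative numbers; assumes BLOCK_SIZE_X = 256 and
   TILE_SIZE_X = 1 for the sake of the arithmetic):
     rows = 1000 -> dimBlock->x = 256, dimGrid->x = ceil(1000/256) = 4,
                    dimGrid_xcount = 1 (a single kernel launch suffices);
     rows = 100  -> dimGrid->x = 1, dimBlock->x = 100.
   dimGrid_xcount only exceeds 1 when dimGrid->x would surpass the device's
   maxGridSize limit, in which case the callers above loop over that many
   launches. */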

/* Compute a 2D launch configuration covering a rows x cols array, applying
   the same logic as get_params_1D independently in each dimension;
   dimGrid_xcount and dimGrid_ycount give the number of sequential launches
   needed along each axis */
__host__ PetscErrorCode get_params_2D(PetscInt rows,PetscInt cols,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount,PetscInt *dimGrid_ycount)
{
  int                   card;
  cudaError_t           cerr;
  struct cudaDeviceProp devprop;

  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);

  *dimGrid_xcount = *dimGrid_ycount = 1;

  // X axis
  dimGrid->x = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows > (BLOCK_SIZE_X*TILE_SIZE_X)) {
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }

  // Y axis
  dimGrid->y = 1;
  dimBlock->y = BLOCK_SIZE_Y;
  if (cols>(BLOCK_SIZE_Y*TILE_SIZE_Y)) {
    dimGrid->y = (cols+((BLOCK_SIZE_Y*TILE_SIZE_Y)-1))/(BLOCK_SIZE_Y*TILE_SIZE_Y);
  } else {
    dimBlock->y = (cols+(TILE_SIZE_Y-1))/TILE_SIZE_Y;
  }

  if (dimGrid->y>(unsigned)devprop.maxGridSize[Y_AXIS]) {
    *dimGrid_ycount = (dimGrid->y+(devprop.maxGridSize[Y_AXIS]-1))/devprop.maxGridSize[Y_AXIS];
    dimGrid->y = devprop.maxGridSize[Y_AXIS];
  }
  return(0);
}