Actual source code: fnutilcuda.cu
slepc-3.17.0 2022-03-31
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2022, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Utility subroutines common to several impls
12: */
14: #include <petscsys.h>
15: #include "fnutilcuda.h"
17: __global__ void clean_offdiagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
18: {
19: PetscInt x,j;
20: x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
22: if (x<n) {
23: for (j=0;j<n;j++) {
24: if (j != x) d_pa[x+j*ld] = 0.0;
25: else d_pa[x+j*ld] = d_pa[x+j*ld]*v;
26: }
27: }
28: }
/*
   clean_offdiagonal - Zero all off-diagonal entries of the n x n device matrix
   d_pa (column-major, leading dimension ld) and scale its diagonal by v.

   The work is split into dimGrid_xcount passes when n exceeds the maximum
   grid size reported by the device (see get_params_1D).
*/
__host__ PetscErrorCode clean_offdiagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  /* XXX use 2D TBD */
  PetscInt       i,dimGrid_xcount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    clean_offdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: write the value v on the diagonal of the n x n column-major matrix
   d_pa (leading dimension ld). One thread per diagonal entry; xcount is the
   pass number when the launcher needs several grid launches to cover n.
*/
__global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  const PetscInt row = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (row >= n) return;  /* grid tail guard */
  d_pa[row+row*ld] = v;
}
/*
   set_diagonal - Set every diagonal entry of the n x n device matrix d_pa
   (column-major, leading dimension ld) to the value v.

   Work is split into dimGrid_xcount passes when n exceeds the device's
   maximum grid size (see get_params_1D).
*/
__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt       i,dimGrid_xcount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    set_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: write the complex value (vr + i*vi) on the diagonal of the n x n
   column-major matrix d_pa (leading dimension ld). One thread per diagonal
   entry; xcount is the pass number for multi-launch coverage of n.
*/
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  const PetscInt row = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (row >= n) return;  /* grid tail guard */
  d_pa[row+row*ld] = thrust::complex<PetscReal>(vr, vi);
}
/*
   set_Cdiagonal - Set every diagonal entry of the n x n complex device matrix
   d_pa (column-major, leading dimension ld) to vr + i*vi.

   Work is split into dimGrid_xcount passes when n exceeds the device's
   maximum grid size (see get_params_1D).
*/
__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt       i,dimGrid_xcount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    set_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: add the scalar v to every diagonal entry of the n x n column-major
   matrix d_pa (leading dimension ld). One thread per diagonal entry; xcount
   is the pass number for multi-launch coverage of n.
*/
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  const PetscInt row = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (row >= n) return;  /* grid tail guard */
  d_pa[row+row*ld] += v;
}
/*
   shift_diagonal - Add the scalar v to every diagonal entry of the n x n
   device matrix d_pa (column-major, leading dimension ld).

   Work is split into dimGrid_xcount passes when n exceeds the device's
   maximum grid size (see get_params_1D).
*/
__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt       i,dimGrid_xcount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: add the complex value (vr + i*vi) to every diagonal entry of the
   n x n column-major matrix d_pa (leading dimension ld). One thread per
   diagonal entry; xcount is the pass number for multi-launch coverage of n.
*/
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  const PetscInt row = xcount*gridDim.x*blockDim.x + blockIdx.x*blockDim.x + threadIdx.x;

  if (row >= n) return;  /* grid tail guard */
  d_pa[row+row*ld] += thrust::complex<PetscReal>(vr, vi);
}
/*
   shift_Cdiagonal - Add vr + i*vi to every diagonal entry of the n x n
   complex device matrix d_pa (column-major, leading dimension ld).

   Work is split into dimGrid_xcount passes when n exceeds the device's
   maximum grid size (see get_params_1D).
*/
__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt       i,dimGrid_xcount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: copy the m x n real/scalar matrix d_pb (leading dimension ldb) into
   the complex matrix d_pa (leading dimension lda), element by element. Each
   thread handles a TILE_SIZE_X x TILE_SIZE_Y tile; xcount/ycount are the pass
   numbers when the launcher needs several grid launches per axis.
*/
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  /* top-left corner of this thread's tile (algebraically identical to the
     usual xcount*gridDim + block/thread*TILE indexing) */
  const PetscInt row0 = xcount*gridDim.x*blockDim.x + (blockIdx.x*blockDim.x + threadIdx.x)*TILE_SIZE_X;
  const PetscInt col0 = ycount*gridDim.y*blockDim.y + (blockIdx.y*blockDim.y + threadIdx.y)*TILE_SIZE_Y;

  for (PetscInt r=row0; r<row0+TILE_SIZE_X && r<m; r++) {
    for (PetscInt c=col0; c<col0+TILE_SIZE_Y && c<n; c++) {
      d_pa[r+c*lda] = d_pb[r+c*ldb];
    }
  }
}
/*
   copy_array2D_S2C - Copy the m x n scalar device matrix d_pb (leading
   dimension ldb) into the complex device matrix d_pa (leading dimension lda).

   Work is split into dimGrid_xcount x dimGrid_ycount passes when the problem
   exceeds the device's maximum grid size (see get_params_2D).
*/
__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt       i,j,dimGrid_xcount,dimGrid_ycount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: copy the real part of the m x n complex matrix d_pb (leading
   dimension ldb) into the scalar matrix d_pa (leading dimension lda). Each
   thread handles a TILE_SIZE_X x TILE_SIZE_Y tile; xcount/ycount are the pass
   numbers when the launcher needs several grid launches per axis.
*/
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  /* top-left corner of this thread's tile */
  const PetscInt row0 = xcount*gridDim.x*blockDim.x + (blockIdx.x*blockDim.x + threadIdx.x)*TILE_SIZE_X;
  const PetscInt col0 = ycount*gridDim.y*blockDim.y + (blockIdx.y*blockDim.y + threadIdx.y)*TILE_SIZE_Y;

  for (PetscInt r=row0; r<row0+TILE_SIZE_X && r<m; r++) {
    for (PetscInt c=col0; c<col0+TILE_SIZE_Y && c<n; c++) {
      d_pa[r+c*lda] = PetscRealPartComplex(d_pb[r+c*ldb]);  /* imaginary part is discarded */
    }
  }
}
/*
   copy_array2D_C2S - Copy the real part of the m x n complex device matrix
   d_pb (leading dimension ldb) into the scalar device matrix d_pa (leading
   dimension lda); imaginary parts are discarded.

   Work is split into dimGrid_xcount x dimGrid_ycount passes when the problem
   exceeds the device's maximum grid size (see get_params_2D).
*/
__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt       i,j,dimGrid_xcount,dimGrid_ycount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: in-place update a(i,j) += conj(a(i,j)) on the m x n complex matrix
   d_pa (leading dimension lda), i.e. each entry becomes twice its real part.
   Each thread handles a TILE_SIZE_X x TILE_SIZE_Y tile; xcount/ycount are the
   pass numbers when the launcher needs several grid launches per axis.
*/
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  /* top-left corner of this thread's tile */
  const PetscInt row0 = xcount*gridDim.x*blockDim.x + (blockIdx.x*blockDim.x + threadIdx.x)*TILE_SIZE_X;
  const PetscInt col0 = ycount*gridDim.y*blockDim.y + (blockIdx.y*blockDim.y + threadIdx.y)*TILE_SIZE_Y;

  for (PetscInt r=row0; r<row0+TILE_SIZE_X && r<m; r++) {
    for (PetscInt c=col0; c<col0+TILE_SIZE_Y && c<n; c++) {
      d_pa[r+c*lda] += PetscConj(d_pa[r+c*lda]);
    }
  }
}
/*
   add_array2D_Conj - In-place update a(i,j) += conj(a(i,j)) on the m x n
   complex device matrix d_pa (leading dimension lda), leaving each entry
   equal to twice its real part.

   Work is split into dimGrid_xcount x dimGrid_ycount passes when the problem
   exceeds the device's maximum grid size (see get_params_2D).
*/
__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt       i,j,dimGrid_xcount,dimGrid_ycount;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(0);
}
/*
   Kernel: scan a TILE_SIZE_X x TILE_SIZE_Y tile of the m x n complex matrix
   d_pa (leading dimension lda) and lower *d_result to PETSC_FALSE if any
   entry has a nonzero imaginary part. *d_result must be initialized to
   PETSC_TRUE by the caller before the first launch.

   NOTE(review): multiple threads may write *d_result concurrently without
   atomics; this is benign only because every writer stores the same value
   (PETSC_FALSE). The initial `if (*d_result)` read is an optimization to
   skip work once falseness is (eventually) visible, not a guarantee.
*/
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* top-left corner of this thread's tile; xcount/ycount offset past
     previously launched grids */
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  if (*d_result) {
    for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
      for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
      }
    }
  }
}
/*
   getisreal_array2D - Test on the device whether every entry of the m x n
   complex matrix d_pa (leading dimension lda) has zero imaginary part.

   The device flag d_result is set to PETSC_TRUE, then kernels lower it to
   PETSC_FALSE if any entry is non-real. The caller is responsible for
   copying *d_result back to the host (after synchronization) to read it.
*/
__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt       i,j,dimGrid_xcount,dimGrid_ycount;
  PetscBool      result=PETSC_TRUE;
  dim3           blocks3d,threads3d;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* initialize the device-side flag before any kernel reads it */
  cerr = cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);CHKERRQ(ierr);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);  /* catch launch-configuration errors */
    }
  }
  PetscFunctionReturn(0);
}
252: //template <class T, unsigned int bs>
253: //__global__ void mult_diagonal_kernel(T *d_pa,PetscInt n,PetscInt ld,T *d_v,PetscInt xcount)
254: //{
255: // PetscInt x;
256: // extern __shared__ T *shrdres;
257: //
258: // x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
259: //
260: // if (x<n) {
261: // shrdres[x] = d_pa[x+ld*x];
262: // __syncthreads();
263: //
264: // /* reduction */
265: // if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
266: // if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
267: // if ((bs >= 128) && (threadIdx.x < 64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 64]; } __syncthreads();
268: // if ((bs >= 64) && (threadIdx.x < 32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 32]; } __syncthreads();
269: // if ((bs >= 32) && (threadIdx.x < 16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 16]; } __syncthreads();
270: // if ((bs >= 16) && (threadIdx.x < 8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 8]; } __syncthreads();
271: // if ((bs >= 8) && (threadIdx.x < 4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 4]; } __syncthreads();
272: // if ((bs >= 4) && (threadIdx.x < 2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 2]; } __syncthreads();
273: // if ((bs >= 2) && (threadIdx.x < 1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 1]; } __syncthreads();
274: //
275: // if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
276: // }
277: //
278: //}
279: //
280: //__host__ PetscErrorCode mult_diagonal(PetscScalar *d_pa,PetscInt n, PetscInt ld,PetscScalar *v)
281: //{
282: // PetscInt i,j,dimGrid_xcount;
283: // PetscScalar *part,*d_part;
284: // dim3 blocks3d,threads3d;
285: //
286: // get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
287: // cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x);
288: // PetscMalloc1(blocks3d.x,&part);
289: // for (i=0;i<dimGrid_xcount;i++) {
290: // mult_diagonal_kernel<threads3d.x><<<blocks3d, threads3d>>>(d_pa,n,ld,d_part,i);
291: // cudaGetLastError();
292: //
293: // cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
294: // if (i == 0) {
295: // *v = part[0];
296: // j=1;
297: // } else {
298: // j=0;
299: // }
300: // for (; j<blocks3d.x; j++) {
301: // *v *= part[j];
302: // }
303: // }
304: // cudaFree(d_part);
305: // PetscFree(part);
306: // PetscFunctionReturn(0);
307: //}
/*
   get_params_1D - Compute a 1D launch configuration for `rows` work items.

   Output Parameters:
     dimGrid        - grid dimensions (x axis sized for rows, capped by device limit)
     dimBlock       - block dimensions (BLOCK_SIZE_X, shrunk when rows are few)
     dimGrid_xcount - number of sequential launches needed when the required
                      grid exceeds the device's maxGridSize[X_AXIS]
*/
__host__ PetscErrorCode get_params_1D(PetscInt rows,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount)
{
  int                   card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  PetscFunctionBegin;
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);

  *dimGrid_xcount = 1;

  // X axis
  dimGrid->x = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows>(BLOCK_SIZE_X*TILE_SIZE_X)) {
    /* ceil-div: enough blocks to cover all rows */
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    /* a single block suffices; shrink it to the work size */
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    /* grid too large for the device: split into sequential launches */
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}
/*
   get_params_2D - Compute a 2D launch configuration for a rows x cols problem.

   Output Parameters:
     dimGrid        - grid dimensions (x sized for rows, y for cols, each
                      capped by the corresponding device limit)
     dimBlock       - block dimensions (BLOCK_SIZE_X x BLOCK_SIZE_Y, shrunk
                      along an axis when the work is small)
     dimGrid_xcount - sequential launches needed along x when the grid exceeds
                      maxGridSize[X_AXIS]
     dimGrid_ycount - likewise along y for maxGridSize[Y_AXIS]
*/
__host__ PetscErrorCode get_params_2D(PetscInt rows,PetscInt cols,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount,PetscInt *dimGrid_ycount)
{
  int                   card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  PetscFunctionBegin;
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);

  *dimGrid_xcount = *dimGrid_ycount = 1;

  // X axis
  dimGrid->x = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows > (BLOCK_SIZE_X*TILE_SIZE_X)) {
    /* ceil-div: enough blocks to cover all rows */
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    /* a single block suffices; shrink it to the work size */
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    /* grid too large for the device: split into sequential launches */
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }

  // Y axis
  dimGrid->y = 1;
  dimBlock->y = BLOCK_SIZE_Y;
  if (cols>(BLOCK_SIZE_Y*TILE_SIZE_Y)) {
    dimGrid->y = (cols+((BLOCK_SIZE_Y*TILE_SIZE_Y)-1))/(BLOCK_SIZE_Y*TILE_SIZE_Y);
  } else {
    dimBlock->y = (cols+(TILE_SIZE_Y-1))/TILE_SIZE_Y;
  }

  if (dimGrid->y>(unsigned)devprop.maxGridSize[Y_AXIS]) {
    *dimGrid_ycount = (dimGrid->y+(devprop.maxGridSize[Y_AXIS]-1))/devprop.maxGridSize[Y_AXIS];
    dimGrid->y = devprop.maxGridSize[Y_AXIS];
  }
  PetscFunctionReturn(0);
}