Actual source code: fnutilcuda.cu
slepc-3.18.3 2023-03-24
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several impls
*/

#include "fnutilcuda.h"
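/*
   Editorial note (inferred from the code below, not part of the original
   file): every kernel takes extra xcount/ycount arguments. The helpers
   SlepcKernelSetGrid1D and SlepcKernelSetGrid2DTiles may choose a grid
   smaller than the problem, in which case the host wrappers launch the
   kernel dimGrid_xcount (times dimGrid_ycount) times; xcount/ycount offset
   the global thread index so that successive launches cover successive
   chunks of size gridDim*blockDim.
*/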
/* Set every diagonal entry of the n x n device matrix d_pa (leading
   dimension ld) to v; xcount offsets the thread index when the work is
   split across several launches */
__global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) {
    d_pa[x+x*ld] = v;
  }
}
__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt i,dimGrid_xcount;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    set_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,v,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}
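/*
   Illustrative usage sketch (not part of the original file): build v*I in a
   freshly allocated n x n device buffer. The guard macro FNUTILCUDA_EXAMPLES
   and the helper name are hypothetical, added only for exposition.
*/
#if defined(FNUTILCUDA_EXAMPLES)
static PetscErrorCode example_identity(PetscInt n,PetscScalar v,PetscScalar **d_A)
{
  PetscFunctionBegin;
  PetscCallCUDA(cudaMalloc((void**)d_A,sizeof(PetscScalar)*n*n));
  PetscCallCUDA(cudaMemset(*d_A,0,sizeof(PetscScalar)*n*n));  /* zero all entries */
  PetscCall(set_diagonal(n,*d_A,n,v));                        /* then set diagonal = v */
  PetscFunctionReturn(0);
}
#endif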
/* Complex variant: set every diagonal entry to vr+vi*i */
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) {
    d_pa[x+x*ld] = thrust::complex<PetscReal>(vr,vi);
  }
}
__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt i,dimGrid_xcount;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    set_Cdiagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,vr,vi,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}
/* Add v to every diagonal entry, i.e. A <- A + v*I */
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) {
    d_pa[x+x*ld] += v;
  }
}
__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt i,dimGrid_xcount;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,v,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}
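/*
   Illustrative usage sketch (not part of the original file): apply a spectral
   shift A <- A - mu*I to a matrix already resident on the device; d_A, ld, n
   and mu are assumed to be set up by the caller, and the guard macro is
   hypothetical.
*/
#if defined(FNUTILCUDA_EXAMPLES)
static PetscErrorCode example_shift(PetscInt n,PetscScalar *d_A,PetscInt ld,PetscScalar mu)
{
  PetscFunctionBegin;
  PetscCall(shift_diagonal(n,d_A,ld,-mu));  /* subtract mu from each A(i,i) */
  PetscFunctionReturn(0);
}
#endif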
/* Complex variant: add vr+vi*i to every diagonal entry */
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) {
    d_pa[x+x*ld] += thrust::complex<PetscReal>(vr,vi);
  }
}
__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt i,dimGrid_xcount;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_Cdiagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,vr,vi,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}
/* Copy (widen) an m x n scalar array d_pb into the complex array d_pa; each
   thread starts at its global (x,y) position and strides through the array
   in steps of the block dimensions */
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = d_pb[i+j*ldb];
    }
  }
}
__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt i,j,dimGrid_xcount,dimGrid_ycount;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}
/* Copy a complex array d_pb into the scalar array d_pa, keeping only the
   real part of each entry */
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);
    }
  }
}
__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt i,j,dimGrid_xcount,dimGrid_ycount;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}
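/*
   Illustrative usage sketch (not part of the original file): move a real
   m x n array into a complex work buffer, operate on it in complex
   arithmetic, then copy the real part back. The work-buffer handling and
   the guard macro are hypothetical.
*/
#if defined(FNUTILCUDA_EXAMPLES)
static PetscErrorCode example_roundtrip(PetscInt m,PetscInt n,PetscScalar *d_A,PetscInt lda)
{
  PetscComplex *d_work;

  PetscFunctionBegin;
  PetscCallCUDA(cudaMalloc((void**)&d_work,sizeof(PetscComplex)*m*n));
  PetscCall(copy_array2D_S2C(m,n,d_work,m,d_A,lda));  /* widen to complex */
  /* ... stages requiring complex arithmetic would run here ... */
  PetscCall(copy_array2D_C2S(m,n,d_A,lda,d_work,m));  /* keep the real part */
  PetscCallCUDA(cudaFree(d_work));
  PetscFunctionReturn(0);
}
#endif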
/* Update each entry as A(i,j) += conj(A(i,j)), so every entry becomes twice
   its real part */
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);
    }
  }
}
__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt i,j,dimGrid_xcount,dimGrid_ycount;
  dim3     blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}
/* Scan the array and set *d_result to PETSC_FALSE if any entry has a nonzero
   imaginary part; the unsynchronized concurrent writes are benign because
   every writer stores the same value */
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  if (*d_result) {
    for (j=y;j<n;j+=blockDim.y) {
      for (i=x;i<m;i+=blockDim.x) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result = PETSC_FALSE;
      }
    }
  }
}
__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt  i,j,dimGrid_xcount,dimGrid_ycount;
  PetscBool result=PETSC_TRUE;
  dim3      blocks3d,threads3d;

  PetscFunctionBegin;
  /* initialize the device flag to PETSC_TRUE; the answer is left on the device */
  PetscCallCUDA(cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice));
  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}
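/*
   Illustrative sketch (not part of the original file): after a computation in
   complex arithmetic, check on the device whether the result is numerically
   real before truncating it back to PetscScalar. The one-element device flag,
   the function name, and the guard macro are hypothetical.
*/
#if defined(FNUTILCUDA_EXAMPLES)
static PetscErrorCode example_realcheck(PetscInt m,PetscInt n,PetscComplex *d_W,PetscInt ldw,PetscScalar *d_A,PetscInt lda)
{
  PetscBool *d_flag,flag;

  PetscFunctionBegin;
  PetscCallCUDA(cudaMalloc((void**)&d_flag,sizeof(PetscBool)));
  PetscCall(getisreal_array2D(m,n,d_W,ldw,d_flag));
  PetscCallCUDA(cudaMemcpy(&flag,d_flag,sizeof(PetscBool),cudaMemcpyDeviceToHost));
  PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_FP,"Result has nonzero imaginary part");
  PetscCall(copy_array2D_C2S(m,n,d_A,lda,d_W,ldw));  /* safe to drop imaginary part */
  PetscCallCUDA(cudaFree(d_flag));
  PetscFunctionReturn(0);
}
#endif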
/* Compute per-block products of diagonal entries: each thread loads one
   diagonal entry into shared memory (or the multiplicative identity 1.0 when
   its index is out of range) and a tree reduction multiplies them together */
__global__ void mult_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *d_v,PetscInt xcount)
{
  PetscInt     x;
  unsigned int bs=blockDim.x;
  __shared__ PetscScalar shrdres[SLEPC_BLOCK_SIZE_X];

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  shrdres[threadIdx.x] = (x<n)? d_pa[x+ld*x]: 1.0;
  __syncthreads();

  /* tree reduction in shared memory: at each step the active half of the
     block multiplies in the partial products held by the other half */
  if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
  if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
  if ((bs >= 128) && (threadIdx.x <  64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  64]; } __syncthreads();
  if ((bs >=  64) && (threadIdx.x <  32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  32]; } __syncthreads();
  if ((bs >=  32) && (threadIdx.x <  16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  16]; } __syncthreads();
  if ((bs >=  16) && (threadIdx.x <   8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   8]; } __syncthreads();
  if ((bs >=   8) && (threadIdx.x <   4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   4]; } __syncthreads();
  if ((bs >=   4) && (threadIdx.x <   2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   2]; } __syncthreads();
  if ((bs >=   2) && (threadIdx.x <   1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   1]; } __syncthreads();

  /* thread 0 now holds the block's product; store it for the host to combine */
  if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
}
__host__ PetscErrorCode mult_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *v)
{
  PetscInt    i,j,dimGrid_xcount;
  PetscScalar *part,*d_part;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  PetscCallCUDA(cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x));
  PetscCall(PetscMalloc1(blocks3d.x,&part));
  for (i=0;i<dimGrid_xcount;i++) {
    mult_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,d_part,i);
    PetscCallCUDA(cudaGetLastError());
    /* combine this launch's per-block partial products on the host */
    PetscCallCUDA(cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    if (i == 0) {
      *v = part[0];
      j = 1;
    } else j = 0;
    for (; j<(int)blocks3d.x; j++) *v *= part[j];
  }
  PetscCallCUDA(cudaFree(d_part));
  PetscCall(PetscFree(part));
  PetscFunctionReturn(0);
}
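/*
   Illustrative usage sketch (not part of the original file): for a triangular
   n x n factor stored on the device, the determinant is the product of the
   diagonal entries, which mult_diagonal accumulates across all blocks and
   launches. The function name and guard macro are hypothetical.
*/
#if defined(FNUTILCUDA_EXAMPLES)
static PetscErrorCode example_triangular_det(PetscInt n,PetscScalar *d_T,PetscInt ld,PetscScalar *det)
{
  PetscFunctionBegin;
  PetscCall(mult_diagonal(n,d_T,ld,det));  /* det(T) = prod_i T(i,i) for triangular T */
  PetscFunctionReturn(0);
}
#endif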