Skip to content

Commit 2651368

Browse files
authored
Merge pull request #804 from CEED/jed/cuda-register-block-size
make CUDA block sizes fit according to number of used registers
2 parents a784c50 + 39532ce commit 2651368

4 files changed

Lines changed: 121 additions & 33 deletions

File tree

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 87 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,81 @@ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
3232
return CEED_ERROR_SUCCESS;
3333
}
3434

35+
// Count the hardware thread slots on an SM that sit idle when each block
// packs `elems_per_block` elements of `threads_per_elem` threads apiece.
// The scheduler commits whole warps to a block, so a block's hardware
// footprint is its useful thread count rounded up to a warp multiple.
static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
                 int elems_per_block) {
  const int useful = threads_per_elem * elems_per_block;
  // Hardware block footprint: round useful threads up to a whole warp.
  const int hw_block = warp_size * ((useful + warp_size - 1) / warp_size);
  // How many such blocks fit concurrently on one SM.
  const int resident_blocks = threads_per_sm / hw_block;
  return threads_per_sm - resident_blocks * useful;
}
44+
45+
// Choose the least wasteful block size constrained by blocks_per_sm of
46+
// max_threads_per_block.
47+
//
48+
// The x and y part of block[] contains per-element sizes (specified on input)
49+
// while the z part is number of elements.
50+
//
51+
// Problem setting: we'd like to make occupancy high with relatively few
52+
// inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
53+
// many threads can run.
54+
//
55+
// Note that full occupancy sometimes can't be achieved by one thread block. For
56+
// example, an SM might support 1536 threads in total, but only 1024 within a
57+
// single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a block
58+
// size of 768 so that two blocks can run, versus one block of 1024 will prevent
59+
// a second block from running. The cuda-gen kernels are pretty heavy with lots
60+
// of instruction-level parallelism (ILP) so we'll generally be okay with
61+
// relatvely low occupancy and smaller thread blocks, but we solve a reasonably
62+
// general problem here. Empirically, we find that blocks bigger than about 256
63+
// have higher latency and worse load balancing when the number of elements is
64+
// modest.
65+
//
66+
// cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of
67+
// the number of quadrature points (or number of basis functions). They also
68+
// have a lot of __syncthreads(), which is another point against excessively
69+
// large thread blocks. Suppose I have elements with 7x7x7 quadrature points.
70+
// This will loop over the last dimension, so we have 7*7=49 threads per
71+
// element. Suppose we have two elements = 2*49=98 useful threads. CUDA
72+
// schedules in units of full warps (32 threads), so 128 CUDA hardware threads
73+
// are effectively committed to that block. Now suppose
74+
// cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of
75+
// size 98 (196 useful threads using 256 hardware threads), but not a third
76+
// block (which would need a total of 384 hardware threads).
77+
//
78+
// If instead, we had packed 3 elements, we'd have 3*49=147 useful threads
79+
// occupying 160 slots, and could schedule two blocks. Alternatively, we could
80+
// pack a single block of 7 elements (2*49=343 useful threads) into the 354
81+
// slots. The latter has the least "waste", but __syncthreads()
82+
// over-synchronizes and it might not pay off relative to smaller blocks.
83+
static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
84+
int max_threads_per_block, int warp_size, int block[3], int *grid) {
85+
const int threads_per_sm = blocks_per_sm * max_threads_per_block;
86+
const int threads_per_elem = block[0] * block[1];
87+
int elems_per_block = 1;
88+
int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
89+
for (int i=2;
90+
i <= CeedIntMin(max_threads_per_block / threads_per_elem, nelem);
91+
i++) {
92+
int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
93+
// We want to minimize waste, but smaller kernels have lower latency and
94+
// less __syncthreads() overhead so when a larger block size has the same
95+
// waste as a smaller one, go ahead and prefer the smaller block.
96+
if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
97+
elems_per_block = i;
98+
waste = i_waste;
99+
}
100+
}
101+
block[2] = elems_per_block;
102+
*grid = (nelem + elems_per_block - 1) / elems_per_block;
103+
return CEED_ERROR_SUCCESS;
104+
}
105+
106+
// callback for cuOccupancyMaxPotentialBlockSize, providing the amount of
107+
// dynamic shared memory required for a thread block of size threads.
108+
static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
109+
35110
//------------------------------------------------------------------------------
36111
// Apply and add to output
37112
//------------------------------------------------------------------------------
@@ -40,6 +115,8 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
40115
int ierr;
41116
Ceed ceed;
42117
ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
118+
Ceed_Cuda *cuda_data;
119+
ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
43120
CeedOperator_Cuda_gen *data;
44121
ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
45122
CeedQFunction qf;
@@ -122,28 +199,16 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
122199
const CeedInt Q1d = data->Q1d;
123200
const CeedInt P1d = data->maxP1d;
124201
const CeedInt thread1d = CeedIntMax(Q1d, P1d);
125-
if (dim==1) {
126-
const CeedInt elemsPerBlock = 32;
127-
CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
128-
? 1 : 0 );
129-
CeedInt sharedMem = elemsPerBlock*thread1d*sizeof(CeedScalar);
130-
ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, 1,
131-
elemsPerBlock, sharedMem, opargs);
132-
} else if (dim==2) {
133-
const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
134-
CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
135-
? 1 : 0 );
136-
CeedInt sharedMem = elemsPerBlock*thread1d*thread1d*sizeof(CeedScalar);
137-
ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, thread1d,
138-
elemsPerBlock, sharedMem, opargs);
139-
} else if (dim==3) {
140-
const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
141-
CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
142-
? 1 : 0 );
143-
CeedInt sharedMem = elemsPerBlock*thread1d*thread1d*sizeof(CeedScalar);
144-
ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, thread1d,
145-
elemsPerBlock, sharedMem, opargs);
146-
}
202+
int max_threads_per_block, min_grid_size;
203+
CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
204+
&max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
205+
int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1,}, grid;
206+
CeedChkBackend(BlockGridCalculate(nelem,
207+
min_grid_size/ cuda_data->deviceProp.multiProcessorCount, max_threads_per_block,
208+
cuda_data->deviceProp.warpSize, block, &grid));
209+
CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
210+
ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],
211+
block[2], shared_mem, opargs);
147212
CeedChkBackend(ierr);
148213

149214
// Restore input arrays

backends/cuda/ceed-cuda-qfunction.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q,
4141
CeedInt numinputfields, numoutputfields;
4242
ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
4343
CeedChkBackend(ierr);
44-
const int blocksize = ceed_Cuda->optblocksize;
4544

4645
// Read vectors
4746
for (CeedInt i = 0; i < numinputfields; i++) {
@@ -63,8 +62,8 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q,
6362

6463
// Run kernel
6564
void *args[] = {&data->d_c, (void *) &Q, &data->fields};
66-
ierr = CeedRunKernelCuda(ceed, data->qFunction, CeedDivUpInt(Q, blocksize),
67-
blocksize, args); CeedChkBackend(ierr);
65+
ierr = CeedRunKernelAutoblockCuda(ceed, data->qFunction, Q, args);
66+
CeedChkBackend(ierr);
6867

6968
// Restore vectors
7069
for (CeedInt i = 0; i < numinputfields; i++) {

backends/cuda/ceed-cuda.c

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,37 @@ int CeedGetKernelCuda(Ceed ceed, CUmodule module, const char *name,
105105
return CEED_ERROR_SUCCESS;
106106
}
107107

108+
// Run kernel with block size selected automatically based on the kernel (which
109+
// may use enough registers to require a smaller block size than the hardware is
110+
// capable).
111+
int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points,
112+
void **args) {
113+
int min_grid_size, max_block_size;
114+
CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
115+
&max_block_size, kernel, NULL, 0, 0x10000));
116+
CeedChkBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(points,
117+
max_block_size), max_block_size, args));
118+
return 0;
119+
}
120+
108121
//------------------------------------------------------------------------------
109122
// Run CUDA kernel
110123
//------------------------------------------------------------------------------
111124
int CeedRunKernelCuda(Ceed ceed, CUfunction kernel, const int gridSize,
112125
const int blockSize, void **args) {
113-
CeedChk_Cu(ceed, cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1,
114-
1, 0, NULL, args, NULL));
126+
CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1,
127+
1, 0, NULL, args, NULL);
128+
if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
129+
int max_threads_per_block, shared_size_bytes, num_regs;
130+
cuFuncGetAttribute(&max_threads_per_block,
131+
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
132+
cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
133+
kernel);
134+
cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
135+
return CeedError(ceed, CEED_ERROR_BACKEND,
136+
"CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
137+
max_threads_per_block, blockSize, 1, 1, shared_size_bytes, num_regs);
138+
} else CeedChk_Cu(ceed, result);
115139
return CEED_ERROR_SUCCESS;
116140
}
117141

@@ -162,14 +186,11 @@ int CeedCudaInit(Ceed ceed, const char *resource, int nrc) {
162186
ierr = cudaSetDevice(deviceID); CeedChk_Cu(ceed,ierr);
163187
currentDeviceID = deviceID;
164188
}
165-
struct cudaDeviceProp deviceProp;
166-
ierr = cudaGetDeviceProperties(&deviceProp, currentDeviceID);
167-
CeedChk_Cu(ceed,ierr);
168-
169189
Ceed_Cuda *data;
170190
ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
171191
data->deviceId = currentDeviceID;
172-
data->optblocksize = deviceProp.maxThreadsPerBlock;
192+
ierr = cudaGetDeviceProperties(&data->deviceProp, currentDeviceID);
193+
CeedChk_Cu(ceed,ierr);
173194
return CEED_ERROR_SUCCESS;
174195
}
175196

backends/cuda/ceed-cuda.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,9 @@ typedef struct {
169169
} CeedOperator_Cuda;
170170

171171
// Backend-private data attached to a CUDA Ceed object.
typedef struct {
  int deviceId;                      // CUDA device ordinal selected at init (set in CeedCudaInit)
  cublasHandle_t cublasHandle;       // cuBLAS handle for this Ceed -- presumably created on demand; confirm against the rest of the backend
  struct cudaDeviceProp deviceProp;  // device properties cached at init (warpSize, multiProcessorCount, ... used for block-size selection)
} Ceed_Cuda;
176176

177177
static inline CeedInt CeedDivUpInt(CeedInt numer, CeedInt denom) {
@@ -188,6 +188,9 @@ CEED_INTERN int CeedRunKernelCuda(Ceed ceed, CUfunction kernel,
188188
const int gridSize,
189189
const int blockSize, void **args);
190190

191+
CEED_INTERN int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel,
192+
size_t size, void **args);
193+
191194
CEED_INTERN int CeedRunKernelDimCuda(Ceed ceed, CUfunction kernel,
192195
const int gridSize,
193196
const int blockSizeX, const int blockSizeY,

0 commit comments

Comments
 (0)