Commit 39532ce

backends/cuda-gen: use occupancy to calculate launch sizes

Choose launch sizes that actually fit on the device, are large enough to amortize per-thread-block overhead, and still permit high occupancy. See https://developer.nvidia.com/blog/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/
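
The linked post describes the pattern this commit adopts: ask the occupancy API for a good block size before launching. Below is a minimal, standalone sketch of that pattern using the CUDA runtime API; note the commit itself uses the driver-API cuOccupancyMaxPotentialBlockSize with a callback for block-size-dependent shared memory, and the kernel and problem size here are hypothetical, not taken from the diff.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for a generated operator kernel.
__global__ void apply(double *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0;
}

int main() {
  int min_grid_size = 0, block_size = 0;
  // The occupancy API returns the block size that maximizes occupancy for
  // this kernel and the minimum grid size needed to saturate the device.
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, apply,
                                     /*dynamicSMemSize=*/0, /*blockSizeLimit=*/0);
  int n = 1 << 20;
  int grid = (n + block_size - 1) / block_size;  // round up to cover all n
  double *x;
  cudaMalloc(&x, n * sizeof(double));
  apply<<<grid, block_size>>>(x, n);
  cudaDeviceSynchronize();
  printf("suggested block %d, min grid %d, launched grid %d\n",
         block_size, min_grid_size, grid);
  cudaFree(x);
  return 0;
}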
1 parent 44abf3e

1 file changed: backends/cuda-gen/ceed-cuda-gen-operator.c (87 additions, 22 deletions)
@@ -32,6 +32,81 @@ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
   return CEED_ERROR_SUCCESS;
 }
 
+static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
+                 int elems_per_block) {
+  int useful_threads_per_block = threads_per_elem * elems_per_block;
+  // round up to the nearest multiple of warp_size
+  int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) *
+                   warp_size;
+  int blocks_per_sm = threads_per_sm / block_size;
+  return threads_per_sm - useful_threads_per_block * blocks_per_sm;
+}
+
+// Choose the least wasteful block size, constrained by blocks_per_sm blocks
+// of at most max_threads_per_block threads each.
+//
+// The x and y parts of block[] contain per-element sizes (specified on input)
+// while the z part is the number of elements packed per block.
+//
+// Problem setting: we'd like to make occupancy high with relatively few
+// inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
+// many threads can run.
+//
+// Note that full occupancy sometimes can't be achieved by one thread block.
+// For example, an SM might support 1536 threads in total, but only 1024 within
+// a single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a
+// block size of 768 so that two blocks can run, whereas one block of 1024
+// would prevent a second block from running. The cuda-gen kernels are pretty
+// heavy, with lots of instruction-level parallelism (ILP), so we'll generally
+// be okay with relatively low occupancy and smaller thread blocks, but we
+// solve a reasonably general problem here. Empirically, we find that blocks
+// bigger than about 256 threads have higher latency and worse load balancing
+// when the number of elements is modest.
+//
+// cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of
+// the number of quadrature points (or the number of basis functions). The
+// kernels also have a lot of __syncthreads(), which is another point against
+// excessively large thread blocks. Suppose we have elements with 7x7x7
+// quadrature points. The kernel loops over the last dimension, so we have
+// 7*7=49 threads per element. Suppose we pack two elements = 2*49=98 useful
+// threads. CUDA schedules in units of full warps (32 threads), so 128 CUDA
+// hardware threads are effectively committed to that block. Now suppose
+// cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of
+// size 98 (196 useful threads using 256 hardware threads), but not a third
+// block (which would need a total of 384 hardware threads).
+//
+// If instead we had packed 3 elements, we'd have 3*49=147 useful threads
+// occupying 160 slots, and could still schedule two blocks. Alternatively, we
+// could pack a single block of 7 elements (7*49=343 useful threads) into the
+// 352 slots. The latter has the least "waste", but __syncthreads()
+// over-synchronizes and it might not pay off relative to smaller blocks.
+static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
+                              int max_threads_per_block, int warp_size,
+                              int block[3], int *grid) {
+  const int threads_per_sm = blocks_per_sm * max_threads_per_block;
+  const int threads_per_elem = block[0] * block[1];
+  int elems_per_block = 1;
+  int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
+  for (int i = 2;
+       i <= CeedIntMin(max_threads_per_block / threads_per_elem, nelem);
+       i++) {
+    int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
+    // We want to minimize waste, but smaller blocks have lower latency and
+    // less __syncthreads() overhead, so on a tie in waste we only move to a
+    // larger block while it stays at or below 128 threads; past that, keep
+    // the smaller block.
+    if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
+      elems_per_block = i;
+      waste = i_waste;
+    }
+  }
+  block[2] = elems_per_block;
+  *grid = (nelem + elems_per_block - 1) / elems_per_block;
+  return CEED_ERROR_SUCCESS;
+}
+
+// Callback for cuOccupancyMaxPotentialBlockSize, providing the amount of
+// dynamic shared memory required for a thread block of size threads.
+static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
+
 //------------------------------------------------------------------------------
 // Apply and add to output
//------------------------------------------------------------------------------
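
To make the 7x7x7 arithmetic in the comment above concrete, here is a standalone C sketch that replays the Waste() helper with those numbers: 49 useful threads per element, warp size 32, and 352 schedulable threads (assuming, for illustration only, one 352-thread block per SM).

#include <stdio.h>

// Copy of the Waste() helper added by this commit: hardware threads left
// idle per SM for a given packing of elements into thread blocks.
static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
                 int elems_per_block) {
  int useful_threads_per_block = threads_per_elem * elems_per_block;
  // round up to the nearest multiple of warp_size
  int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) *
                   warp_size;
  int blocks_per_sm = threads_per_sm / block_size;
  return threads_per_sm - useful_threads_per_block * blocks_per_sm;
}

int main(void) {
  const int threads_per_sm = 352, warp_size = 32, threads_per_elem = 7 * 7;
  for (int e = 1; e <= 7; e++)
    printf("%d elems/block -> waste %d\n", e,
           Waste(threads_per_sm, warp_size, threads_per_elem, e));
  return 0;
}

This prints waste values 107, 156, 58, 156, 107, 58, and 9: packing 7 elements per block leaves only 9 hardware threads idle, matching the comment's conclusion that it has the least waste. The 128-thread tie-break in BlockGridCalculate only comes into play when two packings waste equally.
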
@@ -40,6 +115,8 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
   int ierr;
   Ceed ceed;
   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
+  Ceed_Cuda *cuda_data;
+  ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
   CeedOperator_Cuda_gen *data;
   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
   CeedQFunction qf;
@@ -122,28 +199,16 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
   const CeedInt Q1d = data->Q1d;
   const CeedInt P1d = data->maxP1d;
   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
-  if (dim==1) {
-    const CeedInt elemsPerBlock = 32;
-    CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
-                                           ? 1 : 0 );
-    CeedInt sharedMem = elemsPerBlock*thread1d*sizeof(CeedScalar);
-    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, 1,
-                                      elemsPerBlock, sharedMem, opargs);
-  } else if (dim==2) {
-    const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
-    CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
-                                           ? 1 : 0 );
-    CeedInt sharedMem = elemsPerBlock*thread1d*thread1d*sizeof(CeedScalar);
-    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, thread1d,
-                                      elemsPerBlock, sharedMem, opargs);
-  } else if (dim==3) {
-    const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
-    CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
-                                           ? 1 : 0 );
-    CeedInt sharedMem = elemsPerBlock*thread1d*thread1d*sizeof(CeedScalar);
-    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, thread1d,
-                                      elemsPerBlock, sharedMem, opargs);
-  }
+  int max_threads_per_block, min_grid_size;
+  CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
+             &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+  int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1}, grid;
+  CeedChkBackend(BlockGridCalculate(nelem,
+                 min_grid_size / cuda_data->deviceProp.multiProcessorCount,
+                 max_threads_per_block, cuda_data->deviceProp.warpSize,
+                 block, &grid));
+  CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
+  ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],
+                                    block[2], shared_mem, opargs);
   CeedChkBackend(ierr);
 
   // Restore input arrays

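For a concrete feel of the new call site, a small C sketch of the launch shape it produces in a hypothetical 3D case with thread1d = 7. The inputs nelem = 10000 and elems_per_block = 3 are made up for illustration (elems_per_block would really come from BlockGridCalculate), and CeedScalar is taken to be double.

#include <stdio.h>

int main(void) {
  int nelem = 10000, thread1d = 7, elems_per_block = 3;  // illustrative only
  int block[3] = {thread1d, thread1d, elems_per_block};
  int grid = (nelem + elems_per_block - 1) / elems_per_block;  // ceiling division
  size_t shared_mem = (size_t)block[0] * block[1] * block[2] * sizeof(double);
  printf("grid %d, block %dx%dx%d, %zu bytes dynamic shared memory\n",
         grid, block[0], block[1], block[2], shared_mem);
  return 0;
}

This prints grid 3334 with 7x7x3 blocks (147 useful threads, padded to 160 hardware threads per block) and 1176 bytes of dynamic shared memory per block.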