Skip to content

Commit c53bf7d

Browse files
authored
Merge pull request #810 from CEED/cuda-gen-launch
Adjust cuda-gen launch to limit z dimension of thread block
2 parents (2651368 + 1351654), commit c53bf7d

1 file changed

Lines changed: 7 additions & 2 deletions

File tree

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,8 @@ static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
8181
// slots. The latter has the least "waste", but __syncthreads()
8282
// over-synchronizes and it might not pay off relative to smaller blocks.
8383
static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
84-
int max_threads_per_block, int warp_size, int block[3], int *grid) {
84+
int max_threads_per_block, int max_threads_z,
85+
int warp_size, int block[3], int *grid) {
8586
const int threads_per_sm = blocks_per_sm * max_threads_per_block;
8687
const int threads_per_elem = block[0] * block[1];
8788
int elems_per_block = 1;
@@ -98,7 +99,10 @@ static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
9899
waste = i_waste;
99100
}
100101
}
101-
block[2] = elems_per_block;
102+
// In low-order elements, threads_per_elem may be sufficiently low to give
103+
// an elems_per_block greater than allowable for the device, so we must check
104+
// before setting the z-dimension size of the block.
105+
block[2] = CeedIntMin(elems_per_block, max_threads_z);
102106
*grid = (nelem + elems_per_block - 1) / elems_per_block;
103107
return CEED_ERROR_SUCCESS;
104108
}
@@ -205,6 +209,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
205209
int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1,}, grid;
206210
CeedChkBackend(BlockGridCalculate(nelem,
207211
min_grid_size/ cuda_data->deviceProp.multiProcessorCount, max_threads_per_block,
212+
cuda_data->deviceProp.maxThreadsDim[2],
208213
cuda_data->deviceProp.warpSize, block, &grid));
209214
CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
210215
ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],

0 commit comments

Comments
 (0)