@@ -81,7 +81,8 @@ static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
 // slots. The latter has the least "waste", but __syncthreads()
 // over-synchronizes and it might not pay off relative to smaller blocks.
 static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
-                              int max_threads_per_block, int warp_size, int block[3], int *grid) {
+                              int max_threads_per_block, int max_threads_z,
+                              int warp_size, int block[3], int *grid) {
   const int threads_per_sm = blocks_per_sm * max_threads_per_block;
   const int threads_per_elem = block[0] * block[1];
   int elems_per_block = 1;
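The "waste" mentioned in the comment above is the number of thread slots on an SM that sit idle for a given block shape. The sketch below only illustrates that idea under assumptions of my own (block threads rounded up to whole warps, waste measured against the SM's per-block thread budget); it is not the actual Waste() heuristic from this file.

```c
// Illustration only: one plausible way to count idle thread slots per SM.
// Not the Waste() implementation in ceed-cuda-gen-operator.c.
static int WasteSketch(int threads_per_sm, int warp_size,
                       int threads_per_elem, int elems_per_block) {
  // Threads launched per block, rounded up to whole warps
  const int block_threads = threads_per_elem * elems_per_block;
  const int rounded = ((block_threads + warp_size - 1) / warp_size) * warp_size;
  // How many such blocks fit in the SM's thread budget, and how many slots are left idle
  const int blocks_fit = threads_per_sm / rounded;
  return threads_per_sm - blocks_fit * block_threads;
}
```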
@@ -98,7 +99,10 @@ static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
       waste = i_waste;
     }
   }
-  block[2] = elems_per_block;
+  // In low-order elements, threads_per_elem may be sufficiently low to give
+  // an elems_per_block greater than allowable for the device, so we must check
+  // before setting the z-dimension size of the block.
+  block[2] = CeedIntMin(elems_per_block, max_threads_z);
   *grid = (nelem + elems_per_block - 1) / elems_per_block;
   return CEED_ERROR_SUCCESS;
 }
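The effect of the new clamp is easiest to see with concrete numbers. The standalone sketch below mirrors the two lines changed above; min_int is a local stand-in assumed to behave like libCEED's CeedIntMin, and the sample values (a low-order case with 96 candidate elements per block on a device whose maxThreadsDim[2] is 64) are hypothetical.

```c
#include <stdio.h>

// Local stand-in assumed to behave like CeedIntMin (illustration only)
static int min_int(int a, int b) { return a < b ? a : b; }

int main(void) {
  int nelem = 10000, elems_per_block = 96;  // hypothetical low-order case
  int max_threads_z = 64;                   // typical cudaDeviceProp.maxThreadsDim[2]
  // Mirrors the changed lines: clamp the z-dimension, then ceiling-divide for the grid
  int block_z = min_int(elems_per_block, max_threads_z);
  int grid = (nelem + elems_per_block - 1) / elems_per_block;
  printf("block.z = %d, grid = %d\n", block_z, grid);  // block.z = 64, grid = 105
  return 0;
}
```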
@@ -205,6 +209,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
   int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1,}, grid;
   CeedChkBackend(BlockGridCalculate(nelem,
                  min_grid_size / cuda_data->deviceProp.multiProcessorCount, max_threads_per_block,
+                 cuda_data->deviceProp.maxThreadsDim[2],
                  cuda_data->deviceProp.warpSize, block, &grid));
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
   ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],
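The maxThreadsDim[2] and warpSize values passed at the call site come straight from the CUDA device properties. A minimal query, independent of libCEED, would look like this:

```c
#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);  // properties of device 0
  // maxThreadsDim[2] bounds the z-dimension of a thread block (64 on current GPUs),
  // which is why elems_per_block must be clamped before it is used as block[2].
  printf("maxThreadsDim = (%d, %d, %d), warpSize = %d, SMs = %d\n",
         prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2],
         prop.warpSize, prop.multiProcessorCount);
  return 0;
}
```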