@@ -32,6 +32,81 @@ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
   return CEED_ERROR_SUCCESS;
 }
 
+static int Waste(int threads_per_sm, int warp_size, int threads_per_elem,
+                 int elems_per_block) {
+  int useful_threads_per_block = threads_per_elem * elems_per_block;
+  // round up to nearest multiple of warp_size
+  int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) *
+                   warp_size;
+  int blocks_per_sm = threads_per_sm / block_size;
+  return threads_per_sm - useful_threads_per_block * blocks_per_sm;
+}
+
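+// A worked example with assumed numbers (illustration only, not a device
+// query): on an SM with 1536 thread slots, warp_size 32, and 7x7 = 49 threads
+// per element, Waste(1536, 32, 49, 2) counts 98 useful threads, rounds up to
+// a 128-thread block, fits 1536/128 = 12 blocks per SM, and returns
+// 1536 - 98*12 = 360 idle hardware threads.
+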
+// Choose the least wasteful block size constrained by blocks_per_sm of
+// max_threads_per_block.
+//
+// The x and y parts of block[] contain per-element sizes (specified on input)
+// while the z part is the number of elements.
+//
+// Problem setting: we'd like to make occupancy high with relatively few
+// inactive threads. CUDA (cuOccupancyMaxPotentialBlockSize) can tell us how
+// many threads can run.
+//
+// Note that full occupancy sometimes can't be achieved by one thread block.
+// For example, an SM might support 1536 threads in total, but only 1024 within
+// a single thread block. So cuOccupancyMaxPotentialBlockSize may suggest a
+// block size of 768 so that two blocks can run, whereas one block of 1024
+// would prevent a second block from running. The cuda-gen kernels are pretty
+// heavy, with lots of instruction-level parallelism (ILP), so we'll generally
+// be okay with relatively low occupancy and smaller thread blocks, but we
+// solve a reasonably general problem here. Empirically, we find that blocks
+// bigger than about 256 threads have higher latency and worse load balancing
+// when the number of elements is modest.
+//
+// cuda-gen can't choose block sizes arbitrarily; they need to be a multiple of
+// the number of quadrature points (or number of basis functions). They also
+// have a lot of __syncthreads(), which is another point against excessively
+// large thread blocks. Suppose we have elements with 7x7x7 quadrature points.
+// The kernel loops over the last dimension, so we have 7*7 = 49 threads per
+// element. Suppose we pack two elements per block: 2*49 = 98 useful threads.
+// CUDA schedules in units of full warps (32 threads), so 128 CUDA hardware
+// threads are effectively committed to that block. Now suppose
+// cuOccupancyMaxPotentialBlockSize returned 352. We can schedule 2 blocks of
+// size 98 (196 useful threads using 256 hardware threads), but not a third
+// block (which would need a total of 384 hardware threads).
+//
+// If instead we had packed 3 elements, we'd have 3*49 = 147 useful threads
+// occupying 160 slots, and could schedule two blocks. Alternatively, we could
+// pack a single block of 7 elements (7*49 = 343 useful threads) into the 352
+// slots. The latter has the least "waste", but __syncthreads()
+// over-synchronizes and it might not pay off relative to smaller blocks.
+static int BlockGridCalculate(CeedInt nelem, int blocks_per_sm,
+                              int max_threads_per_block, int warp_size,
+                              int block[3], int *grid) {
+  const int threads_per_sm = blocks_per_sm * max_threads_per_block;
+  const int threads_per_elem = block[0] * block[1];
+  int elems_per_block = 1;
+  int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1);
+  for (int i = 2;
+       i <= CeedIntMin(max_threads_per_block / threads_per_elem, nelem);
+       i++) {
+    int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i);
+    // We want to minimize waste, but smaller blocks have lower latency and
+    // less __syncthreads() overhead, so on equal waste we only take the
+    // larger packing while the block stays small (at most 128 threads);
+    // beyond that, ties keep the smaller block.
+    if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) {
+      elems_per_block = i;
+      waste = i_waste;
+    }
+  }
+  block[2] = elems_per_block;
+  *grid = (nelem + elems_per_block - 1) / elems_per_block;
+  return CEED_ERROR_SUCCESS;
+}
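+
+// Usage sketch with assumed numbers (mirroring the 7x7x7 example above, not a
+// real device query): 1000 elements, 2 blocks per SM, 352 max threads per
+// block, warp size 32:
+//
+//   int block[3] = {7, 7, -1}, grid;
+//   BlockGridCalculate(1000, 2, 352, 32, block, &grid);
+//   // block[2] == 7: 7*49 = 343 useful threads in a 352-thread block
+//   // grid == (1000 + 7 - 1) / 7 == 143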
+
+// Callback for cuOccupancyMaxPotentialBlockSize, providing the amount of
+// dynamic shared memory required for a thread block of size threads.
+static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar); }
+
 //------------------------------------------------------------------------------
 // Apply and add to output
 //------------------------------------------------------------------------------
@@ -40,6 +115,8 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
   int ierr;
   Ceed ceed;
   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
+  Ceed_Cuda *cuda_data;
+  ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr);
   CeedOperator_Cuda_gen *data;
   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
   CeedQFunction qf;
@@ -122,28 +199,16 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector invec,
   const CeedInt Q1d = data->Q1d;
   const CeedInt P1d = data->maxP1d;
   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
-  if (dim == 1) {
-    const CeedInt elemsPerBlock = 32;
-    CeedInt grid = nelem/elemsPerBlock + ((nelem/elemsPerBlock*elemsPerBlock < nelem)
-                                          ? 1 : 0);
-    CeedInt sharedMem = elemsPerBlock*thread1d*sizeof(CeedScalar);
-    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, 1,
-                                      elemsPerBlock, sharedMem, opargs);
-  } else if (dim == 2) {
-    const CeedInt elemsPerBlock = thread1d < 4 ? 16 : 2;
-    CeedInt grid = nelem/elemsPerBlock + ((nelem/elemsPerBlock*elemsPerBlock < nelem)
-                                          ? 1 : 0);
-    CeedInt sharedMem = elemsPerBlock*thread1d*thread1d*sizeof(CeedScalar);
-    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, thread1d,
-                                      elemsPerBlock, sharedMem, opargs);
-  } else if (dim == 3) {
-    const CeedInt elemsPerBlock = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
-    CeedInt grid = nelem/elemsPerBlock + ((nelem/elemsPerBlock*elemsPerBlock < nelem)
-                                          ? 1 : 0);
-    CeedInt sharedMem = elemsPerBlock*thread1d*thread1d*sizeof(CeedScalar);
-    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, thread1d, thread1d,
-                                      elemsPerBlock, sharedMem, opargs);
-  }
+  int max_threads_per_block, min_grid_size;
+  CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
+             &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+  int block[3] = {thread1d, dim < 2 ? 1 : thread1d, -1}, grid;
+  CeedChkBackend(BlockGridCalculate(nelem,
+                                    min_grid_size / cuda_data->deviceProp.multiProcessorCount,
+                                    max_threads_per_block,
+                                    cuda_data->deviceProp.warpSize, block, &grid));
+  CeedInt shared_mem = block[0]*block[1]*block[2]*sizeof(CeedScalar);
+  ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1],
+                                    block[2], shared_mem, opargs);
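+  // NOTE: shared_mem is one CeedScalar per thread in the block, matching
+  // dynamicSMemSize() above, so the launch uses the same shared-memory
+  // footprint the occupancy query assumed.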
   CeedChkBackend(ierr);
 
   // Restore input arrays