Skip to content

Commit fb37386

Browse files
mbrost05 and Thomas Hellström
authored and committed
drm/xe: Forcefully tear down exec queues in GuC submit fini
In GuC submit fini, forcefully tear down any exec queues by disabling CTs, stopping the scheduler (which cleans up lost G2H), killing all remaining queues, and resuming scheduling to allow any remaining cleanup actions to complete and signal any remaining fences. Split guc_submit_fini into device related and software only part. Using device-managed and drm-managed action guarantees the correct ordering of cleanup. Fixes: dd08ebf ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: stable@vger.kernel.org Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com> Signed-off-by: Matthew Brost <matthew.brost@intel.com> Link: https://patch.msgid.link/20260310225039.1320161-3-zhanjun.dong@intel.com (cherry picked from commit a6ab444) Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
1 parent 26c638d commit fb37386

3 files changed

Lines changed: 63 additions & 12 deletions

File tree

drivers/gpu/drm/xe/xe_guc.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,15 +1347,37 @@ int xe_guc_enable_communication(struct xe_guc *guc)
13471347
return 0;
13481348
}
13491349

1350-
int xe_guc_suspend(struct xe_guc *guc)
1350+
/**
1351+
* xe_guc_softreset() - Soft reset GuC
1352+
* @guc: The GuC object
1353+
*
1354+
* Send soft reset command to GuC through mmio send.
1355+
*
1356+
* Return: 0 if success, otherwise error code
1357+
*/
1358+
int xe_guc_softreset(struct xe_guc *guc)
13511359
{
1352-
struct xe_gt *gt = guc_to_gt(guc);
13531360
u32 action[] = {
13541361
XE_GUC_ACTION_CLIENT_SOFT_RESET,
13551362
};
13561363
int ret;
13571364

1365+
if (!xe_uc_fw_is_running(&guc->fw))
1366+
return 0;
1367+
13581368
ret = xe_guc_mmio_send(guc, action, ARRAY_SIZE(action));
1369+
if (ret)
1370+
return ret;
1371+
1372+
return 0;
1373+
}
1374+
1375+
int xe_guc_suspend(struct xe_guc *guc)
1376+
{
1377+
struct xe_gt *gt = guc_to_gt(guc);
1378+
int ret;
1379+
1380+
ret = xe_guc_softreset(guc);
13591381
if (ret) {
13601382
xe_gt_err(gt, "GuC suspend failed: %pe\n", ERR_PTR(ret));
13611383
return ret;

drivers/gpu/drm/xe/xe_guc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ int xe_guc_opt_in_features_enable(struct xe_guc *guc);
4444
void xe_guc_runtime_suspend(struct xe_guc *guc);
4545
void xe_guc_runtime_resume(struct xe_guc *guc);
4646
int xe_guc_suspend(struct xe_guc *guc);
47+
int xe_guc_softreset(struct xe_guc *guc);
4748
void xe_guc_notify(struct xe_guc *guc);
4849
int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);
4950
int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len);

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848

4949
#define XE_GUC_EXEC_QUEUE_CGP_CONTEXT_ERROR_LEN 6
5050

51+
static int guc_submit_reset_prepare(struct xe_guc *guc);
52+
5153
static struct xe_guc *
5254
exec_queue_to_guc(struct xe_exec_queue *q)
5355
{
@@ -239,7 +241,7 @@ static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
239241
EXEC_QUEUE_STATE_BANNED));
240242
}
241243

242-
static void guc_submit_fini(struct drm_device *drm, void *arg)
244+
static void guc_submit_sw_fini(struct drm_device *drm, void *arg)
243245
{
244246
struct xe_guc *guc = arg;
245247
struct xe_device *xe = guc_to_xe(guc);
@@ -257,6 +259,19 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
257259
xa_destroy(&guc->submission_state.exec_queue_lookup);
258260
}
259261

262+
static void guc_submit_fini(void *arg)
263+
{
264+
struct xe_guc *guc = arg;
265+
266+
/* Forcefully kill any remaining exec queues */
267+
xe_guc_ct_stop(&guc->ct);
268+
guc_submit_reset_prepare(guc);
269+
xe_guc_softreset(guc);
270+
xe_guc_submit_stop(guc);
271+
xe_uc_fw_sanitize(&guc->fw);
272+
xe_guc_submit_pause_abort(guc);
273+
}
274+
260275
static void guc_submit_wedged_fini(void *arg)
261276
{
262277
struct xe_guc *guc = arg;
@@ -326,7 +341,11 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
326341

327342
guc->submission_state.initialized = true;
328343

329-
return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
344+
err = drmm_add_action_or_reset(&xe->drm, guc_submit_sw_fini, guc);
345+
if (err)
346+
return err;
347+
348+
return devm_add_action_or_reset(xe->drm.dev, guc_submit_fini, guc);
330349
}
331350

332351
/*
@@ -2230,14 +2249,15 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
22302249
static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
22312250
{
22322251
struct xe_gpu_scheduler *sched = &q->guc->sched;
2252+
bool do_destroy = false;
22332253

22342254
/* Stop scheduling + flush any DRM scheduler operations */
22352255
xe_sched_submission_stop(sched);
22362256

22372257
/* Clean up lost G2H + reset engine state */
22382258
if (exec_queue_registered(q)) {
22392259
if (exec_queue_destroyed(q))
2240-
__guc_exec_queue_destroy(guc, q);
2260+
do_destroy = true;
22412261
}
22422262
if (q->guc->suspend_pending) {
22432263
set_exec_queue_suspended(q);
@@ -2273,18 +2293,15 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
22732293
xe_guc_exec_queue_trigger_cleanup(q);
22742294
}
22752295
}
2296+
2297+
if (do_destroy)
2298+
__guc_exec_queue_destroy(guc, q);
22762299
}
22772300

2278-
int xe_guc_submit_reset_prepare(struct xe_guc *guc)
2301+
static int guc_submit_reset_prepare(struct xe_guc *guc)
22792302
{
22802303
int ret;
22812304

2282-
if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
2283-
return 0;
2284-
2285-
if (!guc->submission_state.initialized)
2286-
return 0;
2287-
22882305
/*
22892306
* Using an atomic here rather than submission_state.lock as this
22902307
* function can be called while holding the CT lock (engine reset
@@ -2299,6 +2316,17 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
22992316
return ret;
23002317
}
23012318

2319+
int xe_guc_submit_reset_prepare(struct xe_guc *guc)
2320+
{
2321+
if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc)))
2322+
return 0;
2323+
2324+
if (!guc->submission_state.initialized)
2325+
return 0;
2326+
2327+
return guc_submit_reset_prepare(guc);
2328+
}
2329+
23022330
void xe_guc_submit_reset_wait(struct xe_guc *guc)
23032331
{
23042332
wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||

0 commit comments

Comments (0)