Skip to content

Commit 0620837

Browse files
author
Thomas Hellström
committed
drm/xe/svm: Serialize migration to device if racing
Introduce an rw-semaphore to serialize migration to device if it's likely that migration races with another device migration of the same CPU address space range. This is a temporary fix to attempt to mitigate a livelock that might happen if many devices try to migrate a range at the same time, and it affects only devices using the xe driver. A longer-term fix probably requires improvements in the core mm migration layer.

Suggested-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/20251219113320.183860-25-thomas.hellstrom@linux.intel.com
1 parent ec265e1 commit 0620837

1 file changed

Lines changed: 20 additions & 2 deletions

File tree

drivers/gpu/drm/xe/xe_svm.c

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1593,10 +1593,12 @@ struct drm_pagemap *xe_vma_resolve_pagemap(struct xe_vma *vma, struct xe_tile *t
15931593
int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *ctx,
15941594
struct drm_pagemap *dpagemap)
15951595
{
1596+
static DECLARE_RWSEM(driver_migrate_lock);
15961597
struct xe_vm *vm = range_to_vm(&range->base);
15971598
enum drm_gpusvm_scan_result migration_state;
15981599
struct xe_device *xe = vm->xe;
15991600
int err, retries = 1;
1601+
bool write_locked = false;
16001602

16011603
xe_assert(range_to_vm(&range->base)->xe, range->base.pages.flags.migrate_devmem);
16021604
range_debug(range, "ALLOCATE VRAM");
@@ -1615,16 +1617,32 @@ int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *c
16151617
drm_dbg(&xe->drm, "Request migration to device memory on \"%s\".\n",
16161618
dpagemap->drm->unique);
16171619

1620+
err = down_read_interruptible(&driver_migrate_lock);
1621+
if (err)
1622+
return err;
16181623
do {
16191624
err = drm_pagemap_populate_mm(dpagemap, xe_svm_range_start(range),
16201625
xe_svm_range_end(range),
16211626
range->base.gpusvm->mm,
16221627
ctx->timeslice_ms);
16231628

1624-
if (err == -EBUSY && retries)
1625-
drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
1629+
if (err == -EBUSY && retries) {
1630+
if (!write_locked) {
1631+
int lock_err;
16261632

1633+
up_read(&driver_migrate_lock);
1634+
lock_err = down_write_killable(&driver_migrate_lock);
1635+
if (lock_err)
1636+
return lock_err;
1637+
write_locked = true;
1638+
}
1639+
drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
1640+
}
16271641
} while (err == -EBUSY && retries--);
1642+
if (write_locked)
1643+
up_write(&driver_migrate_lock);
1644+
else
1645+
up_read(&driver_migrate_lock);
16281646

16291647
return err;
16301648
}

0 commit comments

Comments
 (0)