Skip to content

Commit 6cca686

Browse files
xiaogang-chen-amd authored and alexdeucher committed
drm/amdkfd: kfd driver supports hot unplug/replug amdgpu devices
This patch allows the kfd driver to function correctly when AMD GPU devices are unplugged/replugged at run time. When an AMD GPU device is unplugged, the kfd driver first stops all queues and then gracefully terminates the existing kfd processes by sending SIGBUS to the corresponding user processes. After that, user space can still use the remaining AMD GPU devices. Once all AMD GPU devices in the system have been removed, the kfd driver will not respond to new requests. Unplugged AMD GPU devices can be re-plugged; the kfd driver will then use the added devices and function as usual. The purpose of this patch is to make the kfd driver behave as expected during and after AMD GPU device unplug/replug at run time. Signed-off-by: Xiaogang Chen <Xiaogang.Chen@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent d81e52f commit 6cca686

8 files changed

Lines changed: 156 additions & 2 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,11 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
248248
kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
249249
}
250250

251+
/*
 * amdgpu_amdkfd_teardown_processes - amdgpu-side entry point for KFD
 * process teardown.
 *
 * Forwards @adev unchanged to kgd2kfd_teardown_processes() and does
 * nothing else.
 */
void amdgpu_amdkfd_teardown_processes(struct amdgpu_device *adev)
252+
{
253+
kgd2kfd_teardown_processes(adev);
254+
}
255+
251256
void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc)
252257
{
253258
if (adev->kfd.dev) {

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ struct amdkfd_process_info {
158158

159159
int amdgpu_amdkfd_init(void);
160160
void amdgpu_amdkfd_fini(void);
161+
void amdgpu_amdkfd_teardown_processes(struct amdgpu_device *adev);
161162

162163
void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc);
163164
int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc);
@@ -438,6 +439,8 @@ int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd);
438439
bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
439440
bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
440441
bool retry_fault);
442+
void kgd2kfd_lock_kfd(void);
443+
void kgd2kfd_teardown_processes(struct amdgpu_device *adev);
441444

442445
#else
443446
static inline int kgd2kfd_init(void)
@@ -550,5 +553,13 @@ static inline bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct
550553
return false;
551554
}
552555

556+
/*
 * Empty inline stub: does nothing.
 * NOTE(review): appears to sit in the #else branch of this header's
 * conditional (the condition itself is outside this view) - confirm.
 */
static inline void kgd2kfd_lock_kfd(void)
557+
{
558+
}
559+
560+
/*
 * Empty inline stub: ignores @adev and does nothing.
 * NOTE(review): appears to sit in the #else branch of this header's
 * conditional (the condition itself is outside this view) - confirm.
 */
static inline void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
561+
{
562+
}
563+
553564
#endif
554565
#endif /* AMDGPU_AMDKFD_H_INCLUDED */

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3510,6 +3510,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
35103510
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
35113511

35123512
amdgpu_amdkfd_suspend(adev, true);
3513+
amdgpu_amdkfd_teardown_processes(adev);
35133514
amdgpu_userq_suspend(adev);
35143515

35153516
/* Workaround for ASICs need to disable SMC first */

drivers/gpu/drm/amd/amdkfd/kfd_device.c

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,9 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
973973
}
974974

975975
kfree(kfd);
976+
977+
/* after remove a kfd device unlock kfd driver */
978+
kgd2kfd_unlock_kfd(NULL);
976979
}
977980

978981
int kgd2kfd_pre_reset(struct kfd_dev *kfd,
@@ -1557,10 +1560,14 @@ int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
15571560
return r;
15581561
}
15591562

1563+
/*
 * kgd2kfd_unlock_kfd - drop one lock count on a kfd device or on the
 * kfd driver as a whole.
 *
 * @kfd: device whose kfd_dev_lock counter is decremented; when NULL the
 *       driver-wide kfd_locked counter is decremented instead.
 *
 * The decrement is performed while holding kfd_processes_mutex.
 */
15601564
void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
15611565
{
15621566
mutex_lock(&kfd_processes_mutex);
1563-
--kfd->kfd_dev_lock;
1567+
if (kfd)
1568+
--kfd->kfd_dev_lock;
1569+
else
1570+
--kfd_locked;
15641571
mutex_unlock(&kfd_processes_mutex);
15651572
}
15661573

@@ -1729,6 +1736,73 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr
17291736
return false;
17301737
}
17311738

1739+
/* check if there is kfd process still uses adev */
1740+
static bool kgd2kfd_check_device_idle(struct amdgpu_device *adev)
1741+
{
1742+
struct kfd_process *p;
1743+
struct hlist_node *p_temp;
1744+
unsigned int temp;
1745+
struct kfd_node *dev;
1746+
1747+
mutex_lock(&kfd_processes_mutex);
1748+
1749+
if (hash_empty(kfd_processes_table)) {
1750+
mutex_unlock(&kfd_processes_mutex);
1751+
return true;
1752+
}
1753+
1754+
/* check if there is device still use adev */
1755+
hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
1756+
for (int i = 0; i < p->n_pdds; i++) {
1757+
dev = p->pdds[i]->dev;
1758+
if (dev->adev == adev) {
1759+
mutex_unlock(&kfd_processes_mutex);
1760+
return false;
1761+
}
1762+
}
1763+
}
1764+
1765+
mutex_unlock(&kfd_processes_mutex);
1766+
1767+
return true;
1768+
}
1769+
1770+
/** kgd2kfd_teardown_processes - gracefully tear down existing
1771+
* kfd processes that use adev
1772+
*
1773+
* @adev: amdgpu_device where kfd processes run on and will be
1774+
* teardown
1775+
*
1776+
*/
1777+
void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
1778+
{
1779+
struct hlist_node *p_temp;
1780+
struct kfd_process *p;
1781+
struct kfd_node *dev;
1782+
unsigned int temp;
1783+
1784+
mutex_lock(&kfd_processes_mutex);
1785+
1786+
if (hash_empty(kfd_processes_table)) {
1787+
mutex_unlock(&kfd_processes_mutex);
1788+
return;
1789+
}
1790+
1791+
hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
1792+
for (int i = 0; i < p->n_pdds; i++) {
1793+
dev = p->pdds[i]->dev;
1794+
if (dev->adev == adev)
1795+
kfd_signal_process_terminate_event(p);
1796+
}
1797+
}
1798+
1799+
mutex_unlock(&kfd_processes_mutex);
1800+
1801+
/* wait all kfd processes use adev terminate */
1802+
while (!kgd2kfd_check_device_idle(adev))
1803+
cond_resched();
1804+
}
1805+
17321806
#if defined(CONFIG_DEBUG_FS)
17331807

17341808
/* This function will send a package to HIQ to hang the HWS

drivers/gpu/drm/amd/amdkfd/kfd_events.c

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,3 +1380,32 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
13801380

13811381
kfd_unref_process(p);
13821382
}
1383+
1384+
/* signal KFD_EVENT_TYPE_SIGNAL events from process p
1385+
* send signal SIGBUS to correspondent user space process
1386+
*/
1387+
void kfd_signal_process_terminate_event(struct kfd_process *p)
1388+
{
1389+
struct kfd_event *ev;
1390+
u32 id;
1391+
1392+
rcu_read_lock();
1393+
1394+
/* iterate from id 1 for KFD_EVENT_TYPE_SIGNAL events */
1395+
id = 1;
1396+
idr_for_each_entry_continue(&p->event_idr, ev, id)
1397+
if (ev->type == KFD_EVENT_TYPE_SIGNAL) {
1398+
spin_lock(&ev->lock);
1399+
set_event(ev);
1400+
spin_unlock(&ev->lock);
1401+
}
1402+
1403+
/* Send SIGBUS to p->lead_thread */
1404+
dev_notice(kfd_device,
1405+
"Sending SIGBUS to process %d",
1406+
p->lead_thread->pid);
1407+
1408+
send_sig(SIGBUS, p->lead_thread, 0);
1409+
1410+
rcu_read_unlock();
1411+
}

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,6 +1192,7 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
11921192
}
11931193
int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_node **kdev);
11941194
int kfd_numa_node_to_apic_id(int numa_node_id);
1195+
uint32_t kfd_gpu_node_num(void);
11951196

11961197
/* Interrupts */
11971198
#define KFD_IRQ_FENCE_CLIENTID 0xff
@@ -1547,6 +1548,7 @@ void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
15471548
void kfd_signal_reset_event(struct kfd_node *dev);
15481549

15491550
void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid);
1551+
void kfd_signal_process_terminate_event(struct kfd_process *p);
15501552

15511553
static inline void kfd_flush_tlb(struct kfd_process_device *pdd,
15521554
enum TLB_FLUSH_TYPE type)

drivers/gpu/drm/amd/amdkfd/kfd_process.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,12 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
949949
*/
950950
mutex_lock(&kfd_processes_mutex);
951951

952+
if (kfd_gpu_node_num() <= 0) {
953+
pr_warn("no gpu node! Cannot create KFD process");
954+
process = ERR_PTR(-EINVAL);
955+
goto out;
956+
}
957+
952958
if (kfd_is_locked(NULL)) {
953959
pr_debug("KFD is locked! Cannot create process");
954960
process = ERR_PTR(-EINVAL);
@@ -1235,7 +1241,6 @@ static void kfd_process_wq_release(struct work_struct *work)
12351241
else
12361242
ida_destroy(&p->id_table);
12371243

1238-
kfd_process_remove_sysfs(p);
12391244
kfd_debugfs_remove_process(p);
12401245

12411246
kfd_process_kunmap_signal_bo(p);
@@ -1251,6 +1256,11 @@ static void kfd_process_wq_release(struct work_struct *work)
12511256

12521257
put_task_struct(p->lead_thread);
12531258

1259+
/* the last step is removing process entries under /sys
1260+
* to indicate the process has been terminated.
1261+
*/
1262+
kfd_process_remove_sysfs(p);
1263+
12541264
kfree(p);
12551265
}
12561266

drivers/gpu/drm/amd/amdkfd/kfd_topology.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2357,6 +2357,28 @@ int kfd_numa_node_to_apic_id(int numa_node_id)
23572357
return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
23582358
}
23592359

2360+
/* kfd_gpu_node_num - Return kfd gpu node number at system */
2361+
uint32_t kfd_gpu_node_num(void)
2362+
{
2363+
struct kfd_node *dev;
2364+
u8 gpu_num = 0;
2365+
u8 id = 0;
2366+
2367+
while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
2368+
if (!dev || kfd_devcgroup_check_permission(dev)) {
2369+
/* Skip non GPU devices and devices to which the
2370+
* current process have no access to
2371+
*/
2372+
id++;
2373+
continue;
2374+
}
2375+
id++;
2376+
gpu_num++;
2377+
}
2378+
2379+
return gpu_num;
2380+
}
2381+
23602382
#if defined(CONFIG_DEBUG_FS)
23612383

23622384
int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)

0 commit comments

Comments
 (0)