Skip to content

Commit c106394

Browse files
dcuigregkh
authored andcommitted
Drivers: hv: vmbus: Offload the handling of channels to two workqueues
commit 37c2578 upstream. vmbus_process_offer() mustn't call channel->sc_creation_callback() directly for sub-channels, because sc_creation_callback() -> vmbus_open() may never get the host's response to the OPEN_CHANNEL message (the host may rescind a channel at any time, e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind() may not wake up the vmbus_open() as it's blocked due to a non-zero vmbus_connection.offer_in_progress, and finally we have a deadlock. The above is also true for primary channels, if the related device drivers use sync probing mode by default. And, usually the handling of primary channels and sub-channels can depend on each other, so we should offload them to different workqueues to avoid possible deadlock, e.g. in sync-probing mode, NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() -> rtnl_lock(), and causes deadlock: the former gets the rtnl_lock and waits for all the sub-channels to appear, but the latter can't get the rtnl_lock and this blocks the handling of sub-channels. The patch can fix the multiple-NIC deadlock described above for v3.x kernels (e.g. RHEL 7.x) which don't support async-probing of devices, and v4.4, v4.9, v4.14 and v4.18 which support async-probing but don't enable async-probing for Hyper-V drivers (yet). The patch can also fix the hang issue in sub-channel's handling described above for all versions of kernels, including v4.19 and v4.20-rc4. So actually the patch should be applied to all the existing kernels, not only the kernels that have 8195b13. Fixes: 8195b13 ("hv_netvsc: fix deadlock on hotplug") Cc: stable@vger.kernel.org Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: K. Y. Srinivasan <kys@microsoft.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Signed-off-by: Dexuan Cui <decui@microsoft.com> Signed-off-by: K. Y. Srinivasan <kys@microsoft.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent eff5e74 commit c106394

4 files changed

Lines changed: 160 additions & 63 deletions

File tree

drivers/hv/channel_mgmt.c

Lines changed: 125 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -444,61 +444,16 @@ void vmbus_free_channels(void)
444444
}
445445
}
446446

447-
/*
448-
* vmbus_process_offer - Process the offer by creating a channel/device
449-
* associated with this offer
450-
*/
451-
static void vmbus_process_offer(struct vmbus_channel *newchannel)
447+
/* Note: the function can run concurrently for primary/sub channels. */
448+
static void vmbus_add_channel_work(struct work_struct *work)
452449
{
453-
struct vmbus_channel *channel;
454-
bool fnew = true;
450+
struct vmbus_channel *newchannel =
451+
container_of(work, struct vmbus_channel, add_channel_work);
452+
struct vmbus_channel *primary_channel = newchannel->primary_channel;
455453
unsigned long flags;
456454
u16 dev_type;
457455
int ret;
458456

459-
/* Make sure this is a new offer */
460-
mutex_lock(&vmbus_connection.channel_mutex);
461-
462-
/*
463-
* Now that we have acquired the channel_mutex,
464-
* we can release the potentially racing rescind thread.
465-
*/
466-
atomic_dec(&vmbus_connection.offer_in_progress);
467-
468-
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
469-
if (!uuid_le_cmp(channel->offermsg.offer.if_type,
470-
newchannel->offermsg.offer.if_type) &&
471-
!uuid_le_cmp(channel->offermsg.offer.if_instance,
472-
newchannel->offermsg.offer.if_instance)) {
473-
fnew = false;
474-
break;
475-
}
476-
}
477-
478-
if (fnew)
479-
list_add_tail(&newchannel->listentry,
480-
&vmbus_connection.chn_list);
481-
482-
mutex_unlock(&vmbus_connection.channel_mutex);
483-
484-
if (!fnew) {
485-
/*
486-
* Check to see if this is a sub-channel.
487-
*/
488-
if (newchannel->offermsg.offer.sub_channel_index != 0) {
489-
/*
490-
* Process the sub-channel.
491-
*/
492-
newchannel->primary_channel = channel;
493-
spin_lock_irqsave(&channel->lock, flags);
494-
list_add_tail(&newchannel->sc_list, &channel->sc_list);
495-
channel->num_sc++;
496-
spin_unlock_irqrestore(&channel->lock, flags);
497-
} else {
498-
goto err_free_chan;
499-
}
500-
}
501-
502457
dev_type = hv_get_dev_type(newchannel);
503458

504459
init_vp_index(newchannel, dev_type);
@@ -516,21 +471,22 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
516471
/*
517472
* This state is used to indicate a successful open
518473
* so that when we do close the channel normally, we
519-
* can cleanup properly
474+
* can cleanup properly.
520475
*/
521476
newchannel->state = CHANNEL_OPEN_STATE;
522477

523-
if (!fnew) {
524-
if (channel->sc_creation_callback != NULL)
525-
channel->sc_creation_callback(newchannel);
478+
if (primary_channel != NULL) {
479+
/* newchannel is a sub-channel. */
480+
481+
if (primary_channel->sc_creation_callback != NULL)
482+
primary_channel->sc_creation_callback(newchannel);
483+
526484
newchannel->probe_done = true;
527485
return;
528486
}
529487

530488
/*
531-
* Start the process of binding this offer to the driver
532-
* We need to set the DeviceObject field before calling
533-
* vmbus_child_dev_add()
489+
* Start the process of binding the primary channel to the driver
534490
*/
535491
newchannel->device_obj = vmbus_device_create(
536492
&newchannel->offermsg.offer.if_type,
@@ -559,28 +515,133 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
559515

560516
err_deq_chan:
561517
mutex_lock(&vmbus_connection.channel_mutex);
562-
list_del(&newchannel->listentry);
518+
519+
/*
520+
* We need to set the flag, otherwise
521+
* vmbus_onoffer_rescind() can be blocked.
522+
*/
523+
newchannel->probe_done = true;
524+
525+
if (primary_channel == NULL) {
526+
list_del(&newchannel->listentry);
527+
} else {
528+
spin_lock_irqsave(&primary_channel->lock, flags);
529+
list_del(&newchannel->sc_list);
530+
spin_unlock_irqrestore(&primary_channel->lock, flags);
531+
}
532+
563533
mutex_unlock(&vmbus_connection.channel_mutex);
564534

565535
if (newchannel->target_cpu != get_cpu()) {
566536
put_cpu();
567537
smp_call_function_single(newchannel->target_cpu,
568-
percpu_channel_deq, newchannel, true);
538+
percpu_channel_deq,
539+
newchannel, true);
569540
} else {
570541
percpu_channel_deq(newchannel);
571542
put_cpu();
572543
}
573544

574545
vmbus_release_relid(newchannel->offermsg.child_relid);
575546

576-
err_free_chan:
577547
free_channel(newchannel);
578548
}
579549

550+
/*
551+
* vmbus_process_offer - Process the offer by creating a channel/device
552+
* associated with this offer
553+
*/
554+
static void vmbus_process_offer(struct vmbus_channel *newchannel)
555+
{
556+
struct vmbus_channel *channel;
557+
struct workqueue_struct *wq;
558+
unsigned long flags;
559+
bool fnew = true;
560+
561+
mutex_lock(&vmbus_connection.channel_mutex);
562+
563+
/*
564+
* Now that we have acquired the channel_mutex,
565+
* we can release the potentially racing rescind thread.
566+
*/
567+
atomic_dec(&vmbus_connection.offer_in_progress);
568+
569+
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
570+
if (!uuid_le_cmp(channel->offermsg.offer.if_type,
571+
newchannel->offermsg.offer.if_type) &&
572+
!uuid_le_cmp(channel->offermsg.offer.if_instance,
573+
newchannel->offermsg.offer.if_instance)) {
574+
fnew = false;
575+
break;
576+
}
577+
}
578+
579+
if (fnew)
580+
list_add_tail(&newchannel->listentry,
581+
&vmbus_connection.chn_list);
582+
else {
583+
/*
584+
* Check to see if this is a valid sub-channel.
585+
*/
586+
if (newchannel->offermsg.offer.sub_channel_index == 0) {
587+
mutex_unlock(&vmbus_connection.channel_mutex);
588+
/*
589+
* Don't call free_channel(), because newchannel->kobj
590+
* is not initialized yet.
591+
*/
592+
kfree(newchannel);
593+
WARN_ON_ONCE(1);
594+
return;
595+
}
596+
/*
597+
* Process the sub-channel.
598+
*/
599+
newchannel->primary_channel = channel;
600+
spin_lock_irqsave(&channel->lock, flags);
601+
list_add_tail(&newchannel->sc_list, &channel->sc_list);
602+
spin_unlock_irqrestore(&channel->lock, flags);
603+
}
604+
605+
mutex_unlock(&vmbus_connection.channel_mutex);
606+
607+
/*
608+
* vmbus_process_offer() mustn't call channel->sc_creation_callback()
609+
* directly for sub-channels, because sc_creation_callback() ->
610+
* vmbus_open() may never get the host's response to the
611+
* OPEN_CHANNEL message (the host may rescind a channel at any time,
612+
* e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind()
613+
* may not wake up the vmbus_open() as it's blocked due to a non-zero
614+
* vmbus_connection.offer_in_progress, and finally we have a deadlock.
615+
*
616+
* The above is also true for primary channels, if the related device
617+
* drivers use sync probing mode by default.
618+
*
619+
* And, usually the handling of primary channels and sub-channels can
620+
* depend on each other, so we should offload them to different
621+
* workqueues to avoid possible deadlock, e.g. in sync-probing mode,
622+
* NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() ->
623+
* rtnl_lock(), and causes deadlock: the former gets the rtnl_lock
624+
* and waits for all the sub-channels to appear, but the latter
625+
* can't get the rtnl_lock and this blocks the handling of
626+
* sub-channels.
627+
*/
628+
INIT_WORK(&newchannel->add_channel_work, vmbus_add_channel_work);
629+
wq = fnew ? vmbus_connection.handle_primary_chan_wq :
630+
vmbus_connection.handle_sub_chan_wq;
631+
queue_work(wq, &newchannel->add_channel_work);
632+
}
633+
580634
/*
581635
* We use this state to statically distribute the channel interrupt load.
582636
*/
583637
static int next_numa_node_id;
638+
/*
639+
* init_vp_index() accesses global variables like next_numa_node_id, and
640+
* it can run concurrently for primary channels and sub-channels: see
641+
* vmbus_process_offer(), so we need the lock to protect the global
642+
* variables.
643+
*/
644+
static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
584645

585646
/*
586647
* Starting with Win8, we can statically distribute the incoming
@@ -618,6 +679,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
618679
return;
619680
}
620681

682+
spin_lock(&bind_channel_to_cpu_lock);
683+
621684
/*
622685
* Based on the channel affinity policy, we will assign the NUMA
623686
* nodes.
@@ -700,6 +763,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
700763
channel->target_cpu = cur_cpu;
701764
channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
702765

766+
spin_unlock(&bind_channel_to_cpu_lock);
767+
703768
free_cpumask_var(available_mask);
704769
}
705770

drivers/hv/connection.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,20 @@ int vmbus_connect(void)
161161
goto cleanup;
162162
}
163163

164+
vmbus_connection.handle_primary_chan_wq =
165+
create_workqueue("hv_pri_chan");
166+
if (!vmbus_connection.handle_primary_chan_wq) {
167+
ret = -ENOMEM;
168+
goto cleanup;
169+
}
170+
171+
vmbus_connection.handle_sub_chan_wq =
172+
create_workqueue("hv_sub_chan");
173+
if (!vmbus_connection.handle_sub_chan_wq) {
174+
ret = -ENOMEM;
175+
goto cleanup;
176+
}
177+
164178
INIT_LIST_HEAD(&vmbus_connection.chn_msg_list);
165179
spin_lock_init(&vmbus_connection.channelmsg_lock);
166180

@@ -251,10 +265,14 @@ void vmbus_disconnect(void)
251265
*/
252266
vmbus_initiate_unload(false);
253267

254-
if (vmbus_connection.work_queue) {
255-
drain_workqueue(vmbus_connection.work_queue);
268+
if (vmbus_connection.handle_sub_chan_wq)
269+
destroy_workqueue(vmbus_connection.handle_sub_chan_wq);
270+
271+
if (vmbus_connection.handle_primary_chan_wq)
272+
destroy_workqueue(vmbus_connection.handle_primary_chan_wq);
273+
274+
if (vmbus_connection.work_queue)
256275
destroy_workqueue(vmbus_connection.work_queue);
257-
}
258276

259277
if (vmbus_connection.int_page) {
260278
free_pages((unsigned long)vmbus_connection.int_page, 0);

drivers/hv/hyperv_vmbus.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,14 @@ struct vmbus_connection {
327327
struct list_head chn_list;
328328
struct mutex channel_mutex;
329329

330+
/*
331+
* An offer message is handled first on the work_queue, and then
332+
* is further handled on handle_primary_chan_wq or
333+
* handle_sub_chan_wq.
334+
*/
330335
struct workqueue_struct *work_queue;
336+
struct workqueue_struct *handle_primary_chan_wq;
337+
struct workqueue_struct *handle_sub_chan_wq;
331338
};
332339

333340

include/linux/hyperv.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,13 @@ struct vmbus_channel {
869869

870870
bool probe_done;
871871

872+
/*
873+
* We must offload the handling of the primary/sub channels
874+
* from the single-threaded vmbus_connection.work_queue to
875+
* two different workqueue, otherwise we can block
876+
* vmbus_connection.work_queue and hang: see vmbus_process_offer().
877+
*/
878+
struct work_struct add_channel_work;
872879
};
873880

874881
static inline bool is_hvsock_channel(const struct vmbus_channel *c)

0 commit comments

Comments
 (0)