Skip to content

Commit 808cec7

Browse files
XiaoNi87hailan94
authored and committed
md/raid1: serialize overlap io for writemostly disk
Previously, using wait_event() would wake up all waiters simultaneously, and they would compete for the tree lock. The bio that gets the lock first is handled first, so the write sequence cannot be guaranteed. For example: bio1(100,200), bio2(150,200), bio3(150,300). The write sequence on the fast device is bio1, bio2, bio3. But the write sequence on the slow device could be bio1, bio3, bio2 due to lock competition. This causes data corruption. Replace the waitqueue with a FIFO list to guarantee the write sequence. The list must also be iterated when removing an entry; otherwise, the opportunity to wake up a waiting io may be missed. For example: bio1(1,3), bio2(2,4), bio3(5,7), bio4(6,8). These four bios are in the same bucket. bio1 and bio3 are inserted into the rbtree. bio2 and bio4 are added to the waiting list, and bio2 is the first one. bio3 returns from the slow disk and tries to wake up the waiting bios. bio2 is removed from the list and will be handled. But bio1 hasn't finished, so bio2 is added to the waiting list again. Then bio1 returns from the slow disk and wakes up the waiting bios. bio4 is removed from the list and will be handled. Now bio1, bio3 and bio4 have all finished, and bio2 is left on the waiting list. So the waiting list needs to be iterated to wake up the right bio. Signed-off-by: Xiao Ni <xni@redhat.com> Link: https://lore.kernel.org/linux-raid/20260324072501.59865-1-xni@redhat.com/ Signed-off-by: Yu Kuai <yukuai@fnnas.com>
1 parent e92a532 commit 808cec7

3 files changed

Lines changed: 39 additions & 14 deletions

File tree

drivers/md/md.c

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -187,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev)
187187

188188
spin_lock_init(&serial_tmp->serial_lock);
189189
serial_tmp->serial_rb = RB_ROOT_CACHED;
190-
init_waitqueue_head(&serial_tmp->serial_io_wait);
191190
}
192191

193192
rdev->serial = serial;

drivers/md/md.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,6 @@ enum sync_action {
126126
struct serial_in_rdev {
127127
struct rb_root_cached serial_rb;
128128
spinlock_t serial_lock;
129-
wait_queue_head_t serial_io_wait;
130129
};
131130

132131
/*
@@ -381,7 +380,11 @@ struct serial_info {
381380
struct rb_node node;
382381
sector_t start; /* start sector of rb node */
383382
sector_t last; /* end sector of rb node */
383+
sector_t wnode_start; /* address of waiting nodes on the same list */
384384
sector_t _subtree_last; /* highest sector in subtree of rb node */
385+
struct list_head list_node;
386+
struct list_head waiters;
387+
struct completion ready;
385388
};
386389

387390
/*

drivers/md/raid1.c

Lines changed: 35 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
5757
START, LAST, static inline, raid1_rb);
5858

5959
static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
60-
struct serial_info *si, int idx)
60+
struct serial_info *si)
6161
{
6262
unsigned long flags;
6363
int ret = 0;
6464
sector_t lo = r1_bio->sector;
6565
sector_t hi = lo + r1_bio->sectors - 1;
66+
int idx = sector_to_idx(r1_bio->sector);
6667
struct serial_in_rdev *serial = &rdev->serial[idx];
68+
struct serial_info *head_si;
6769

6870
spin_lock_irqsave(&serial->serial_lock, flags);
6971
/* collision happened */
70-
if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
72+
head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
73+
if (head_si && head_si != si) {
74+
si->start = lo;
75+
si->last = hi;
76+
si->wnode_start = head_si->wnode_start;
77+
list_add_tail(&si->list_node, &head_si->waiters);
7178
ret = -EBUSY;
72-
else {
79+
} else if (!head_si) {
7380
si->start = lo;
7481
si->last = hi;
82+
si->wnode_start = si->start;
7583
raid1_rb_insert(si, &serial->serial_rb);
7684
}
7785
spin_unlock_irqrestore(&serial->serial_lock, flags);
@@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
8391
{
8492
struct mddev *mddev = rdev->mddev;
8593
struct serial_info *si;
86-
int idx = sector_to_idx(r1_bio->sector);
87-
struct serial_in_rdev *serial = &rdev->serial[idx];
8894

8995
if (WARN_ON(!mddev->serial_info_pool))
9096
return;
9197
si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
92-
wait_event(serial->serial_io_wait,
93-
check_and_add_serial(rdev, r1_bio, si, idx) == 0);
98+
INIT_LIST_HEAD(&si->waiters);
99+
INIT_LIST_HEAD(&si->list_node);
100+
init_completion(&si->ready);
101+
while (check_and_add_serial(rdev, r1_bio, si)) {
102+
wait_for_completion(&si->ready);
103+
reinit_completion(&si->ready);
104+
}
94105
}
95106

96107
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
97108
{
98-
struct serial_info *si;
109+
struct serial_info *si, *iter_si;
99110
unsigned long flags;
100111
int found = 0;
101112
struct mddev *mddev = rdev->mddev;
@@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
106117
for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
107118
si; si = raid1_rb_iter_next(si, lo, hi)) {
108119
if (si->start == lo && si->last == hi) {
109-
raid1_rb_remove(si, &serial->serial_rb);
110-
mempool_free(si, mddev->serial_info_pool);
111120
found = 1;
112121
break;
113122
}
114123
}
115-
if (!found)
124+
if (found) {
125+
raid1_rb_remove(si, &serial->serial_rb);
126+
if (!list_empty(&si->waiters)) {
127+
list_for_each_entry(iter_si, &si->waiters, list_node) {
128+
if (iter_si->wnode_start == si->wnode_start) {
129+
list_del_init(&iter_si->list_node);
130+
list_splice_init(&si->waiters, &iter_si->waiters);
131+
raid1_rb_insert(iter_si, &serial->serial_rb);
132+
complete(&iter_si->ready);
133+
break;
134+
}
135+
}
136+
}
137+
mempool_free(si, mddev->serial_info_pool);
138+
} else {
116139
WARN(1, "The write IO is not recorded for serialization\n");
140+
}
117141
spin_unlock_irqrestore(&serial->serial_lock, flags);
118-
wake_up(&serial->serial_io_wait);
119142
}
120143

121144
/*

0 commit comments

Comments
 (0)