Skip to content

Commit 1484d4f

Browse files
jthornber authored and gregkh committed
dm thin metadata: try to avoid ever aborting transactions
[ Upstream commit 3ab9182 ] Committing a transaction can consume some metadata of its own; we now reserve a small amount of metadata to cover this. Free metadata reported by the kernel will not include this reserve. If any of the reserve has been used after a commit we enter a new internal state PM_OUT_OF_METADATA_SPACE. This is reported as PM_READ_ONLY, so no userland changes are needed. If the metadata device is resized the pool will move back to PM_WRITE. These changes mean we never need to abort and rollback a transaction due to running out of metadata space. This is particularly important because there have been a handful of reports of data corruption against DM thin-provisioning that can all be attributed to the thin-pool having run out of metadata space. Signed-off-by: Joe Thornber <ejt@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 1e9054e commit 1484d4f

2 files changed

Lines changed: 100 additions & 9 deletions

File tree

drivers/md/dm-thin-metadata.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,12 @@ struct dm_pool_metadata {
188188
unsigned long flags;
189189
sector_t data_block_size;
190190

191+
/*
192+
* We reserve a section of the metadata for commit overhead.
193+
* All reported space does *not* include this.
194+
*/
195+
dm_block_t metadata_reserve;
196+
191197
/*
192198
* Set if a transaction has to be aborted but the attempt to roll back
193199
* to the previous (good) transaction failed. The only pool metadata
@@ -825,6 +831,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
825831
return dm_tm_commit(pmd->tm, sblock);
826832
}
827833

834+
static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
835+
{
836+
int r;
837+
dm_block_t total;
838+
dm_block_t max_blocks = 4096; /* 16M */
839+
840+
r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
841+
if (r) {
842+
DMERR("could not get size of metadata device");
843+
pmd->metadata_reserve = max_blocks;
844+
} else {
845+
sector_div(total, 10);
846+
pmd->metadata_reserve = min(max_blocks, total);
847+
}
848+
}
849+
828850
struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
829851
sector_t data_block_size,
830852
bool format_device)
@@ -858,6 +880,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
858880
return ERR_PTR(r);
859881
}
860882

883+
__set_metadata_reserve(pmd);
884+
861885
return pmd;
862886
}
863887

@@ -1829,6 +1853,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
18291853
down_read(&pmd->root_lock);
18301854
if (!pmd->fail_io)
18311855
r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1856+
1857+
if (!r) {
1858+
if (*result < pmd->metadata_reserve)
1859+
*result = 0;
1860+
else
1861+
*result -= pmd->metadata_reserve;
1862+
}
18321863
up_read(&pmd->root_lock);
18331864

18341865
return r;
@@ -1941,8 +1972,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
19411972
int r = -EINVAL;
19421973

19431974
down_write(&pmd->root_lock);
1944-
if (!pmd->fail_io)
1975+
if (!pmd->fail_io) {
19451976
r = __resize_space_map(pmd->metadata_sm, new_count);
1977+
if (!r)
1978+
__set_metadata_reserve(pmd);
1979+
}
19461980
up_write(&pmd->root_lock);
19471981

19481982
return r;

drivers/md/dm-thin.c

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,13 @@ struct dm_thin_new_mapping;
200200
/*
 * Operating modes of a pool.
 *
 * The declaration order is significant: commit() and pool_message() test
 * "get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE" and refuse to proceed
 * for every mode from PM_OUT_OF_METADATA_SPACE onwards, so new modes must
 * be inserted with that ordering in mind.
 */
enum pool_mode {
201201
PM_WRITE, /* metadata may be changed */
202202
PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
203+
204+
/*
205+
* Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
206+
*/
207+
PM_OUT_OF_METADATA_SPACE,
203208
PM_READ_ONLY, /* metadata may not be changed */
209+
204210
PM_FAIL, /* all I/O fails */
205211
};
206212

@@ -1382,7 +1388,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
13821388

13831389
static void requeue_bios(struct pool *pool);
13841390

1385-
static void check_for_space(struct pool *pool)
1391+
static bool is_read_only_pool_mode(enum pool_mode mode)
1392+
{
1393+
return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
1394+
}
1395+
1396+
static bool is_read_only(struct pool *pool)
1397+
{
1398+
return is_read_only_pool_mode(get_pool_mode(pool));
1399+
}
1400+
1401+
static void check_for_metadata_space(struct pool *pool)
1402+
{
1403+
int r;
1404+
const char *ooms_reason = NULL;
1405+
dm_block_t nr_free;
1406+
1407+
r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
1408+
if (r)
1409+
ooms_reason = "Could not get free metadata blocks";
1410+
else if (!nr_free)
1411+
ooms_reason = "No free metadata blocks";
1412+
1413+
if (ooms_reason && !is_read_only(pool)) {
1414+
DMERR("%s", ooms_reason);
1415+
set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
1416+
}
1417+
}
1418+
1419+
static void check_for_data_space(struct pool *pool)
13861420
{
13871421
int r;
13881422
dm_block_t nr_free;
@@ -1408,14 +1442,16 @@ static int commit(struct pool *pool)
14081442
{
14091443
int r;
14101444

1411-
if (get_pool_mode(pool) >= PM_READ_ONLY)
1445+
if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
14121446
return -EINVAL;
14131447

14141448
r = dm_pool_commit_metadata(pool->pmd);
14151449
if (r)
14161450
metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1417-
else
1418-
check_for_space(pool);
1451+
else {
1452+
check_for_metadata_space(pool);
1453+
check_for_data_space(pool);
1454+
}
14191455

14201456
return r;
14211457
}
@@ -1481,6 +1517,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
14811517
return r;
14821518
}
14831519

1520+
r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
1521+
if (r) {
1522+
metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
1523+
return r;
1524+
}
1525+
1526+
if (!free_blocks) {
1527+
/* Let's commit before we use up the metadata reserve. */
1528+
r = commit(pool);
1529+
if (r)
1530+
return r;
1531+
}
1532+
14841533
return 0;
14851534
}
14861535

@@ -1512,6 +1561,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool)
15121561
case PM_OUT_OF_DATA_SPACE:
15131562
return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
15141563

1564+
case PM_OUT_OF_METADATA_SPACE:
15151565
case PM_READ_ONLY:
15161566
case PM_FAIL:
15171567
return BLK_STS_IOERR;
@@ -2475,8 +2525,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
24752525
error_retry_list(pool);
24762526
break;
24772527

2528+
case PM_OUT_OF_METADATA_SPACE:
24782529
case PM_READ_ONLY:
2479-
if (old_mode != new_mode)
2530+
if (!is_read_only_pool_mode(old_mode))
24802531
notify_of_pool_mode_change(pool, "read-only");
24812532
dm_pool_metadata_read_only(pool->pmd);
24822533
pool->process_bio = process_bio_read_only;
@@ -3412,6 +3463,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
34123463
DMINFO("%s: growing the metadata device from %llu to %llu blocks",
34133464
dm_device_name(pool->pool_md),
34143465
sb_metadata_dev_size, metadata_dev_size);
3466+
3467+
if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3468+
set_pool_mode(pool, PM_WRITE);
3469+
34153470
r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
34163471
if (r) {
34173472
metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
@@ -3715,7 +3770,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
37153770
struct pool_c *pt = ti->private;
37163771
struct pool *pool = pt->pool;
37173772

3718-
if (get_pool_mode(pool) >= PM_READ_ONLY) {
3773+
if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
37193774
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
37203775
dm_device_name(pool->pool_md));
37213776
return -EOPNOTSUPP;
@@ -3789,6 +3844,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
37893844
dm_block_t nr_blocks_data;
37903845
dm_block_t nr_blocks_metadata;
37913846
dm_block_t held_root;
3847+
enum pool_mode mode;
37923848
char buf[BDEVNAME_SIZE];
37933849
char buf2[BDEVNAME_SIZE];
37943850
struct pool_c *pt = ti->private;
@@ -3859,9 +3915,10 @@ static void pool_status(struct dm_target *ti, status_type_t type,
38593915
else
38603916
DMEMIT("- ");
38613917

3862-
if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
3918+
mode = get_pool_mode(pool);
3919+
if (mode == PM_OUT_OF_DATA_SPACE)
38633920
DMEMIT("out_of_data_space ");
3864-
else if (pool->pf.mode == PM_READ_ONLY)
3921+
else if (is_read_only_pool_mode(mode))
38653922
DMEMIT("ro ");
38663923
else
38673924
DMEMIT("rw ");

0 commit comments

Comments
 (0)