
Commit 2628bd6

yhuang-intel authored and torvalds committed
mm, swap: fix race between swap count continuation operations
One page of the sis->swap_map (swap_info_struct->swap_map) may hold a set of entries that spans multiple swap clusters. If some of those entries have sis->swap_map[offset] > SWAP_MAP_MAX, additional pages are used to store the continuation counts for that set of entries, and these pages are linked through page->lru. This is called swap count continuation.

Previously, sis->lock was used to serialize access to the pages that store the continuation counts. But to improve the scalability of __swap_duplicate(), swap_count_continued() may now use the swap cluster lock instead. This can race with add_swap_count_continuation() operating on a nearby swap cluster whose sis->swap_map entries are stored in the same page.

The race can produce wrong swap counts in practice, which in turn can cause unfreeable swap entries, software lockups, etc.

To fix the race, a new spin lock called cont_lock is added to struct swap_info_struct to protect the swap count continuation page list. This is a lock at the swap device level, so its scalability is not great, but it is still much better than the original sis->lock, because it is only acquired/released when swap count continuation is used, which is rare in practice. If scalability turns out to be an issue for some workloads, the lock can be split into finer-grained locks.

Link: http://lkml.kernel.org/r/20171017081320.28133-1-ying.huang@intel.com
Fixes: 235b621 ("mm/swap: add cluster lock")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org> [4.11+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
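To make the race concrete: two swap entries in different swap clusters can have their swap_map counts stored in the same page, so their continuation pages hang off the same page->lru list, and two CPUs holding two different cluster locks can still manipulate that one list concurrently. Below is a minimal userspace sketch of the locking pattern the fix relies on, assuming illustrative names (dev_cont_lock, cluster_lock, shared_continuation_count, worker) that stand in for si->cont_lock, the per-cluster locks, and the shared continuation page list; it is a model of the idea, not kernel code. Build with gcc -pthread.

/*
 * Minimal userspace model of the locking pattern (illustrative names,
 * not kernel identifiers).  Each thread holds a *different* per-cluster
 * lock, so only the device-level lock serializes the shared state.
 */
#include <pthread.h>
#include <stdio.h>

#define NCLUSTERS 2
#define ITERS     100000

/* Per-cluster locks: each worker takes a different one. */
static pthread_mutex_t cluster_lock[NCLUSTERS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};
/* Device-level lock, modelling si->cont_lock. */
static pthread_mutex_t dev_cont_lock = PTHREAD_MUTEX_INITIALIZER;

/* State shared by both clusters, modelling one continuation page list. */
static long shared_continuation_count;

static void *worker(void *arg)
{
        int cluster = (int)(long)arg;

        for (int i = 0; i < ITERS; i++) {
                pthread_mutex_lock(&cluster_lock[cluster]);
                /*
                 * The cluster lock alone does not exclude the other
                 * thread, which holds a different cluster lock.  Only
                 * the device-level lock serializes these updates.
                 */
                pthread_mutex_lock(&dev_cont_lock);
                shared_continuation_count++;
                pthread_mutex_unlock(&dev_cont_lock);
                pthread_mutex_unlock(&cluster_lock[cluster]);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[NCLUSTERS];

        for (long c = 0; c < NCLUSTERS; c++)
                pthread_create(&t[c], NULL, worker, (void *)c);
        for (int c = 0; c < NCLUSTERS; c++)
                pthread_join(t[c], NULL);

        /*
         * With dev_cont_lock the result is exact; dropping it
         * reintroduces the lost-update race the commit describes.
         */
        printf("count = %ld (expected %d)\n",
               shared_continuation_count, NCLUSTERS * ITERS);
        return 0;
}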
1 parent dd8a67f commit 2628bd6

2 files changed

Lines changed: 21 additions & 6 deletions


include/linux/swap.h

Lines changed: 4 additions & 0 deletions
@@ -266,6 +266,10 @@ struct swap_info_struct {
                                         * both locks need hold, hold swap_lock
                                         * first.
                                         */
+       spinlock_t cont_lock;           /*
+                                        * protect swap count continuation page
+                                        * list.
+                                        */
        struct work_struct discard_work; /* discard worker */
        struct swap_cluster_list discard_clusters; /* discard clusters list */
 };

mm/swapfile.c

Lines changed: 17 additions & 6 deletions
@@ -2869,6 +2869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
        p->flags = SWP_USED;
        spin_unlock(&swap_lock);
        spin_lock_init(&p->lock);
+       spin_lock_init(&p->cont_lock);

        return p;
 }
@@ -3545,6 +3546,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        head = vmalloc_to_page(si->swap_map + offset);
        offset &= ~PAGE_MASK;

+       spin_lock(&si->cont_lock);
        /*
         * Page allocation does not initialize the page's lru field,
         * but it does always reset its private field.
@@ -3564,7 +3566,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
         * a continuation page, free our allocation and use this one.
         */
        if (!(count & COUNT_CONTINUED))
-               goto out;
+               goto out_unlock_cont;

        map = kmap_atomic(list_page) + offset;
        count = *map;
@@ -3575,11 +3577,13 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
         * free our allocation and use this one.
         */
        if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
-               goto out;
+               goto out_unlock_cont;
        }

        list_add_tail(&page->lru, &head->lru);
        page = NULL;                    /* now it's attached, don't free it */
+out_unlock_cont:
+       spin_unlock(&si->cont_lock);
 out:
        unlock_cluster(ci);
        spin_unlock(&si->lock);
@@ -3604,13 +3608,15 @@ static bool swap_count_continued(struct swap_info_struct *si,
        struct page *head;
        struct page *page;
        unsigned char *map;
+       bool ret;

        head = vmalloc_to_page(si->swap_map + offset);
        if (page_private(head) != SWP_CONTINUED) {
                BUG_ON(count & COUNT_CONTINUED);
                return false;           /* need to add count continuation */
        }

+       spin_lock(&si->cont_lock);
        offset &= ~PAGE_MASK;
        page = list_entry(head->lru.next, struct page, lru);
        map = kmap_atomic(page) + offset;
@@ -3631,8 +3637,10 @@ static bool swap_count_continued(struct swap_info_struct *si,
                if (*map == SWAP_CONT_MAX) {
                        kunmap_atomic(map);
                        page = list_entry(page->lru.next, struct page, lru);
-                       if (page == head)
-                               return false;   /* add count continuation */
+                       if (page == head) {
+                               ret = false;    /* add count continuation */
+                               goto out;
+                       }
                        map = kmap_atomic(page) + offset;
init_map:               *map = 0;               /* we didn't zero the page */
                }
@@ -3645,7 +3653,7 @@ init_map: *map = 0; /* we didn't zero the page */
                        kunmap_atomic(map);
                        page = list_entry(page->lru.prev, struct page, lru);
                }
-               return true;                    /* incremented */
+               ret = true;                     /* incremented */

        } else {                                /* decrementing */
                /*
@@ -3671,8 +3679,11 @@ init_map: *map = 0; /* we didn't zero the page */
                        kunmap_atomic(map);
                        page = list_entry(page->lru.prev, struct page, lru);
                }
-               return count == COUNT_CONTINUED;
+               ret = count == COUNT_CONTINUED;
        }
+out:
+       spin_unlock(&si->cont_lock);
+       return ret;
 }

 /*
