Skip to content

Commit b29ea3c

Browse files
djbw authored and gregkh committed
mm: introduce get_user_pages_longterm
commit 2bb6d28 upstream. Patch series "introduce get_user_pages_longterm()", v2. Here is a new get_user_pages api for cases where a driver intends to keep an elevated page count indefinitely. This is distinct from usages like iov_iter_get_pages where the elevated page counts are transient. The iov_iter_get_pages cases immediately turn around and submit the pages to a device driver which will put_page when the i/o operation completes (under kernel control). In the longterm case userspace is responsible for dropping the page reference at some undefined point in the future. This is untenable for the filesystem-dax case where the filesystem is in control of the lifetime of the block / page and needs reasonable limits on how long it can wait for pages in a mapping to become idle. Fixing filesystems to actually wait for dax pages to be idle before blocks from a truncate/hole-punch operation are repurposed is saved for a later patch series. Also, allowing longterm registration of dax mappings is a future patch series that introduces a "map with lease" semantic where the kernel can revoke a lease and force userspace to drop its page references. I have also tagged these for -stable to purposely break cases that might assume that longterm memory registrations for filesystem-dax mappings were supported by the kernel. The behavior regression this policy change implies is one of the reasons we maintain the "dax enabled. Warning: EXPERIMENTAL, use at your own risk" notification when mounting a filesystem in dax mode. It is worth noting the device-dax interface does not suffer the same constraints since it does not support file space management operations like hole-punch. This patch (of 4): Until there is a solution to the dma-to-dax vs truncate problem it is not safe to allow long standing memory registrations against filesystem-dax vmas. Device-dax vmas do not have this problem and are explicitly allowed. 
This is temporary until a "memory registration with layout-lease" mechanism can be implemented for the affected sub-systems (RDMA and V4L2). [akpm@linux-foundation.org: use kcalloc()] Link: http://lkml.kernel.org/r/151068939435.7446.13560129395419350737.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 3565fce ("mm, x86: get_user_pages() for dax mappings") Signed-off-by: Dan Williams <dan.j.williams@intel.com> Suggested-by: Christoph Hellwig <hch@lst.de> Cc: Doug Ledford <dledford@redhat.com> Cc: Hal Rosenstock <hal.rosenstock@gmail.com> Cc: Inki Dae <inki.dae@samsung.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Gunthorpe <jgg@mellanox.com> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Joonyoung Shim <jy0922.shim@samsung.com> Cc: Kyungmin Park <kyungmin.park@samsung.com> Cc: Mauro Carvalho Chehab <mchehab@kernel.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Cc: Sean Hefty <sean.hefty@intel.com> Cc: Seung-Woo Kim <sw0312.kim@samsung.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent be38759 commit b29ea3c

4 files changed

Lines changed: 97 additions & 5 deletions

File tree

include/linux/dax.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,6 @@ static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
6161
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
6262
#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
6363

64-
static inline bool vma_is_dax(struct vm_area_struct *vma)
65-
{
66-
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
67-
}
68-
6964
static inline bool dax_mapping(struct address_space *mapping)
7065
{
7166
return mapping->host && IS_DAX(mapping->host);

include/linux/fs.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <linux/bug.h>
1919
#include <linux/mutex.h>
2020
#include <linux/rwsem.h>
21+
#include <linux/mm_types.h>
2122
#include <linux/capability.h>
2223
#include <linux/semaphore.h>
2324
#include <linux/fiemap.h>
@@ -3033,6 +3034,25 @@ static inline bool io_is_direct(struct file *filp)
30333034
return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
30343035
}
30353036

3037+
static inline bool vma_is_dax(struct vm_area_struct *vma)
3038+
{
3039+
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
3040+
}
3041+
3042+
static inline bool vma_is_fsdax(struct vm_area_struct *vma)
3043+
{
3044+
struct inode *inode;
3045+
3046+
if (!vma->vm_file)
3047+
return false;
3048+
if (!vma_is_dax(vma))
3049+
return false;
3050+
inode = file_inode(vma->vm_file);
3051+
if (inode->i_mode == S_IFCHR)
3052+
return false; /* device-dax */
3053+
return true;
3054+
}
3055+
30363056
static inline int iocb_flags(struct file *file)
30373057
{
30383058
int res = 0;

include/linux/mm.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1288,6 +1288,19 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
12881288
struct page **pages, unsigned int gup_flags);
12891289
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
12901290
struct page **pages, unsigned int gup_flags);
1291+
#ifdef CONFIG_FS_DAX
1292+
long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1293+
unsigned int gup_flags, struct page **pages,
1294+
struct vm_area_struct **vmas);
1295+
#else
1296+
/*
 * Fallback used when CONFIG_FS_DAX is not set: no extra vma validation is
 * performed and the request is forwarded unchanged to get_user_pages().
 */
static inline long get_user_pages_longterm(unsigned long start,
		unsigned long nr_pages, unsigned int gup_flags,
		struct page **pages, struct vm_area_struct **vmas)
{
	long ret;

	ret = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
	return ret;
}
1302+
#endif /* CONFIG_FS_DAX */
1303+
12911304
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
12921305
struct page **pages);
12931306

mm/gup.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -982,6 +982,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
982982
}
983983
EXPORT_SYMBOL(get_user_pages);
984984

985+
#ifdef CONFIG_FS_DAX
986+
/*
987+
* This is the same as get_user_pages() in that it assumes we are
988+
* operating on the current task's mm, but it goes further to validate
989+
* that the vmas associated with the address range are suitable for
990+
* longterm elevated page reference counts. For example, filesystem-dax
991+
* mappings are subject to the lifetime enforced by the filesystem and
992+
* we need guarantees that longterm users like RDMA and V4L2 only
993+
* establish mappings that have a kernel enforced revocation mechanism.
994+
*
995+
* "longterm" == userspace controlled elevated page count lifetime.
996+
* Contrast this to iov_iter_get_pages() usages which are transient.
997+
*/
998+
long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
999+
unsigned int gup_flags, struct page **pages,
1000+
struct vm_area_struct **vmas_arg)
1001+
{
1002+
struct vm_area_struct **vmas = vmas_arg;
1003+
struct vm_area_struct *vma_prev = NULL;
1004+
long rc, i;
1005+
1006+
if (!pages)
1007+
return -EINVAL;
1008+
1009+
if (!vmas) {
1010+
vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
1011+
GFP_KERNEL);
1012+
if (!vmas)
1013+
return -ENOMEM;
1014+
}
1015+
1016+
rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1017+
1018+
for (i = 0; i < rc; i++) {
1019+
struct vm_area_struct *vma = vmas[i];
1020+
1021+
if (vma == vma_prev)
1022+
continue;
1023+
1024+
vma_prev = vma;
1025+
1026+
if (vma_is_fsdax(vma))
1027+
break;
1028+
}
1029+
1030+
/*
1031+
* Either get_user_pages() failed, or the vma validation
1032+
* succeeded, in either case we don't need to put_page() before
1033+
* returning.
1034+
*/
1035+
if (i >= rc)
1036+
goto out;
1037+
1038+
for (i = 0; i < rc; i++)
1039+
put_page(pages[i]);
1040+
rc = -EOPNOTSUPP;
1041+
out:
1042+
if (vmas != vmas_arg)
1043+
kfree(vmas);
1044+
return rc;
1045+
}
1046+
EXPORT_SYMBOL(get_user_pages_longterm);
1047+
#endif /* CONFIG_FS_DAX */
1048+
9851049
/**
9861050
* populate_vma_page_range() - populate a range of pages in the vma.
9871051
* @vma: target vma

0 commit comments

Comments
 (0)