fs/userfaultfd.c at 4e5591c2fc1b30f4ea5e2eab4c3a695acc404e39

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / userfaultfd.c
at 4e5591c2fc1b30f4ea5e2eab4c3a695acc404e39 2231 lines 58 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  fs/userfaultfd.c
   4 *
   5 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
   6 *  Copyright (C) 2008-2009 Red Hat, Inc.
   7 *  Copyright (C) 2015  Red Hat, Inc.
   8 *
   9 *  Some part derived from fs/eventfd.c (anon inode setup) and
  10 *  mm/ksm.c (mm hashing).
  11 */
  12
  13#include <linux/list.h>
  14#include <linux/hashtable.h>
  15#include <linux/sched/signal.h>
  16#include <linux/sched/mm.h>
  17#include <linux/mm.h>
  18#include <linux/mm_inline.h>
  19#include <linux/mmu_notifier.h>
  20#include <linux/poll.h>
  21#include <linux/slab.h>
  22#include <linux/seq_file.h>
  23#include <linux/file.h>
  24#include <linux/bug.h>
  25#include <linux/anon_inodes.h>
  26#include <linux/syscalls.h>
  27#include <linux/userfaultfd_k.h>
  28#include <linux/mempolicy.h>
  29#include <linux/ioctl.h>
  30#include <linux/security.h>
  31#include <linux/hugetlb.h>
  32#include <linux/leafops.h>
  33#include <linux/miscdevice.h>
  34#include <linux/uio.h>
  35
/* Value backing the "unprivileged_userfaultfd" sysctl entry defined below. */
static int sysctl_unprivileged_userfaultfd __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * sysctl table with a single integer entry, "unprivileged_userfaultfd",
 * which reads/writes sysctl_unprivileged_userfaultfd through
 * proc_dointvec_minmax with SYSCTL_ZERO and SYSCTL_ONE as its bounds.
 */
static const struct ctl_table vm_userfaultfd_table[] = {
	{
		.procname	= "unprivileged_userfaultfd",
		.data		= &sysctl_unprivileged_userfaultfd,
		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};
#endif

/* Slab cache that struct userfaultfd_ctx objects are allocated from and freed to. */
static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
  53
/*
 * Pairs a userfaultfd context found on a vma being duplicated with the
 * new context created for it by dup_userfaultfd().
 */
struct userfaultfd_fork_ctx {
	/* Context originally attached to the vma; a reference is held on it. */
	struct userfaultfd_ctx *orig;
	/* Context allocated by dup_userfaultfd() to replace @orig on the vma. */
	struct userfaultfd_ctx *new;
	/* Link in the list passed to dup_userfaultfd() as @fcs. */
	struct list_head list;
};
  59
/*
 * One pending UFFD_EVENT_UNMAP notification, queued by
 * userfaultfd_unmap_prep() and delivered by userfaultfd_unmap_complete().
 */
struct userfaultfd_unmap_ctx {
	/* Context to notify; a reference is held on it while queued. */
	struct userfaultfd_ctx *ctx;
	/* Start of the range, reported as remove.start in the event. */
	unsigned long start;
	/* End of the range, reported as remove.end in the event. */
	unsigned long end;
	/* Link in the caller-provided list of pending unmap notifications. */
	struct list_head list;
};
  66
/*
 * Per-waiter state, placed on the waiter's kernel stack, used both for
 * page faults and for non-pagefault events.
 */
struct userfaultfd_wait_queue {
	/* Message copied out to the reader of the userfaultfd. */
	struct uffd_msg msg;
	/* Entry queued on one of the context's wait queue heads. */
	wait_queue_entry_t wq;
	/* Context this waiter belongs to. */
	struct userfaultfd_ctx *ctx;
	/* Set to true by userfaultfd_wake_function() before waking the task. */
	bool waken;
};
  73
/*
 * Key passed to userfaultfd_wake_function() to select which waiters to
 * wake: those whose fault address lies in [start, start + len).
 */
struct userfaultfd_wake_range {
	/* First address of the range. */
	unsigned long start;
	/* Length of the range; 0 means wake every waiter. */
	unsigned long len;
};
  78
  79/* internal indication that UFFD_API ioctl was successfully executed */
  80#define UFFD_FEATURE_INITIALIZED		(1u << 31)
  81
  82static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
  83{
  84	return ctx->features & UFFD_FEATURE_INITIALIZED;
  85}
  86
  87static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
  88{
  89	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
  90}
  91
  92/*
  93 * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
  94 * meaningful when userfaultfd_wp()==true on the vma and when it's
  95 * anonymous.
  96 */
  97bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
  98{
  99	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
 100
 101	if (!ctx)
 102		return false;
 103
 104	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
 105}
 106
 107static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
 108				     int wake_flags, void *key)
 109{
 110	struct userfaultfd_wake_range *range = key;
 111	int ret;
 112	struct userfaultfd_wait_queue *uwq;
 113	unsigned long start, len;
 114
 115	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 116	ret = 0;
 117	/* len == 0 means wake all */
 118	start = range->start;
 119	len = range->len;
 120	if (len && (start > uwq->msg.arg.pagefault.address ||
 121		    start + len <= uwq->msg.arg.pagefault.address))
 122		goto out;
 123	WRITE_ONCE(uwq->waken, true);
 124	/*
 125	 * The Program-Order guarantees provided by the scheduler
 126	 * ensure uwq->waken is visible before the task is woken.
 127	 */
 128	ret = wake_up_state(wq->private, mode);
 129	if (ret) {
 130		/*
 131		 * Wake only once, autoremove behavior.
 132		 *
 133		 * After the effect of list_del_init is visible to the other
 134		 * CPUs, the waitqueue may disappear from under us, see the
 135		 * !list_empty_careful() in handle_userfault().
 136		 *
 137		 * try_to_wake_up() has an implicit smp_mb(), and the
 138		 * wq->private is read before calling the extern function
 139		 * "wake_up_state" (which in turns calls try_to_wake_up).
 140		 */
 141		list_del_init(&wq->entry);
 142	}
 143out:
 144	return ret;
 145}
 146
 147/**
 148 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 149 * context.
 150 * @ctx: [in] Pointer to the userfaultfd context.
 151 */
 152static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
 153{
 154	refcount_inc(&ctx->refcount);
 155}
 156
 157/**
 158 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 159 * context.
 160 * @ctx: [in] Pointer to userfaultfd context.
 161 *
 162 * The userfaultfd context reference must have been previously acquired either
 163 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 164 */
 165static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
 166{
 167	if (refcount_dec_and_test(&ctx->refcount)) {
 168		VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock));
 169		VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh));
 170		VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock));
 171		VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh));
 172		VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock));
 173		VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh));
 174		VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock));
 175		VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh));
 176		mmdrop(ctx->mm);
 177		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
 178	}
 179}
 180
 181static inline void msg_init(struct uffd_msg *msg)
 182{
 183	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
 184	/*
 185	 * Must use memset to zero out the paddings or kernel data is
 186	 * leaked to userland.
 187	 */
 188	memset(msg, 0, sizeof(struct uffd_msg));
 189}
 190
 191static inline struct uffd_msg userfault_msg(unsigned long address,
 192					    unsigned long real_address,
 193					    unsigned int flags,
 194					    unsigned long reason,
 195					    unsigned int features)
 196{
 197	struct uffd_msg msg;
 198
 199	msg_init(&msg);
 200	msg.event = UFFD_EVENT_PAGEFAULT;
 201
 202	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
 203				    real_address : address;
 204
 205	/*
 206	 * These flags indicate why the userfault occurred:
 207	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
 208	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
 209	 * - Neither of these flags being set indicates a MISSING fault.
 210	 *
 211	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
 212	 * fault. Otherwise, it was a read fault.
 213	 */
 214	if (flags & FAULT_FLAG_WRITE)
 215		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
 216	if (reason & VM_UFFD_WP)
 217		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
 218	if (reason & VM_UFFD_MINOR)
 219		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
 220	if (features & UFFD_FEATURE_THREAD_ID)
 221		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 222	return msg;
 223}
 224
 225#ifdef CONFIG_HUGETLB_PAGE
 226/*
 227 * Same functionality as userfaultfd_must_wait below with modifications for
 228 * hugepmd ranges.
 229 */
 230static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 231					      struct vm_fault *vmf,
 232					      unsigned long reason)
 233{
 234	struct vm_area_struct *vma = vmf->vma;
 235	pte_t *ptep, pte;
 236
 237	assert_fault_locked(vmf);
 238
 239	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
 240	if (!ptep)
 241		return true;
 242
 243	pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
 244
 245	/*
 246	 * Lockless access: we're in a wait_event so it's ok if it
 247	 * changes under us.
 248	 */
 249
 250	/* Entry is still missing, wait for userspace to resolve the fault. */
 251	if (huge_pte_none(pte))
 252		return true;
 253	/* UFFD PTE markers require userspace to resolve the fault. */
 254	if (pte_is_uffd_marker(pte))
 255		return true;
 256	/*
 257	 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
 258	 * resolve the fault.
 259	 */
 260	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
 261		return true;
 262
 263	return false;
 264}
 265#else
 266static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 267					      struct vm_fault *vmf,
 268					      unsigned long reason)
 269{
 270	/* Should never get here. */
 271	VM_WARN_ON_ONCE(1);
 272	return false;
 273}
 274#endif /* CONFIG_HUGETLB_PAGE */
 275
/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read_iter and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 *
 * @ctx:    userfaultfd context; its mm is the one whose page tables are walked
 * @vmf:    fault being handled; only vmf->address is used for the walk
 * @reason: VM_UFFD_* reason of the fault; only VM_UFFD_WP is tested here
 *
 * Returns true if the faulting task still has to wait for userspace,
 * false if the page tables show the fault no longer needs to wait.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 struct vm_fault *vmf,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	unsigned long address = vmf->address;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pte_t ptent;
	bool ret;

	assert_fault_locked(vmf);

	/* A non-present upper level means nothing is mapped yet: wait. */
	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return true;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return true;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return true;
	pmd = pmd_offset(pud, address);
again:
	/* Work on a snapshot of the pmd; it is re-read on every retry. */
	_pmd = pmdp_get_lockless(pmd);
	if (pmd_none(_pmd))
		return true;

	/*
	 * A race could arise which would result in a softleaf entry such as
	 * migration entry unexpectedly being present in the PMD, so explicitly
	 * check for this and bail out if so.
	 */
	if (!pmd_present(_pmd))
		return false;

	/* Huge pmd: only a WP fault on a non-writable pmd must wait. */
	if (pmd_trans_huge(_pmd))
		return !pmd_write(_pmd) && (reason & VM_UFFD_WP);

	/* If the pte table cannot be mapped, re-read the pmd and retry. */
	pte = pte_offset_map(pmd, address);
	if (!pte)
		goto again;

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	ptent = ptep_get(pte);

	ret = true;
	/* Entry is still missing, wait for userspace to resolve the fault. */
	if (pte_none(ptent))
		goto out;
	/* UFFD PTE markers require userspace to resolve the fault. */
	if (pte_is_uffd_marker(ptent))
		goto out;
	/*
	 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
	 * resolve the fault.
	 */
	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
		goto out;

	ret = false;
out:
	/* Pairs with the successful pte_offset_map() above. */
	pte_unmap(pte);
	return ret;
}
 354
 355static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
 356{
 357	if (flags & FAULT_FLAG_INTERRUPTIBLE)
 358		return TASK_INTERRUPTIBLE;
 359
 360	if (flags & FAULT_FLAG_KILLABLE)
 361		return TASK_KILLABLE;
 362
 363	return TASK_UNINTERRUPTIBLE;
 364}
 365
/*
 * handle_userfault - queue the current task on the userfaultfd context of
 * the faulting vma and block it until the fault is woken.
 * @vmf:    the fault being handled
 * @reason: VM_UFFD_* reason; exactly one recognized flag is expected
 *
 * Returns VM_FAULT_SIGBUS when the fault cannot be handed to userspace
 * (task exiting or dumping core, no context on the vma,
 * UFFD_FEATURE_SIGBUS, a non-user fault on a UFFD_USER_MODE_ONLY context,
 * or FAULT_FLAG_ALLOW_RETRY missing), VM_FAULT_RETRY otherwise.
 *
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	unsigned int blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update
	 * and when coredumping (faults triggered by get_dump_page()).
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	assert_fault_locked(vmf);

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	VM_WARN_ON_ONCE(ctx->mm != mm);

	/* Any unrecognized flag is a bug. */
	VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS);
	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
	VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));

	/* SIGBUS mode: never block, report the fault as SIGBUS. */
	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;
	/* Faults without FAULT_FLAG_USER are refused on a user-mode-only context. */
	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
		goto out;

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n",
				vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * If a concurrent release is detected, do not return
		 * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
		 * return VM_FAULT_RETRY with lock released proactively.
		 *
		 * If we were to return VM_FAULT_SIGBUS here, the non
		 * cooperative manager would be instead forced to
		 * always call UFFDIO_UNREGISTER before it can safely
		 * close the uffd, to avoid involuntary SIGBUS triggered.
		 *
		 * If we were to return VM_FAULT_NOPAGE, it would work for
		 * the fault path, in which the lock will be released
		 * later.  However for GUP, faultin_page() does nothing
		 * special on NOPAGE, so GUP would spin retrying without
		 * releasing the mmap read lock, causing possible livelock.
		 *
		 * Here only VM_FAULT_RETRY would make sure the mmap lock
		 * be released immediately, so that the thread concurrently
		 * releasing the userfault would always make progress.
		 */
		release_fault_lock(vmf);
		goto out;
	}

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	/* Prepare the on-stack waiter; it is woken via userfaultfd_wake_function(). */
	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
				reason, ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	/*
	 * Take the vma lock now, in order to safely call
	 * userfaultfd_huge_must_wait() later. Since acquiring the
	 * (sleepable) vma lock can modify the current task state, that
	 * must be before explicitly calling set_current_state().
	 */
	if (is_vm_hugetlb_page(vma))
		hugetlb_vma_lock_read(vma);

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock to happen before the list_add in
	 * __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/* Re-check the page tables now that the waiter is queued. */
	if (is_vm_hugetlb_page(vma)) {
		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
		hugetlb_vma_unlock_read(vma);
	} else {
		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
	}

	release_fault_lock(vmf);

	/* Notify pollers of the fd and sleep until woken. */
	if (likely(must_wait && !READ_ONCE(ctx->released))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self. list_add also
	 * would never let any of the two pointers to point to
	 * self. So list_empty_careful won't risk to see both pointers
	 * pointing to self at any time during the list refile. The
	 * only case where list_del_init() is called is the full
	 * removal in the wake function and there we don't re-list_add
	 * and it's fine not to block on the spinlock. The uwq on this
	 * kernel stack can be released after the list_del_init.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock_irq(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}
 559
/*
 * Queue the non-pagefault event described by @ewq->msg on @ctx->event_wqh
 * and sleep (killable) until the event is marked complete by
 * msg.event being reset to 0, the context is released, or a fatal
 * signal is pending.
 *
 * @ctx: context to notify
 * @ewq: caller-provided waiter whose msg has already been filled in
 *
 * On every path, including the early exit for an exiting task, this
 * decrements @ctx->mmap_changing and drops one reference on @ctx.
 */
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock_irq(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		/* msg.event is reset to 0 by userfaultfd_event_complete(). */
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter. It would be a problem if it
			 * didn't.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				/*
				 * For fork events reserved1 carries the
				 * pointer to the new context (see dup_fctx()).
				 */
				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		/* Drop the lock while notifying pollers and sleeping. */
		spin_unlock_irq(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock_irq(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->event_wqh.lock);

	/* An undelivered fork event leaves the new context for us to tear down. */
	if (release_new_ctx) {
		userfaultfd_release_new(release_new_ctx);
		userfaultfd_ctx_put(release_new_ctx);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
out:
	atomic_dec(&ctx->mmap_changing);
	VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);
	userfaultfd_ctx_put(ctx);
}
 626
 627static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
 628				       struct userfaultfd_wait_queue *ewq)
 629{
 630	ewq->msg.event = 0;
 631	wake_up_locked(&ctx->event_wqh);
 632	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
 633}
 634
 635int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 636{
 637	struct userfaultfd_ctx *ctx = NULL, *octx;
 638	struct userfaultfd_fork_ctx *fctx;
 639
 640	octx = vma->vm_userfaultfd_ctx.ctx;
 641	if (!octx)
 642		return 0;
 643
 644	if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
 645		userfaultfd_reset_ctx(vma);
 646		return 0;
 647	}
 648
 649	list_for_each_entry(fctx, fcs, list)
 650		if (fctx->orig == octx) {
 651			ctx = fctx->new;
 652			break;
 653		}
 654
 655	if (!ctx) {
 656		fctx = kmalloc_obj(*fctx);
 657		if (!fctx)
 658			return -ENOMEM;
 659
 660		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
 661		if (!ctx) {
 662			kfree(fctx);
 663			return -ENOMEM;
 664		}
 665
 666		refcount_set(&ctx->refcount, 1);
 667		ctx->flags = octx->flags;
 668		ctx->features = octx->features;
 669		ctx->released = false;
 670		init_rwsem(&ctx->map_changing_lock);
 671		atomic_set(&ctx->mmap_changing, 0);
 672		ctx->mm = vma->vm_mm;
 673		mmgrab(ctx->mm);
 674
 675		userfaultfd_ctx_get(octx);
 676		down_write(&octx->map_changing_lock);
 677		atomic_inc(&octx->mmap_changing);
 678		up_write(&octx->map_changing_lock);
 679		fctx->orig = octx;
 680		fctx->new = ctx;
 681		list_add_tail(&fctx->list, fcs);
 682	}
 683
 684	vma->vm_userfaultfd_ctx.ctx = ctx;
 685	return 0;
 686}
 687
 688static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
 689{
 690	struct userfaultfd_ctx *ctx = fctx->orig;
 691	struct userfaultfd_wait_queue ewq;
 692
 693	msg_init(&ewq.msg);
 694
 695	ewq.msg.event = UFFD_EVENT_FORK;
 696	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
 697
 698	userfaultfd_event_wait_completion(ctx, &ewq);
 699}
 700
 701void dup_userfaultfd_complete(struct list_head *fcs)
 702{
 703	struct userfaultfd_fork_ctx *fctx, *n;
 704
 705	list_for_each_entry_safe(fctx, n, fcs, list) {
 706		dup_fctx(fctx);
 707		list_del(&fctx->list);
 708		kfree(fctx);
 709	}
 710}
 711
 712void dup_userfaultfd_fail(struct list_head *fcs)
 713{
 714	struct userfaultfd_fork_ctx *fctx, *n;
 715
 716	/*
 717	 * An error has occurred on fork, we will tear memory down, but have
 718	 * allocated memory for fctx's and raised reference counts for both the
 719	 * original and child contexts (and on the mm for each as a result).
 720	 *
 721	 * These would ordinarily be taken care of by a user handling the event,
 722	 * but we are no longer doing so, so manually clean up here.
 723	 *
 724	 * mm tear down will take care of cleaning up VMA contexts.
 725	 */
 726	list_for_each_entry_safe(fctx, n, fcs, list) {
 727		struct userfaultfd_ctx *octx = fctx->orig;
 728		struct userfaultfd_ctx *ctx = fctx->new;
 729
 730		atomic_dec(&octx->mmap_changing);
 731		VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0);
 732		userfaultfd_ctx_put(octx);
 733		userfaultfd_ctx_put(ctx);
 734
 735		list_del(&fctx->list);
 736		kfree(fctx);
 737	}
 738}
 739
 740void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 741			     struct vm_userfaultfd_ctx *vm_ctx)
 742{
 743	struct userfaultfd_ctx *ctx;
 744
 745	ctx = vma->vm_userfaultfd_ctx.ctx;
 746
 747	if (!ctx)
 748		return;
 749
 750	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
 751		vm_ctx->ctx = ctx;
 752		userfaultfd_ctx_get(ctx);
 753		down_write(&ctx->map_changing_lock);
 754		atomic_inc(&ctx->mmap_changing);
 755		up_write(&ctx->map_changing_lock);
 756	} else {
 757		/* Drop uffd context if remap feature not enabled */
 758		userfaultfd_reset_ctx(vma);
 759	}
 760}
 761
 762void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 763				 unsigned long from, unsigned long to,
 764				 unsigned long len)
 765{
 766	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
 767	struct userfaultfd_wait_queue ewq;
 768
 769	if (!ctx)
 770		return;
 771
 772	msg_init(&ewq.msg);
 773
 774	ewq.msg.event = UFFD_EVENT_REMAP;
 775	ewq.msg.arg.remap.from = from;
 776	ewq.msg.arg.remap.to = to;
 777	ewq.msg.arg.remap.len = len;
 778
 779	userfaultfd_event_wait_completion(ctx, &ewq);
 780}
 781
 782void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
 783{
 784	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
 785
 786	if (!ctx)
 787		return;
 788
 789	userfaultfd_ctx_put(ctx);
 790}
 791
 792bool userfaultfd_remove(struct vm_area_struct *vma,
 793			unsigned long start, unsigned long end)
 794{
 795	struct mm_struct *mm = vma->vm_mm;
 796	struct userfaultfd_ctx *ctx;
 797	struct userfaultfd_wait_queue ewq;
 798
 799	ctx = vma->vm_userfaultfd_ctx.ctx;
 800	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
 801		return true;
 802
 803	userfaultfd_ctx_get(ctx);
 804	down_write(&ctx->map_changing_lock);
 805	atomic_inc(&ctx->mmap_changing);
 806	up_write(&ctx->map_changing_lock);
 807	mmap_read_unlock(mm);
 808
 809	msg_init(&ewq.msg);
 810
 811	ewq.msg.event = UFFD_EVENT_REMOVE;
 812	ewq.msg.arg.remove.start = start;
 813	ewq.msg.arg.remove.end = end;
 814
 815	userfaultfd_event_wait_completion(ctx, &ewq);
 816
 817	return false;
 818}
 819
 820static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
 821			  unsigned long start, unsigned long end)
 822{
 823	struct userfaultfd_unmap_ctx *unmap_ctx;
 824
 825	list_for_each_entry(unmap_ctx, unmaps, list)
 826		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
 827		    unmap_ctx->end == end)
 828			return true;
 829
 830	return false;
 831}
 832
 833int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
 834			   unsigned long end, struct list_head *unmaps)
 835{
 836	struct userfaultfd_unmap_ctx *unmap_ctx;
 837	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
 838
 839	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
 840	    has_unmap_ctx(ctx, unmaps, start, end))
 841		return 0;
 842
 843	unmap_ctx = kzalloc_obj(*unmap_ctx);
 844	if (!unmap_ctx)
 845		return -ENOMEM;
 846
 847	userfaultfd_ctx_get(ctx);
 848	down_write(&ctx->map_changing_lock);
 849	atomic_inc(&ctx->mmap_changing);
 850	up_write(&ctx->map_changing_lock);
 851	unmap_ctx->ctx = ctx;
 852	unmap_ctx->start = start;
 853	unmap_ctx->end = end;
 854	list_add_tail(&unmap_ctx->list, unmaps);
 855
 856	return 0;
 857}
 858
 859void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
 860{
 861	struct userfaultfd_unmap_ctx *ctx, *n;
 862	struct userfaultfd_wait_queue ewq;
 863
 864	list_for_each_entry_safe(ctx, n, uf, list) {
 865		msg_init(&ewq.msg);
 866
 867		ewq.msg.event = UFFD_EVENT_UNMAP;
 868		ewq.msg.arg.remove.start = ctx->start;
 869		ewq.msg.arg.remove.end = ctx->end;
 870
 871		userfaultfd_event_wait_completion(ctx->ctx, &ewq);
 872
 873		list_del(&ctx->list);
 874		kfree(ctx);
 875	}
 876}
 877
 878static int userfaultfd_release(struct inode *inode, struct file *file)
 879{
 880	struct userfaultfd_ctx *ctx = file->private_data;
 881	struct mm_struct *mm = ctx->mm;
 882	/* len == 0 means wake all */
 883	struct userfaultfd_wake_range range = { .len = 0, };
 884
 885	WRITE_ONCE(ctx->released, true);
 886
 887	userfaultfd_release_all(mm, ctx);
 888
 889	/*
 890	 * After no new page faults can wait on this fault_*wqh, flush
 891	 * the last page faults that may have been already waiting on
 892	 * the fault_*wqh.
 893	 */
 894	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 895	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
 896	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
 897	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 898
 899	/* Flush pending events that may still wait on event_wqh */
 900	wake_up_all(&ctx->event_wqh);
 901
 902	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
 903	userfaultfd_ctx_put(ctx);
 904	return 0;
 905}
 906
 907/* fault_pending_wqh.lock must be hold by the caller */
 908static inline struct userfaultfd_wait_queue *find_userfault_in(
 909		wait_queue_head_t *wqh)
 910{
 911	wait_queue_entry_t *wq;
 912	struct userfaultfd_wait_queue *uwq;
 913
 914	lockdep_assert_held(&wqh->lock);
 915
 916	uwq = NULL;
 917	if (!waitqueue_active(wqh))
 918		goto out;
 919	/* walk in reverse to provide FIFO behavior to read userfaults */
 920	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
 921	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 922out:
 923	return uwq;
 924}
 925
 926static inline struct userfaultfd_wait_queue *find_userfault(
 927		struct userfaultfd_ctx *ctx)
 928{
 929	return find_userfault_in(&ctx->fault_pending_wqh);
 930}
 931
 932static inline struct userfaultfd_wait_queue *find_userfault_evt(
 933		struct userfaultfd_ctx *ctx)
 934{
 935	return find_userfault_in(&ctx->event_wqh);
 936}
 937
 938static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
 939{
 940	struct userfaultfd_ctx *ctx = file->private_data;
 941	__poll_t ret;
 942
 943	poll_wait(file, &ctx->fd_wqh, wait);
 944
 945	if (!userfaultfd_is_initialized(ctx))
 946		return EPOLLERR;
 947
 948	/*
 949	 * poll() never guarantees that read won't block.
 950	 * userfaults can be waken before they're read().
 951	 */
 952	if (unlikely(!(file->f_flags & O_NONBLOCK)))
 953		return EPOLLERR;
 954	/*
 955	 * lockless access to see if there are pending faults
 956	 * __pollwait last action is the add_wait_queue but
 957	 * the spin_unlock would allow the waitqueue_active to
 958	 * pass above the actual list_add inside
 959	 * add_wait_queue critical section. So use a full
 960	 * memory barrier to serialize the list_add write of
 961	 * add_wait_queue() with the waitqueue_active read
 962	 * below.
 963	 */
 964	ret = 0;
 965	smp_mb();
 966	if (waitqueue_active(&ctx->fault_pending_wqh))
 967		ret = EPOLLIN;
 968	else if (waitqueue_active(&ctx->event_wqh))
 969		ret = EPOLLIN;
 970
 971	return ret;
 972}
 973
 974static const struct file_operations userfaultfd_fops;
 975
 976static int resolve_userfault_fork(struct userfaultfd_ctx *new,
 977				  struct inode *inode,
 978				  struct uffd_msg *msg)
 979{
 980	int fd;
 981
 982	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
 983			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
 984	if (fd < 0)
 985		return fd;
 986
 987	msg->arg.reserved.reserved1 = 0;
 988	msg->arg.fork.ufd = fd;
 989	return 0;
 990}
 991
/*
 * Fetch a single message for a reader of the userfaultfd.
 *
 * Pending page faults (find_userfault()) are served first; a fault
 * that is handed out is refiled from fault_pending_wqh to fault_wqh.
 * If no fault is pending, queued events (find_userfault_evt()) are
 * served. A UFFD_EVENT_FORK event additionally needs a new file
 * descriptor, which is created by resolve_userfault_fork() after all
 * spinlocks have been dropped.
 *
 * @ctx:     context being read
 * @no_wait: non-zero to fail with -EAGAIN instead of sleeping when
 *           nothing is queued
 * @msg:     filled in whenever 0 is returned
 * @inode:   passed through to resolve_userfault_fork()
 *
 * Returns 0 on success, -ERESTARTSYS if a signal is pending, -EAGAIN
 * if nothing is queued and @no_wait is set, or the error returned by
 * resolve_userfault_fork().
 */
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg, struct inode *inode)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so
	 * we drop the event_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * carefully.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		/*
		 * Mark ourselves as sleeping before scanning the queues;
		 * the task only really sleeps in schedule() below.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
		if (uwq) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() to avoid missing
			 * wakeups because during the refile both
			 * waitqueue could become empty if this is the
			 * only userfault.
			 */
			write_seqcount_begin(&ctx->refile_seq);

			/*
			 * The fault_pending_wqh.lock prevents the uwq
			 * to disappear from under us.
			 *
			 * Refile this userfault from
			 * fault_pending_wqh to fault_wqh, it's not
			 * pending anymore after we read it.
			 *
			 * Use list_del() by hand (as
			 * userfaultfd_wake_function also uses
			 * list_del_init() by hand) to be sure nobody
			 * changes __remove_wait_queue() to use
			 * list_del_init() in turn breaking the
			 * !list_empty_careful() check in
			 * handle_userfault(). The uwq->wq.head list
			 * must never be empty at any time during the
			 * refile, or the waitqueue could disappear
			 * from under us. The "wait_queue_head_t"
			 * parameter of __remove_wait_queue() is unused
			 * anyway.
			 */
			list_del(&uwq->wq.entry);
			add_wait_queue(&ctx->fault_wqh, &uwq->wq);

			write_seqcount_end(&ctx->refile_seq);

			/* careful to always initialize msg if ret == 0 */
			*msg = uwq->msg;
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_pending_wqh.lock);

		/* No pending fault: look for a queued event instead. */
		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				/*
				 * For fork events the reserved1 field
				 * carries the pointer to the new context.
				 */
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				/* park the event on our private list */
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		/* Nothing queued: bail out or go to sleep. */
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	/*
	 * Second half of the fork event handling, done without any
	 * spinlock held while the new fd is created.
	 */
	if (!ret && msg->event == UFFD_EVENT_FORK) {
		ret = resolve_userfault_fork(fork_nctx, inode, msg);
		spin_lock_irq(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * If fork_event list wasn't empty and in turn
			 * the event wasn't already released by fork
			 * (the event is allocated on fork kernel
			 * stack), put the event back to its place in
			 * the event_wq. fork_event head will be freed
			 * as soon as we return so the event cannot
			 * stay queued there no matter the current
			 * "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * error to userland if we failed to resolve
			 * the userfault fork.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * Here the fork thread aborted and the
			 * refcount from the fork thread on fork_nctx
			 * has already been released. We still hold
			 * the reference we took before releasing the
			 * lock above. If resolve_userfault_fork
			 * failed we've to drop it because the
			 * fork_nctx has to be freed in such case. If
			 * it succeeded we'll hold it because the new
			 * uffd references it.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock_irq(&ctx->event_wqh.lock);
	}

	return ret;
}
1156
1157static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
1158{
1159	struct file *file = iocb->ki_filp;
1160	struct userfaultfd_ctx *ctx = file->private_data;
1161	ssize_t _ret, ret = 0;
1162	struct uffd_msg msg;
1163	struct inode *inode = file_inode(file);
1164	bool no_wait;
1165
1166	if (!userfaultfd_is_initialized(ctx))
1167		return -EINVAL;
1168
1169	no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT;
1170	for (;;) {
1171		if (iov_iter_count(to) < sizeof(msg))
1172			return ret ? ret : -EINVAL;
1173		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
1174		if (_ret < 0)
1175			return ret ? ret : _ret;
1176		_ret = !copy_to_iter_full(&msg, sizeof(msg), to);
1177		if (_ret)
1178			return ret ? ret : -EFAULT;
1179		ret += sizeof(msg);
1180		/*
1181		 * Allow to read more than one fault at time but only
1182		 * block if waiting for the very first one.
1183		 */
1184		no_wait = true;
1185	}
1186}
1187
1188static void __wake_userfault(struct userfaultfd_ctx *ctx,
1189			     struct userfaultfd_wake_range *range)
1190{
1191	spin_lock_irq(&ctx->fault_pending_wqh.lock);
1192	/* wake all in the range and autoremove */
1193	if (waitqueue_active(&ctx->fault_pending_wqh))
1194		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1195				     range);
1196	if (waitqueue_active(&ctx->fault_wqh))
1197		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1198	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1199}
1200
1201static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1202					   struct userfaultfd_wake_range *range)
1203{
1204	unsigned seq;
1205	bool need_wakeup;
1206
1207	/*
1208	 * To be sure waitqueue_active() is not reordered by the CPU
1209	 * before the pagetable update, use an explicit SMP memory
1210	 * barrier here. PT lock release or mmap_read_unlock(mm) still
1211	 * have release semantics that can allow the
1212	 * waitqueue_active() to be reordered before the pte update.
1213	 */
1214	smp_mb();
1215
1216	/*
1217	 * Use waitqueue_active because it's very frequent to
1218	 * change the address space atomically even if there are no
1219	 * userfaults yet. So we take the spinlock only when we're
1220	 * sure we've userfaults to wake.
1221	 */
1222	do {
1223		seq = read_seqcount_begin(&ctx->refile_seq);
1224		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1225			waitqueue_active(&ctx->fault_wqh);
1226		cond_resched();
1227	} while (read_seqcount_retry(&ctx->refile_seq, seq));
1228	if (need_wakeup)
1229		__wake_userfault(ctx, range);
1230}
1231
1232static __always_inline int validate_unaligned_range(
1233	struct mm_struct *mm, __u64 start, __u64 len)
1234{
1235	__u64 task_size = mm->task_size;
1236
1237	if (len & ~PAGE_MASK)
1238		return -EINVAL;
1239	if (!len)
1240		return -EINVAL;
1241	if (start >= task_size)
1242		return -EINVAL;
1243	if (len > task_size - start)
1244		return -EINVAL;
1245	if (start + len <= start)
1246		return -EINVAL;
1247	return 0;
1248}
1249
1250static __always_inline int validate_range(struct mm_struct *mm,
1251					  __u64 start, __u64 len)
1252{
1253	if (start & ~PAGE_MASK)
1254		return -EINVAL;
1255
1256	return validate_unaligned_range(mm, start, len);
1257}
1258
1259static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1260				unsigned long arg)
1261{
1262	struct mm_struct *mm = ctx->mm;
1263	struct vm_area_struct *vma, *cur;
1264	int ret;
1265	struct uffdio_register uffdio_register;
1266	struct uffdio_register __user *user_uffdio_register;
1267	vm_flags_t vm_flags;
1268	bool found;
1269	bool basic_ioctls;
1270	unsigned long start, end;
1271	struct vma_iterator vmi;
1272	bool wp_async = userfaultfd_wp_async_ctx(ctx);
1273
1274	user_uffdio_register = (struct uffdio_register __user *) arg;
1275
1276	ret = -EFAULT;
1277	if (copy_from_user(&uffdio_register, user_uffdio_register,
1278			   sizeof(uffdio_register)-sizeof(__u64)))
1279		goto out;
1280
1281	ret = -EINVAL;
1282	if (!uffdio_register.mode)
1283		goto out;
1284	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1285		goto out;
1286	vm_flags = 0;
1287	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1288		vm_flags |= VM_UFFD_MISSING;
1289	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1290		if (!pgtable_supports_uffd_wp())
1291			goto out;
1292
1293		vm_flags |= VM_UFFD_WP;
1294	}
1295	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1296#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1297		goto out;
1298#endif
1299		vm_flags |= VM_UFFD_MINOR;
1300	}
1301
1302	ret = validate_range(mm, uffdio_register.range.start,
1303			     uffdio_register.range.len);
1304	if (ret)
1305		goto out;
1306
1307	start = uffdio_register.range.start;
1308	end = start + uffdio_register.range.len;
1309
1310	ret = -ENOMEM;
1311	if (!mmget_not_zero(mm))
1312		goto out;
1313
1314	ret = -EINVAL;
1315	mmap_write_lock(mm);
1316	vma_iter_init(&vmi, mm, start);
1317	vma = vma_find(&vmi, end);
1318	if (!vma)
1319		goto out_unlock;
1320
1321	/*
1322	 * If the first vma contains huge pages, make sure start address
1323	 * is aligned to huge page size.
1324	 */
1325	if (is_vm_hugetlb_page(vma)) {
1326		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1327
1328		if (start & (vma_hpagesize - 1))
1329			goto out_unlock;
1330	}
1331
1332	/*
1333	 * Search for not compatible vmas.
1334	 */
1335	found = false;
1336	basic_ioctls = false;
1337	cur = vma;
1338	do {
1339		cond_resched();
1340
1341		VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
1342				!!(cur->vm_flags & __VM_UFFD_FLAGS));
1343
1344		/* check not compatible vmas */
1345		ret = -EINVAL;
1346		if (!vma_can_userfault(cur, vm_flags, wp_async))
1347			goto out_unlock;
1348
1349		/*
1350		 * UFFDIO_COPY will fill file holes even without
1351		 * PROT_WRITE. This check enforces that if this is a
1352		 * MAP_SHARED, the process has write permission to the backing
1353		 * file. If VM_MAYWRITE is set it also enforces that on a
1354		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1355		 * F_WRITE_SEAL can be taken until the vma is destroyed.
1356		 */
1357		ret = -EPERM;
1358		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1359			goto out_unlock;
1360
1361		/*
1362		 * If this vma contains ending address, and huge pages
1363		 * check alignment.
1364		 */
1365		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
1366		    end > cur->vm_start) {
1367			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
1368
1369			ret = -EINVAL;
1370
1371			if (end & (vma_hpagesize - 1))
1372				goto out_unlock;
1373		}
1374		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1375			goto out_unlock;
1376
1377		/*
1378		 * Check that this vma isn't already owned by a
1379		 * different userfaultfd. We can't allow more than one
1380		 * userfaultfd to own a single vma simultaneously or we
1381		 * wouldn't know which one to deliver the userfaults to.
1382		 */
1383		ret = -EBUSY;
1384		if (cur->vm_userfaultfd_ctx.ctx &&
1385		    cur->vm_userfaultfd_ctx.ctx != ctx)
1386			goto out_unlock;
1387
1388		/*
1389		 * Note vmas containing huge pages
1390		 */
1391		if (is_vm_hugetlb_page(cur))
1392			basic_ioctls = true;
1393
1394		found = true;
1395	} for_each_vma_range(vmi, cur, end);
1396	VM_WARN_ON_ONCE(!found);
1397
1398	ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
1399					 wp_async);
1400
1401out_unlock:
1402	mmap_write_unlock(mm);
1403	mmput(mm);
1404	if (!ret) {
1405		__u64 ioctls_out;
1406
1407		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1408		    UFFD_API_RANGE_IOCTLS;
1409
1410		/*
1411		 * Declare the WP ioctl only if the WP mode is
1412		 * specified and all checks passed with the range
1413		 */
1414		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1415			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1416
1417		/* CONTINUE ioctl is only supported for MINOR ranges. */
1418		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1419			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1420
1421		/*
1422		 * Now that we scanned all vmas we can already tell
1423		 * userland which ioctls methods are guaranteed to
1424		 * succeed on this range.
1425		 */
1426		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1427			ret = -EFAULT;
1428	}
1429out:
1430	return ret;
1431}
1432
/*
 * UFFDIO_UNREGISTER: remove the userfaultfd registration from a range.
 *
 * The struct uffdio_range is read from userspace and validated. With
 * the mmap lock held for writing, every vma intersecting the range is
 * first checked (first pass), then each vma registered with @ctx has
 * its pending missing-mode userfaults woken and its registration
 * cleared through userfaultfd_clear_vma() (second pass). Vmas that
 * are not registered are skipped.
 *
 * Returns 0 on success or a negative errno:
 *   -EFAULT  the user structure could not be read
 *   -EINVAL  bad range, no vma in the range, misaligned hugetlb
 *            start, a vma owned by a different userfaultfd, or a vma
 *            that fails vma_can_userfault()
 *   -ENOMEM  the mm is already gone (mmget_not_zero() failed)
 * or the error encoded in the pointer returned by
 * userfaultfd_clear_vma().
 */
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;
	struct vma_iterator vmi;
	bool wp_async = userfaultfd_wp_async_ctx(ctx);

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	/* -EINVAL covers every failure of the first pass below */
	ret = -EINVAL;
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	cur = vma;
	do {
		cond_resched();

		VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
				!!(cur->vm_flags & __VM_UFFD_FLAGS));

		/*
		 * Prevent unregistering through a different userfaultfd than
		 * the one used for registration.
		 */
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Check not compatible vmas, not strictly required
		 * here as not compatible vmas cannot have an
		 * userfaultfd_ctx registered on them, but this
		 * provides for more strict behavior to notice
		 * unregistration errors.
		 */
		if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
			goto out_unlock;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	VM_WARN_ON_ONCE(!found);

	/*
	 * Second pass: rewind the iterator to the start of the range.
	 * prev is the vma before start, or the first vma itself when
	 * that vma begins before start.
	 */
	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		/* VMA not registered with userfaultfd. */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
		VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));

		/* clamp [start, vma_end) to the part of this vma in range */
		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and it avoids userland to call
			 * UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		}

		/* on success vma is replaced by the returned vma */
		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    start, vma_end);
		if (IS_ERR(vma)) {
			ret = PTR_ERR(vma);
			break;
		}

	skip:
		/* advance past this vma for the next iteration */
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
out:
	return ret;
}
1565
1566/*
1567 * userfaultfd_wake may be used in combination with the
1568 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1569 */
1570static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1571			    unsigned long arg)
1572{
1573	int ret;
1574	struct uffdio_range uffdio_wake;
1575	struct userfaultfd_wake_range range;
1576	const void __user *buf = (void __user *)arg;
1577
1578	ret = -EFAULT;
1579	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1580		goto out;
1581
1582	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1583	if (ret)
1584		goto out;
1585
1586	range.start = uffdio_wake.start;
1587	range.len = uffdio_wake.len;
1588
1589	/*
1590	 * len == 0 means wake all and we don't want to wake all here,
1591	 * so check it again to be sure.
1592	 */
1593	VM_WARN_ON_ONCE(!range.len);
1594
1595	wake_userfault(ctx, &range);
1596	ret = 0;
1597
1598out:
1599	return ret;
1600}
1601
1602static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1603			    unsigned long arg)
1604{
1605	__s64 ret;
1606	struct uffdio_copy uffdio_copy;
1607	struct uffdio_copy __user *user_uffdio_copy;
1608	struct userfaultfd_wake_range range;
1609	uffd_flags_t flags = 0;
1610
1611	user_uffdio_copy = (struct uffdio_copy __user *) arg;
1612
1613	ret = -EAGAIN;
1614	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1615		if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1616			return -EFAULT;
1617		goto out;
1618	}
1619
1620	ret = -EFAULT;
1621	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1622			   /* don't copy "copy" last field */
1623			   sizeof(uffdio_copy)-sizeof(__s64)))
1624		goto out;
1625
1626	ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
1627				       uffdio_copy.len);
1628	if (ret)
1629		goto out;
1630	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1631	if (ret)
1632		goto out;
1633
1634	ret = -EINVAL;
1635	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1636		goto out;
1637	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1638		flags |= MFILL_ATOMIC_WP;
1639	if (mmget_not_zero(ctx->mm)) {
1640		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
1641					uffdio_copy.len, flags);
1642		mmput(ctx->mm);
1643	} else {
1644		return -ESRCH;
1645	}
1646	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1647		return -EFAULT;
1648	if (ret < 0)
1649		goto out;
1650	VM_WARN_ON_ONCE(!ret);
1651	/* len == 0 would wake all */
1652	range.len = ret;
1653	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1654		range.start = uffdio_copy.dst;
1655		wake_userfault(ctx, &range);
1656	}
1657	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1658out:
1659	return ret;
1660}
1661
1662static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1663				unsigned long arg)
1664{
1665	__s64 ret;
1666	struct uffdio_zeropage uffdio_zeropage;
1667	struct uffdio_zeropage __user *user_uffdio_zeropage;
1668	struct userfaultfd_wake_range range;
1669
1670	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1671
1672	ret = -EAGAIN;
1673	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1674		if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1675			return -EFAULT;
1676		goto out;
1677	}
1678
1679	ret = -EFAULT;
1680	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1681			   /* don't copy "zeropage" last field */
1682			   sizeof(uffdio_zeropage)-sizeof(__s64)))
1683		goto out;
1684
1685	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1686			     uffdio_zeropage.range.len);
1687	if (ret)
1688		goto out;
1689	ret = -EINVAL;
1690	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1691		goto out;
1692
1693	if (mmget_not_zero(ctx->mm)) {
1694		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
1695					   uffdio_zeropage.range.len);
1696		mmput(ctx->mm);
1697	} else {
1698		return -ESRCH;
1699	}
1700	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1701		return -EFAULT;
1702	if (ret < 0)
1703		goto out;
1704	/* len == 0 would wake all */
1705	VM_WARN_ON_ONCE(!ret);
1706	range.len = ret;
1707	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1708		range.start = uffdio_zeropage.range.start;
1709		wake_userfault(ctx, &range);
1710	}
1711	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1712out:
1713	return ret;
1714}
1715
1716static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1717				    unsigned long arg)
1718{
1719	int ret;
1720	struct uffdio_writeprotect uffdio_wp;
1721	struct uffdio_writeprotect __user *user_uffdio_wp;
1722	struct userfaultfd_wake_range range;
1723	bool mode_wp, mode_dontwake;
1724
1725	if (atomic_read(&ctx->mmap_changing))
1726		return -EAGAIN;
1727
1728	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1729
1730	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1731			   sizeof(struct uffdio_writeprotect)))
1732		return -EFAULT;
1733
1734	ret = validate_range(ctx->mm, uffdio_wp.range.start,
1735			     uffdio_wp.range.len);
1736	if (ret)
1737		return ret;
1738
1739	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1740			       UFFDIO_WRITEPROTECT_MODE_WP))
1741		return -EINVAL;
1742
1743	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1744	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1745
1746	if (mode_wp && mode_dontwake)
1747		return -EINVAL;
1748
1749	if (mmget_not_zero(ctx->mm)) {
1750		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
1751					  uffdio_wp.range.len, mode_wp);
1752		mmput(ctx->mm);
1753	} else {
1754		return -ESRCH;
1755	}
1756
1757	if (ret)
1758		return ret;
1759
1760	if (!mode_wp && !mode_dontwake) {
1761		range.start = uffdio_wp.range.start;
1762		range.len = uffdio_wp.range.len;
1763		wake_userfault(ctx, &range);
1764	}
1765	return ret;
1766}
1767
1768static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1769{
1770	__s64 ret;
1771	struct uffdio_continue uffdio_continue;
1772	struct uffdio_continue __user *user_uffdio_continue;
1773	struct userfaultfd_wake_range range;
1774	uffd_flags_t flags = 0;
1775
1776	user_uffdio_continue = (struct uffdio_continue __user *)arg;
1777
1778	ret = -EAGAIN;
1779	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1780		if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1781			return -EFAULT;
1782		goto out;
1783	}
1784
1785	ret = -EFAULT;
1786	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1787			   /* don't copy the output fields */
1788			   sizeof(uffdio_continue) - (sizeof(__s64))))
1789		goto out;
1790
1791	ret = validate_range(ctx->mm, uffdio_continue.range.start,
1792			     uffdio_continue.range.len);
1793	if (ret)
1794		goto out;
1795
1796	ret = -EINVAL;
1797	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1798				     UFFDIO_CONTINUE_MODE_WP))
1799		goto out;
1800	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1801		flags |= MFILL_ATOMIC_WP;
1802
1803	if (mmget_not_zero(ctx->mm)) {
1804		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
1805					    uffdio_continue.range.len, flags);
1806		mmput(ctx->mm);
1807	} else {
1808		return -ESRCH;
1809	}
1810
1811	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1812		return -EFAULT;
1813	if (ret < 0)
1814		goto out;
1815
1816	/* len == 0 would wake all */
1817	VM_WARN_ON_ONCE(!ret);
1818	range.len = ret;
1819	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1820		range.start = uffdio_continue.range.start;
1821		wake_userfault(ctx, &range);
1822	}
1823	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1824
1825out:
1826	return ret;
1827}
1828
1829static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
1830{
1831	__s64 ret;
1832	struct uffdio_poison uffdio_poison;
1833	struct uffdio_poison __user *user_uffdio_poison;
1834	struct userfaultfd_wake_range range;
1835
1836	user_uffdio_poison = (struct uffdio_poison __user *)arg;
1837
1838	ret = -EAGAIN;
1839	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1840		if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1841			return -EFAULT;
1842		goto out;
1843	}
1844
1845	ret = -EFAULT;
1846	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
1847			   /* don't copy the output fields */
1848			   sizeof(uffdio_poison) - (sizeof(__s64))))
1849		goto out;
1850
1851	ret = validate_range(ctx->mm, uffdio_poison.range.start,
1852			     uffdio_poison.range.len);
1853	if (ret)
1854		goto out;
1855
1856	ret = -EINVAL;
1857	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
1858		goto out;
1859
1860	if (mmget_not_zero(ctx->mm)) {
1861		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
1862					  uffdio_poison.range.len, 0);
1863		mmput(ctx->mm);
1864	} else {
1865		return -ESRCH;
1866	}
1867
1868	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1869		return -EFAULT;
1870	if (ret < 0)
1871		goto out;
1872
1873	/* len == 0 would wake all */
1874	VM_WARN_ON_ONCE(!ret);
1875	range.len = ret;
1876	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
1877		range.start = uffdio_poison.range.start;
1878		wake_userfault(ctx, &range);
1879	}
1880	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
1881
1882out:
1883	return ret;
1884}
1885
1886bool userfaultfd_wp_async(struct vm_area_struct *vma)
1887{
1888	return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
1889}
1890
1891static inline unsigned int uffd_ctx_features(__u64 user_features)
1892{
1893	/*
1894	 * For the current set of features the bits just coincide. Set
1895	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
1896	 */
1897	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
1898}
1899
1900static int userfaultfd_move(struct userfaultfd_ctx *ctx,
1901			    unsigned long arg)
1902{
1903	__s64 ret;
1904	struct uffdio_move uffdio_move;
1905	struct uffdio_move __user *user_uffdio_move;
1906	struct userfaultfd_wake_range range;
1907	struct mm_struct *mm = ctx->mm;
1908
1909	user_uffdio_move = (struct uffdio_move __user *) arg;
1910
1911	ret = -EAGAIN;
1912	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1913		if (unlikely(put_user(ret, &user_uffdio_move->move)))
1914			return -EFAULT;
1915		goto out;
1916	}
1917
1918	if (copy_from_user(&uffdio_move, user_uffdio_move,
1919			   /* don't copy "move" last field */
1920			   sizeof(uffdio_move)-sizeof(__s64)))
1921		return -EFAULT;
1922
1923	/* Do not allow cross-mm moves. */
1924	if (mm != current->mm)
1925		return -EINVAL;
1926
1927	ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
1928	if (ret)
1929		return ret;
1930
1931	ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
1932	if (ret)
1933		return ret;
1934
1935	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
1936				  UFFDIO_MOVE_MODE_DONTWAKE))
1937		return -EINVAL;
1938
1939	if (mmget_not_zero(mm)) {
1940		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
1941				 uffdio_move.len, uffdio_move.mode);
1942		mmput(mm);
1943	} else {
1944		return -ESRCH;
1945	}
1946
1947	if (unlikely(put_user(ret, &user_uffdio_move->move)))
1948		return -EFAULT;
1949	if (ret < 0)
1950		goto out;
1951
1952	/* len == 0 would wake all */
1953	VM_WARN_ON(!ret);
1954	range.len = ret;
1955	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
1956		range.start = uffdio_move.dst;
1957		wake_userfault(ctx, &range);
1958	}
1959	ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
1960
1961out:
1962	return ret;
1963}
1964
1965/*
1966 * userland asks for a certain API version and we return which bits
1967 * and ioctl commands are implemented in this kernel for such API
1968 * version or -EINVAL if unknown.
1969 */
1970static int userfaultfd_api(struct userfaultfd_ctx *ctx,
1971			   unsigned long arg)
1972{
1973	struct uffdio_api uffdio_api;
1974	void __user *buf = (void __user *)arg;
1975	unsigned int ctx_features;
1976	int ret;
1977	__u64 features;
1978
1979	ret = -EFAULT;
1980	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
1981		goto out;
1982	features = uffdio_api.features;
1983	ret = -EINVAL;
1984	if (uffdio_api.api != UFFD_API)
1985		goto err_out;
1986	ret = -EPERM;
1987	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
1988		goto err_out;
1989
1990	/* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
1991	if (features & UFFD_FEATURE_WP_ASYNC)
1992		features |= UFFD_FEATURE_WP_UNPOPULATED;
1993
1994	/* report all available features and ioctls to userland */
1995	uffdio_api.features = UFFD_API_FEATURES;
1996#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1997	uffdio_api.features &=
1998		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
1999#endif
2000	if (!pgtable_supports_uffd_wp())
2001		uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
2002
2003	if (!uffd_supports_wp_marker()) {
2004		uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
2005		uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
2006		uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
2007	}
2008
2009	ret = -EINVAL;
2010	if (features & ~uffdio_api.features)
2011		goto err_out;
2012
2013	uffdio_api.ioctls = UFFD_API_IOCTLS;
2014	ret = -EFAULT;
2015	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2016		goto out;
2017
2018	/* only enable the requested features for this uffd context */
2019	ctx_features = uffd_ctx_features(features);
2020	ret = -EINVAL;
2021	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
2022		goto err_out;
2023
2024	ret = 0;
2025out:
2026	return ret;
2027err_out:
2028	memset(&uffdio_api, 0, sizeof(uffdio_api));
2029	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2030		ret = -EFAULT;
2031	goto out;
2032}
2033
2034static long userfaultfd_ioctl(struct file *file, unsigned cmd,
2035			      unsigned long arg)
2036{
2037	int ret = -EINVAL;
2038	struct userfaultfd_ctx *ctx = file->private_data;
2039
2040	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
2041		return -EINVAL;
2042
2043	switch(cmd) {
2044	case UFFDIO_API:
2045		ret = userfaultfd_api(ctx, arg);
2046		break;
2047	case UFFDIO_REGISTER:
2048		ret = userfaultfd_register(ctx, arg);
2049		break;
2050	case UFFDIO_UNREGISTER:
2051		ret = userfaultfd_unregister(ctx, arg);
2052		break;
2053	case UFFDIO_WAKE:
2054		ret = userfaultfd_wake(ctx, arg);
2055		break;
2056	case UFFDIO_COPY:
2057		ret = userfaultfd_copy(ctx, arg);
2058		break;
2059	case UFFDIO_ZEROPAGE:
2060		ret = userfaultfd_zeropage(ctx, arg);
2061		break;
2062	case UFFDIO_MOVE:
2063		ret = userfaultfd_move(ctx, arg);
2064		break;
2065	case UFFDIO_WRITEPROTECT:
2066		ret = userfaultfd_writeprotect(ctx, arg);
2067		break;
2068	case UFFDIO_CONTINUE:
2069		ret = userfaultfd_continue(ctx, arg);
2070		break;
2071	case UFFDIO_POISON:
2072		ret = userfaultfd_poison(ctx, arg);
2073		break;
2074	}
2075	return ret;
2076}
2077
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: print the number of pending userfaults,
 * the total number of userfaults (pending plus already read) and the
 * API/features/ioctls triple.
 */
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	unsigned long pending = 0, total;
	wait_queue_entry_t *wq;

	/* both fault queues are walked under fault_pending_wqh.lock */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry)
		pending++;
	total = pending;
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry)
		total++;
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * Should more protocols be added later, they are all listed
	 * on the same line separated by a space:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
2105
2106static const struct file_operations userfaultfd_fops = {
2107#ifdef CONFIG_PROC_FS
2108	.show_fdinfo	= userfaultfd_show_fdinfo,
2109#endif
2110	.release	= userfaultfd_release,
2111	.poll		= userfaultfd_poll,
2112	.read_iter	= userfaultfd_read_iter,
2113	.unlocked_ioctl = userfaultfd_ioctl,
2114	.compat_ioctl	= compat_ptr_ioctl,
2115	.llseek		= noop_llseek,
2116};
2117
2118static void init_once_userfaultfd_ctx(void *mem)
2119{
2120	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
2121
2122	init_waitqueue_head(&ctx->fault_pending_wqh);
2123	init_waitqueue_head(&ctx->fault_wqh);
2124	init_waitqueue_head(&ctx->event_wqh);
2125	init_waitqueue_head(&ctx->fd_wqh);
2126	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
2127}
2128
2129static int new_userfaultfd(int flags)
2130{
2131	struct userfaultfd_ctx *ctx __free(kfree) = NULL;
2132
2133	VM_WARN_ON_ONCE(!current->mm);
2134
2135	/* Check the UFFD_* constants for consistency.  */
2136	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
2137
2138	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
2139		return -EINVAL;
2140
2141	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
2142	if (!ctx)
2143		return -ENOMEM;
2144
2145	refcount_set(&ctx->refcount, 1);
2146	ctx->flags = flags;
2147	ctx->features = 0;
2148	ctx->released = false;
2149	init_rwsem(&ctx->map_changing_lock);
2150	atomic_set(&ctx->mmap_changing, 0);
2151	ctx->mm = current->mm;
2152
2153	FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS,
2154		   anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
2155					     O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS),
2156					     NULL));
2157	if (fdf.err)
2158		return fdf.err;
2159
2160	/* prevent the mm struct to be freed */
2161	mmgrab(ctx->mm);
2162	fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT;
2163	retain_and_null_ptr(ctx);
2164	return fd_publish(fdf);
2165}
2166
2167static inline bool userfaultfd_syscall_allowed(int flags)
2168{
2169	/* Userspace-only page faults are always allowed */
2170	if (flags & UFFD_USER_MODE_ONLY)
2171		return true;
2172
2173	/*
2174	 * The user is requesting a userfaultfd which can handle kernel faults.
2175	 * Privileged users are always allowed to do this.
2176	 */
2177	if (capable(CAP_SYS_PTRACE))
2178		return true;
2179
2180	/* Otherwise, access to kernel fault handling is sysctl controlled. */
2181	return sysctl_unprivileged_userfaultfd;
2182}
2183
2184SYSCALL_DEFINE1(userfaultfd, int, flags)
2185{
2186	if (!userfaultfd_syscall_allowed(flags))
2187		return -EPERM;
2188
2189	return new_userfaultfd(flags);
2190}
2191
2192static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
2193{
2194	if (cmd != USERFAULTFD_IOC_NEW)
2195		return -EINVAL;
2196
2197	return new_userfaultfd(flags);
2198}
2199
2200static const struct file_operations userfaultfd_dev_fops = {
2201	.unlocked_ioctl = userfaultfd_dev_ioctl,
2202	.compat_ioctl = userfaultfd_dev_ioctl,
2203	.owner = THIS_MODULE,
2204	.llseek = noop_llseek,
2205};
2206
2207static struct miscdevice userfaultfd_misc = {
2208	.minor = MISC_DYNAMIC_MINOR,
2209	.name = "userfaultfd",
2210	.fops = &userfaultfd_dev_fops
2211};
2212
2213static int __init userfaultfd_init(void)
2214{
2215	int ret;
2216
2217	ret = misc_register(&userfaultfd_misc);
2218	if (ret)
2219		return ret;
2220
2221	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
2222						sizeof(struct userfaultfd_ctx),
2223						0,
2224						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2225						init_once_userfaultfd_ctx);
2226#ifdef CONFIG_SYSCTL
2227	register_sysctl_init("vm", vm_userfaultfd_table);
2228#endif
2229	return 0;
2230}
2231__initcall(userfaultfd_init);
Configure Feed

Configure Feed