Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'work.aio' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull vfs aio updates from Al Viro:
"Christoph's aio poll, saner this time around.

This time it's pretty much local to fs/aio.c. Hopefully race-free..."

* 'work.aio' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
aio: allow direct aio poll completions for keyed wakeups
aio: implement IOCB_CMD_POLL
aio: add an iocb refcount
timerfd: add support for keyed wakeups

+209 -11
+204 -4
fs/aio.c
··· 5 5 * Implements an efficient asynchronous io interface. 6 6 * 7 7 * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. 8 + * Copyright 2018 Christoph Hellwig. 8 9 * 9 10 * See ../COPYING for licensing terms. 10 11 */ ··· 19 18 #include <linux/export.h> 20 19 #include <linux/syscalls.h> 21 20 #include <linux/backing-dev.h> 21 + #include <linux/refcount.h> 22 22 #include <linux/uio.h> 23 23 24 24 #include <linux/sched/signal.h> ··· 166 164 bool datasync; 167 165 }; 168 166 167 + struct poll_iocb { 168 + struct file *file; 169 + struct wait_queue_head *head; 170 + __poll_t events; 171 + bool woken; 172 + bool cancelled; 173 + struct wait_queue_entry wait; 174 + struct work_struct work; 175 + }; 176 + 169 177 struct aio_kiocb { 170 178 union { 171 179 struct kiocb rw; 172 180 struct fsync_iocb fsync; 181 + struct poll_iocb poll; 173 182 }; 174 183 175 184 struct kioctx *ki_ctx; ··· 191 178 192 179 struct list_head ki_list; /* the aio core uses this 193 180 * for cancellation */ 181 + refcount_t ki_refcnt; 194 182 195 183 /* 196 184 * If the aio_resfd field of the userspace iocb is not zero, ··· 1013 999 1014 1000 percpu_ref_get(&ctx->reqs); 1015 1001 INIT_LIST_HEAD(&req->ki_list); 1002 + refcount_set(&req->ki_refcnt, 0); 1016 1003 req->ki_ctx = ctx; 1017 1004 return req; 1018 1005 out_put: ··· 1046 1031 out: 1047 1032 rcu_read_unlock(); 1048 1033 return ret; 1034 + } 1035 + 1036 + static inline void iocb_put(struct aio_kiocb *iocb) 1037 + { 1038 + if (refcount_read(&iocb->ki_refcnt) == 0 || 1039 + refcount_dec_and_test(&iocb->ki_refcnt)) { 1040 + percpu_ref_put(&iocb->ki_ctx->reqs); 1041 + kmem_cache_free(kiocb_cachep, iocb); 1042 + } 1049 1043 } 1050 1044 1051 1045 /* aio_complete ··· 1126 1102 eventfd_ctx_put(iocb->ki_eventfd); 1127 1103 } 1128 1104 1129 - kmem_cache_free(kiocb_cachep, iocb); 1130 - 1131 1105 /* 1132 1106 * We have to order our ring_info tail store above and test 1133 1107 * of the wait list below outside the wait lock. 
This is ··· 1136 1114 1137 1115 if (waitqueue_active(&ctx->wait)) 1138 1116 wake_up(&ctx->wait); 1139 - 1140 - percpu_ref_put(&ctx->reqs); 1117 + iocb_put(iocb); 1141 1118 } 1142 1119 1143 1120 /* aio_read_events_ring ··· 1597 1576 return 0; 1598 1577 } 1599 1578 1579 + static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) 1580 + { 1581 + struct file *file = iocb->poll.file; 1582 + 1583 + aio_complete(iocb, mangle_poll(mask), 0); 1584 + fput(file); 1585 + } 1586 + 1587 + static void aio_poll_complete_work(struct work_struct *work) 1588 + { 1589 + struct poll_iocb *req = container_of(work, struct poll_iocb, work); 1590 + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); 1591 + struct poll_table_struct pt = { ._key = req->events }; 1592 + struct kioctx *ctx = iocb->ki_ctx; 1593 + __poll_t mask = 0; 1594 + 1595 + if (!READ_ONCE(req->cancelled)) 1596 + mask = vfs_poll(req->file, &pt) & req->events; 1597 + 1598 + /* 1599 + * Note that ->ki_cancel callers also delete iocb from active_reqs after 1600 + * calling ->ki_cancel. We need the ctx_lock roundtrip here to 1601 + * synchronize with them. In the cancellation case the list_del_init 1602 + * itself is not actually needed, but harmless so we keep it in to 1603 + * avoid further branches in the fast path. 
1604 + */ 1605 + spin_lock_irq(&ctx->ctx_lock); 1606 + if (!mask && !READ_ONCE(req->cancelled)) { 1607 + add_wait_queue(req->head, &req->wait); 1608 + spin_unlock_irq(&ctx->ctx_lock); 1609 + return; 1610 + } 1611 + list_del_init(&iocb->ki_list); 1612 + spin_unlock_irq(&ctx->ctx_lock); 1613 + 1614 + aio_poll_complete(iocb, mask); 1615 + } 1616 + 1617 + /* assumes we are called with irqs disabled */ 1618 + static int aio_poll_cancel(struct kiocb *iocb) 1619 + { 1620 + struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); 1621 + struct poll_iocb *req = &aiocb->poll; 1622 + 1623 + spin_lock(&req->head->lock); 1624 + WRITE_ONCE(req->cancelled, true); 1625 + if (!list_empty(&req->wait.entry)) { 1626 + list_del_init(&req->wait.entry); 1627 + schedule_work(&aiocb->poll.work); 1628 + } 1629 + spin_unlock(&req->head->lock); 1630 + 1631 + return 0; 1632 + } 1633 + 1634 + static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 1635 + void *key) 1636 + { 1637 + struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); 1638 + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); 1639 + __poll_t mask = key_to_poll(key); 1640 + 1641 + req->woken = true; 1642 + 1643 + /* for instances that support it check for an event match first: */ 1644 + if (mask) { 1645 + if (!(mask & req->events)) 1646 + return 0; 1647 + 1648 + /* try to complete the iocb inline if we can: */ 1649 + if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { 1650 + list_del(&iocb->ki_list); 1651 + spin_unlock(&iocb->ki_ctx->ctx_lock); 1652 + 1653 + list_del_init(&req->wait.entry); 1654 + aio_poll_complete(iocb, mask); 1655 + return 1; 1656 + } 1657 + } 1658 + 1659 + list_del_init(&req->wait.entry); 1660 + schedule_work(&req->work); 1661 + return 1; 1662 + } 1663 + 1664 + struct aio_poll_table { 1665 + struct poll_table_struct pt; 1666 + struct aio_kiocb *iocb; 1667 + int error; 1668 + }; 1669 + 1670 + static void 1671 + aio_poll_queue_proc(struct file *file, 
struct wait_queue_head *head, 1672 + struct poll_table_struct *p) 1673 + { 1674 + struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); 1675 + 1676 + /* multiple wait queues per file are not supported */ 1677 + if (unlikely(pt->iocb->poll.head)) { 1678 + pt->error = -EINVAL; 1679 + return; 1680 + } 1681 + 1682 + pt->error = 0; 1683 + pt->iocb->poll.head = head; 1684 + add_wait_queue(head, &pt->iocb->poll.wait); 1685 + } 1686 + 1687 + static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) 1688 + { 1689 + struct kioctx *ctx = aiocb->ki_ctx; 1690 + struct poll_iocb *req = &aiocb->poll; 1691 + struct aio_poll_table apt; 1692 + __poll_t mask; 1693 + 1694 + /* reject any unknown events outside the normal event mask. */ 1695 + if ((u16)iocb->aio_buf != iocb->aio_buf) 1696 + return -EINVAL; 1697 + /* reject fields that are not defined for poll */ 1698 + if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) 1699 + return -EINVAL; 1700 + 1701 + INIT_WORK(&req->work, aio_poll_complete_work); 1702 + req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; 1703 + req->file = fget(iocb->aio_fildes); 1704 + if (unlikely(!req->file)) 1705 + return -EBADF; 1706 + 1707 + apt.pt._qproc = aio_poll_queue_proc; 1708 + apt.pt._key = req->events; 1709 + apt.iocb = aiocb; 1710 + apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ 1711 + 1712 + /* initialized the list so that we can do list_empty checks */ 1713 + INIT_LIST_HEAD(&req->wait.entry); 1714 + init_waitqueue_func_entry(&req->wait, aio_poll_wake); 1715 + 1716 + /* one for removal from waitqueue, one for this function */ 1717 + refcount_set(&aiocb->ki_refcnt, 2); 1718 + 1719 + mask = vfs_poll(req->file, &apt.pt) & req->events; 1720 + if (unlikely(!req->head)) { 1721 + /* we did not manage to set up a waitqueue, done */ 1722 + goto out; 1723 + } 1724 + 1725 + spin_lock_irq(&ctx->ctx_lock); 1726 + spin_lock(&req->head->lock); 1727 + if (req->woken) { 1728 + /* wake_up 
context handles the rest */ 1729 + mask = 0; 1730 + apt.error = 0; 1731 + } else if (mask || apt.error) { 1732 + /* if we get an error or a mask we are done */ 1733 + WARN_ON_ONCE(list_empty(&req->wait.entry)); 1734 + list_del_init(&req->wait.entry); 1735 + } else { 1736 + /* actually waiting for an event */ 1737 + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); 1738 + aiocb->ki_cancel = aio_poll_cancel; 1739 + } 1740 + spin_unlock(&req->head->lock); 1741 + spin_unlock_irq(&ctx->ctx_lock); 1742 + 1743 + out: 1744 + if (unlikely(apt.error)) { 1745 + fput(req->file); 1746 + return apt.error; 1747 + } 1748 + 1749 + if (mask) 1750 + aio_poll_complete(aiocb, mask); 1751 + iocb_put(aiocb); 1752 + return 0; 1753 + } 1754 + 1600 1755 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1601 1756 bool compat) 1602 1757 { ··· 1845 1648 break; 1846 1649 case IOCB_CMD_FDSYNC: 1847 1650 ret = aio_fsync(&req->fsync, &iocb, true); 1651 + break; 1652 + case IOCB_CMD_POLL: 1653 + ret = aio_poll(req, &iocb); 1848 1654 break; 1849 1655 default: 1850 1656 pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
+3 -3
fs/timerfd.c
··· 66 66 spin_lock_irqsave(&ctx->wqh.lock, flags); 67 67 ctx->expired = 1; 68 68 ctx->ticks++; 69 - wake_up_locked(&ctx->wqh); 69 + wake_up_locked_poll(&ctx->wqh, EPOLLIN); 70 70 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 71 71 } 72 72 ··· 107 107 if (ctx->moffs != moffs) { 108 108 ctx->moffs = KTIME_MAX; 109 109 ctx->ticks++; 110 - wake_up_locked(&ctx->wqh); 110 + wake_up_locked_poll(&ctx->wqh, EPOLLIN); 111 111 } 112 112 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 113 113 } ··· 345 345 spin_lock_irq(&ctx->wqh.lock); 346 346 if (!timerfd_canceled(ctx)) { 347 347 ctx->ticks = ticks; 348 - wake_up_locked(&ctx->wqh); 348 + wake_up_locked_poll(&ctx->wqh, EPOLLIN); 349 349 } else 350 350 ret = -ECANCELED; 351 351 spin_unlock_irq(&ctx->wqh.lock);
+2 -4
include/uapi/linux/aio_abi.h
··· 38 38 IOCB_CMD_PWRITE = 1, 39 39 IOCB_CMD_FSYNC = 2, 40 40 IOCB_CMD_FDSYNC = 3, 41 - /* These two are experimental. 42 - * IOCB_CMD_PREADX = 4, 43 - * IOCB_CMD_POLL = 5, 44 - */ 41 + /* 4 was the experimental IOCB_CMD_PREADX */ 42 + IOCB_CMD_POLL = 5, 45 43 IOCB_CMD_NOOP = 6, 46 44 IOCB_CMD_PREADV = 7, 47 45 IOCB_CMD_PWRITEV = 8,