Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Revert "eventpoll: Fix priority inversion problem"

This reverts commit 8c44dac8add7503c345c0f6c7962e4863b88ba42.

I haven't figured out what the actual bug in this commit is, but I did
spend a lot of time chasing it down and eventually succeeded in
bisecting it down to this.

For some reason, this eventpoll commit ends up causing delays and stuck
user space processes, but it only happens on one of my machines, and
only during early boot or during the flurry of initial activity when
logging in.

I must be triggering some very subtle timing issue, but once I figured
out the behavior pattern that made it reasonably reliable to trigger, it
did bisect right to this, and reverting the commit fixes the problem.

Of course, that was only after I had failed at bisecting it several
times, and had flailed around blaming both the drm people and the
netlink people for the odd problems. The most obvious of those happened
at the time of the first graphical login (the most common symptom being
that some gnome app aborted due to a 30s timeout, often leading to the
whole session then failing if it was some critical component like
gnome-shell or similar).

Acked-by: Nam Cao <namcao@linutronix.de>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

+324 -134
+324 -134
fs/eventpoll.c
··· 137 137 }; 138 138 139 139 /* List header used to link this structure to the eventpoll ready list */ 140 - struct llist_node rdllink; 140 + struct list_head rdllink; 141 + 142 + /* 143 + * Works together "struct eventpoll"->ovflist in keeping the 144 + * single linked chain of items. 145 + */ 146 + struct epitem *next; 141 147 142 148 /* The file descriptor information this item refers to */ 143 149 struct epoll_filefd ffd; ··· 191 185 /* Wait queue used by file->poll() */ 192 186 wait_queue_head_t poll_wait; 193 187 194 - /* 195 - * List of ready file descriptors. Adding to this list is lockless. Items can be removed 196 - * only with eventpoll::mtx 197 - */ 198 - struct llist_head rdllist; 188 + /* List of ready file descriptors */ 189 + struct list_head rdllist; 190 + 191 + /* Lock which protects rdllist and ovflist */ 192 + rwlock_t lock; 199 193 200 194 /* RB tree root used to store monitored fd structs */ 201 195 struct rb_root_cached rbr; 196 + 197 + /* 198 + * This is a single linked list that chains all the "struct epitem" that 199 + * happened while transferring ready events to userspace w/out 200 + * holding ->lock. 201 + */ 202 + struct epitem *ovflist; 202 203 203 204 /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ 204 205 struct wakeup_source *ws; ··· 361 348 (p1->file < p2->file ? -1 : p1->fd - p2->fd)); 362 349 } 363 350 364 - /* 365 - * Add the item to its container eventpoll's rdllist; do nothing if the item is already on rdllist. 
366 - */ 367 - static void epitem_ready(struct epitem *epi) 351 + /* Tells us if the item is currently linked */ 352 + static inline int ep_is_linked(struct epitem *epi) 368 353 { 369 - if (&epi->rdllink == cmpxchg(&epi->rdllink.next, &epi->rdllink, NULL)) 370 - llist_add(&epi->rdllink, &epi->ep->rdllist); 371 - 354 + return !list_empty(&epi->rdllink); 372 355 } 373 356 374 357 static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) ··· 383 374 * 384 375 * @ep: Pointer to the eventpoll context. 385 376 * 386 - * Return: true if ready events might be available, false otherwise. 377 + * Return: a value different than %zero if ready events are available, 378 + * or %zero otherwise. 387 379 */ 388 - static inline bool ep_events_available(struct eventpoll *ep) 380 + static inline int ep_events_available(struct eventpoll *ep) 389 381 { 390 - bool available; 391 - int locked; 392 - 393 - locked = mutex_trylock(&ep->mtx); 394 - if (!locked) { 395 - /* 396 - * The lock held and someone might have removed all items while inspecting it. The 397 - * llist_empty() check in this case is futile. Assume that something is enqueued and 398 - * let ep_try_send_events() figure it out. 399 - */ 400 - return true; 401 - } 402 - 403 - available = !llist_empty(&ep->rdllist); 404 - mutex_unlock(&ep->mtx); 405 - return available; 382 + return !list_empty_careful(&ep->rdllist) || 383 + READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; 406 384 } 407 385 408 386 #ifdef CONFIG_NET_RX_BUSY_POLL ··· 724 728 rcu_read_unlock(); 725 729 } 726 730 731 + 732 + /* 733 + * ep->mutex needs to be held because we could be hit by 734 + * eventpoll_release_file() and epoll_ctl(). 735 + */ 736 + static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) 737 + { 738 + /* 739 + * Steal the ready list, and re-init the original one to the 740 + * empty list. Also, set ep->ovflist to NULL so that events 741 + * happening while looping w/out locks, are not lost. 
We cannot 742 + * have the poll callback to queue directly on ep->rdllist, 743 + * because we want the "sproc" callback to be able to do it 744 + * in a lockless way. 745 + */ 746 + lockdep_assert_irqs_enabled(); 747 + write_lock_irq(&ep->lock); 748 + list_splice_init(&ep->rdllist, txlist); 749 + WRITE_ONCE(ep->ovflist, NULL); 750 + write_unlock_irq(&ep->lock); 751 + } 752 + 753 + static void ep_done_scan(struct eventpoll *ep, 754 + struct list_head *txlist) 755 + { 756 + struct epitem *epi, *nepi; 757 + 758 + write_lock_irq(&ep->lock); 759 + /* 760 + * During the time we spent inside the "sproc" callback, some 761 + * other events might have been queued by the poll callback. 762 + * We re-insert them inside the main ready-list here. 763 + */ 764 + for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; 765 + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { 766 + /* 767 + * We need to check if the item is already in the list. 768 + * During the "sproc" callback execution time, items are 769 + * queued into ->ovflist but the "txlist" might already 770 + * contain them, and the list_splice() below takes care of them. 771 + */ 772 + if (!ep_is_linked(epi)) { 773 + /* 774 + * ->ovflist is LIFO, so we have to reverse it in order 775 + * to keep in FIFO. 776 + */ 777 + list_add(&epi->rdllink, &ep->rdllist); 778 + ep_pm_stay_awake(epi); 779 + } 780 + } 781 + /* 782 + * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after 783 + * releasing the lock, events will be queued in the normal way inside 784 + * ep->rdllist. 785 + */ 786 + WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); 787 + 788 + /* 789 + * Quickly re-inject items left on "txlist". 
790 + */ 791 + list_splice(txlist, &ep->rdllist); 792 + __pm_relax(ep->ws); 793 + 794 + if (!list_empty(&ep->rdllist)) { 795 + if (waitqueue_active(&ep->wq)) 796 + wake_up(&ep->wq); 797 + } 798 + 799 + write_unlock_irq(&ep->lock); 800 + } 801 + 727 802 static void ep_get(struct eventpoll *ep) 728 803 { 729 804 refcount_inc(&ep->refcount); ··· 832 765 static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) 833 766 { 834 767 struct file *file = epi->ffd.file; 835 - struct llist_node *put_back_last; 836 768 struct epitems_head *to_free; 837 769 struct hlist_head *head; 838 - LLIST_HEAD(put_back); 839 770 840 - lockdep_assert_held(&ep->mtx); 771 + lockdep_assert_irqs_enabled(); 841 772 842 773 /* 843 774 * Removes poll wait queue hooks. ··· 867 802 868 803 rb_erase_cached(&epi->rbn, &ep->rbr); 869 804 870 - if (llist_on_list(&epi->rdllink)) { 871 - put_back_last = NULL; 872 - while (true) { 873 - struct llist_node *n = llist_del_first(&ep->rdllist); 874 - 875 - if (&epi->rdllink == n || WARN_ON(!n)) 876 - break; 877 - if (!put_back_last) 878 - put_back_last = n; 879 - __llist_add(n, &put_back); 880 - } 881 - if (put_back_last) 882 - llist_add_batch(put_back.first, put_back_last, &ep->rdllist); 883 - } 805 + write_lock_irq(&ep->lock); 806 + if (ep_is_linked(epi)) 807 + list_del_init(&epi->rdllink); 808 + write_unlock_irq(&ep->lock); 884 809 885 810 wakeup_source_unregister(ep_wakeup_source(epi)); 886 811 /* ··· 972 917 static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) 973 918 { 974 919 struct eventpoll *ep = file->private_data; 975 - struct wakeup_source *ws; 976 - struct llist_node *n; 977 - struct epitem *epi; 920 + LIST_HEAD(txlist); 921 + struct epitem *epi, *tmp; 978 922 poll_table pt; 979 923 __poll_t res = 0; 980 924 ··· 987 933 * the ready list. 
988 934 */ 989 935 mutex_lock_nested(&ep->mtx, depth); 990 - while (true) { 991 - n = llist_del_first_init(&ep->rdllist); 992 - if (!n) 993 - break; 994 - 995 - epi = llist_entry(n, struct epitem, rdllink); 996 - 936 + ep_start_scan(ep, &txlist); 937 + list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { 997 938 if (ep_item_poll(epi, &pt, depth + 1)) { 998 939 res = EPOLLIN | EPOLLRDNORM; 999 - epitem_ready(epi); 1000 940 break; 1001 941 } else { 1002 942 /* 1003 - * We need to activate ep before deactivating epi, to prevent autosuspend 1004 - * just in case epi becomes active after ep_item_poll() above. 1005 - * 1006 - * This is similar to ep_send_events(). 943 + * Item has been dropped into the ready list by the poll 944 + * callback, but it's not actually ready, as far as 945 + * caller requested events goes. We can remove it here. 1007 946 */ 1008 - ws = ep_wakeup_source(epi); 1009 - if (ws) { 1010 - if (ws->active) 1011 - __pm_stay_awake(ep->ws); 1012 - __pm_relax(ws); 1013 - } 1014 947 __pm_relax(ep_wakeup_source(epi)); 1015 - 1016 - /* Just in case epi becomes active right before __pm_relax() */ 1017 - if (unlikely(ep_item_poll(epi, &pt, depth + 1))) 1018 - ep_pm_stay_awake(epi); 1019 - 1020 - __pm_relax(ep->ws); 948 + list_del_init(&epi->rdllink); 1021 949 } 1022 950 } 951 + ep_done_scan(ep, &txlist); 1023 952 mutex_unlock(&ep->mtx); 1024 953 return res; 1025 954 } ··· 1151 1114 return -ENOMEM; 1152 1115 1153 1116 mutex_init(&ep->mtx); 1117 + rwlock_init(&ep->lock); 1154 1118 init_waitqueue_head(&ep->wq); 1155 1119 init_waitqueue_head(&ep->poll_wait); 1156 - init_llist_head(&ep->rdllist); 1120 + INIT_LIST_HEAD(&ep->rdllist); 1157 1121 ep->rbr = RB_ROOT_CACHED; 1122 + ep->ovflist = EP_UNACTIVE_PTR; 1158 1123 ep->user = get_current_user(); 1159 1124 refcount_set(&ep->refcount, 1); 1160 1125 ··· 1239 1200 #endif /* CONFIG_KCMP */ 1240 1201 1241 1202 /* 1203 + * Adds a new entry to the tail of the list in a lockless way, i.e. 
1204 + * multiple CPUs are allowed to call this function concurrently. 1205 + * 1206 + * Beware: it is necessary to prevent any other modifications of the 1207 + * existing list until all changes are completed, in other words 1208 + * concurrent list_add_tail_lockless() calls should be protected 1209 + * with a read lock, where write lock acts as a barrier which 1210 + * makes sure all list_add_tail_lockless() calls are fully 1211 + * completed. 1212 + * 1213 + * Also an element can be locklessly added to the list only in one 1214 + * direction i.e. either to the tail or to the head, otherwise 1215 + * concurrent access will corrupt the list. 1216 + * 1217 + * Return: %false if element has been already added to the list, %true 1218 + * otherwise. 1219 + */ 1220 + static inline bool list_add_tail_lockless(struct list_head *new, 1221 + struct list_head *head) 1222 + { 1223 + struct list_head *prev; 1224 + 1225 + /* 1226 + * This is simple 'new->next = head' operation, but cmpxchg() 1227 + * is used in order to detect that same element has been just 1228 + * added to the list from another CPU: the winner observes 1229 + * new->next == new. 1230 + */ 1231 + if (!try_cmpxchg(&new->next, &new, head)) 1232 + return false; 1233 + 1234 + /* 1235 + * Initially ->next of a new element must be updated with the head 1236 + * (we are inserting to the tail) and only then pointers are atomically 1237 + * exchanged. XCHG guarantees memory ordering, thus ->next should be 1238 + * updated before pointers are actually swapped and pointers are 1239 + * swapped before prev->next is updated. 1240 + */ 1241 + 1242 + prev = xchg(&head->prev, new); 1243 + 1244 + /* 1245 + * It is safe to modify prev->next and new->prev, because a new element 1246 + * is added only to the tail and new->next is updated before XCHG. 
1247 + */ 1248 + 1249 + prev->next = new; 1250 + new->prev = prev; 1251 + 1252 + return true; 1253 + } 1254 + 1255 + /* 1256 + * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, 1257 + * i.e. multiple CPUs are allowed to call this function concurrently. 1258 + * 1259 + * Return: %false if epi element has been already chained, %true otherwise. 1260 + */ 1261 + static inline bool chain_epi_lockless(struct epitem *epi) 1262 + { 1263 + struct eventpoll *ep = epi->ep; 1264 + 1265 + /* Fast preliminary check */ 1266 + if (epi->next != EP_UNACTIVE_PTR) 1267 + return false; 1268 + 1269 + /* Check that the same epi has not been just chained from another CPU */ 1270 + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) 1271 + return false; 1272 + 1273 + /* Atomically exchange tail */ 1274 + epi->next = xchg(&ep->ovflist, epi); 1275 + 1276 + return true; 1277 + } 1278 + 1279 + /* 1242 1280 * This is the callback that is passed to the wait queue wakeup 1243 1281 * mechanism. It is called by the stored file descriptors when they 1244 1282 * have events to report. 1283 + * 1284 + * This callback takes a read lock in order not to contend with concurrent 1285 + * events from another file descriptor, thus all modifications to ->rdllist 1286 + * or ->ovflist are lockless. Read lock is paired with the write lock from 1287 + * ep_start/done_scan(), which stops all list modifications and guarantees 1288 + * that lists state is seen correctly. 
1245 1289 * 1246 1290 * Another thing worth to mention is that ep_poll_callback() can be called 1247 1291 * concurrently for the same @epi from different CPUs if poll table was inited ··· 1335 1213 */ 1336 1214 static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 1337 1215 { 1216 + int pwake = 0; 1338 1217 struct epitem *epi = ep_item_from_wait(wait); 1339 1218 struct eventpoll *ep = epi->ep; 1340 1219 __poll_t pollflags = key_to_poll(key); 1220 + unsigned long flags; 1341 1221 int ewake = 0; 1222 + 1223 + read_lock_irqsave(&ep->lock, flags); 1342 1224 1343 1225 ep_set_busy_poll_napi_id(epi); 1344 1226 ··· 1353 1227 * until the next EPOLL_CTL_MOD will be issued. 1354 1228 */ 1355 1229 if (!(epi->event.events & ~EP_PRIVATE_BITS)) 1356 - goto out; 1230 + goto out_unlock; 1357 1231 1358 1232 /* 1359 1233 * Check the events coming with the callback. At this stage, not ··· 1362 1236 * test for "key" != NULL before the event match test. 1363 1237 */ 1364 1238 if (pollflags && !(pollflags & epi->event.events)) 1365 - goto out; 1239 + goto out_unlock; 1366 1240 1367 - ep_pm_stay_awake_rcu(epi); 1368 - epitem_ready(epi); 1241 + /* 1242 + * If we are transferring events to userspace, we can hold no locks 1243 + * (because we're accessing user memory, and because of linux f_op->poll() 1244 + * semantics). All the events that happen during that period of time are 1245 + * chained in ep->ovflist and requeued later on. 1246 + */ 1247 + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { 1248 + if (chain_epi_lockless(epi)) 1249 + ep_pm_stay_awake_rcu(epi); 1250 + } else if (!ep_is_linked(epi)) { 1251 + /* In the usual case, add event to ready list. 
*/ 1252 + if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) 1253 + ep_pm_stay_awake_rcu(epi); 1254 + } 1369 1255 1370 1256 /* 1371 1257 * Wake up ( if active ) both the eventpoll wait list and the ->poll() ··· 1406 1268 wake_up(&ep->wq); 1407 1269 } 1408 1270 if (waitqueue_active(&ep->poll_wait)) 1271 + pwake++; 1272 + 1273 + out_unlock: 1274 + read_unlock_irqrestore(&ep->lock, flags); 1275 + 1276 + /* We have to call this outside the lock */ 1277 + if (pwake) 1409 1278 ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); 1410 1279 1411 - out: 1412 1280 if (!(epi->event.events & EPOLLEXCLUSIVE)) 1413 1281 ewake = 1; 1414 1282 ··· 1659 1515 if (is_file_epoll(tfile)) 1660 1516 tep = tfile->private_data; 1661 1517 1518 + lockdep_assert_irqs_enabled(); 1519 + 1662 1520 if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, 1663 1521 max_user_watches) >= 0)) 1664 1522 return -ENOSPC; ··· 1672 1526 } 1673 1527 1674 1528 /* Item initialization follow here ... */ 1675 - init_llist_node(&epi->rdllink); 1529 + INIT_LIST_HEAD(&epi->rdllink); 1676 1530 epi->ep = ep; 1677 1531 ep_set_ffd(&epi->ffd, tfile, fd); 1678 1532 epi->event = *event; 1533 + epi->next = EP_UNACTIVE_PTR; 1679 1534 1680 1535 if (tep) 1681 1536 mutex_lock_nested(&tep->mtx, 1); ··· 1743 1596 return -ENOMEM; 1744 1597 } 1745 1598 1599 + /* We have to drop the new item inside our item list to keep track of it */ 1600 + write_lock_irq(&ep->lock); 1601 + 1746 1602 /* record NAPI ID of new item if present */ 1747 1603 ep_set_busy_poll_napi_id(epi); 1748 1604 1749 1605 /* If the file is already "ready" we drop it inside the ready list */ 1750 - if (revents) { 1606 + if (revents && !ep_is_linked(epi)) { 1607 + list_add_tail(&epi->rdllink, &ep->rdllist); 1751 1608 ep_pm_stay_awake(epi); 1752 - epitem_ready(epi); 1753 1609 1754 1610 /* Notify waiting tasks that events are available */ 1755 1611 if (waitqueue_active(&ep->wq)) ··· 1760 1610 if (waitqueue_active(&ep->poll_wait)) 1761 1611 pwake++; 1762 
1612 } 1613 + 1614 + write_unlock_irq(&ep->lock); 1763 1615 1764 1616 /* We have to call this outside the lock */ 1765 1617 if (pwake) ··· 1777 1625 static int ep_modify(struct eventpoll *ep, struct epitem *epi, 1778 1626 const struct epoll_event *event) 1779 1627 { 1628 + int pwake = 0; 1780 1629 poll_table pt; 1630 + 1631 + lockdep_assert_irqs_enabled(); 1781 1632 1782 1633 init_poll_funcptr(&pt, NULL); 1783 1634 ··· 1825 1670 * list, push it inside. 1826 1671 */ 1827 1672 if (ep_item_poll(epi, &pt, 1)) { 1828 - ep_pm_stay_awake(epi); 1829 - epitem_ready(epi); 1673 + write_lock_irq(&ep->lock); 1674 + if (!ep_is_linked(epi)) { 1675 + list_add_tail(&epi->rdllink, &ep->rdllist); 1676 + ep_pm_stay_awake(epi); 1830 1677 1831 - /* Notify waiting tasks that events are available */ 1832 - if (waitqueue_active(&ep->wq)) 1833 - wake_up(&ep->wq); 1834 - if (waitqueue_active(&ep->poll_wait)) 1835 - ep_poll_safewake(ep, NULL, 0); 1678 + /* Notify waiting tasks that events are available */ 1679 + if (waitqueue_active(&ep->wq)) 1680 + wake_up(&ep->wq); 1681 + if (waitqueue_active(&ep->poll_wait)) 1682 + pwake++; 1683 + } 1684 + write_unlock_irq(&ep->lock); 1836 1685 } 1686 + 1687 + /* We have to call this outside the lock */ 1688 + if (pwake) 1689 + ep_poll_safewake(ep, NULL, 0); 1837 1690 1838 1691 return 0; 1839 1692 } ··· 1850 1687 struct epoll_event __user *events, int maxevents) 1851 1688 { 1852 1689 struct epitem *epi, *tmp; 1853 - LLIST_HEAD(txlist); 1690 + LIST_HEAD(txlist); 1854 1691 poll_table pt; 1855 1692 int res = 0; 1856 1693 ··· 1865 1702 init_poll_funcptr(&pt, NULL); 1866 1703 1867 1704 mutex_lock(&ep->mtx); 1705 + ep_start_scan(ep, &txlist); 1868 1706 1869 - while (res < maxevents) { 1707 + /* 1708 + * We can loop without lock because we are passed a task private list. 1709 + * Items cannot vanish during the loop we are holding ep->mtx. 
1710 + */ 1711 + list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { 1870 1712 struct wakeup_source *ws; 1871 - struct llist_node *n; 1872 1713 __poll_t revents; 1873 1714 1874 - n = llist_del_first(&ep->rdllist); 1875 - if (!n) 1715 + if (res >= maxevents) 1876 1716 break; 1877 - 1878 - epi = llist_entry(n, struct epitem, rdllink); 1879 1717 1880 1718 /* 1881 1719 * Activate ep->ws before deactivating epi->ws to prevent ··· 1894 1730 __pm_relax(ws); 1895 1731 } 1896 1732 1733 + list_del_init(&epi->rdllink); 1734 + 1897 1735 /* 1898 1736 * If the event mask intersect the caller-requested one, 1899 1737 * deliver the event to userspace. Again, we are holding ep->mtx, 1900 1738 * so no operations coming from userspace can change the item. 1901 1739 */ 1902 1740 revents = ep_item_poll(epi, &pt, 1); 1903 - if (!revents) { 1904 - init_llist_node(n); 1905 - 1906 - /* 1907 - * Just in case epi becomes ready after ep_item_poll() above, but before 1908 - * init_llist_node(). Make sure to add it to the ready list, otherwise an 1909 - * event may be lost. 
1910 - */ 1911 - if (unlikely(ep_item_poll(epi, &pt, 1))) { 1912 - ep_pm_stay_awake(epi); 1913 - epitem_ready(epi); 1914 - } 1741 + if (!revents) 1915 1742 continue; 1916 - } 1917 1743 1918 1744 events = epoll_put_uevent(revents, epi->event.data, events); 1919 1745 if (!events) { 1920 - llist_add(&epi->rdllink, &ep->rdllist); 1746 + list_add(&epi->rdllink, &txlist); 1747 + ep_pm_stay_awake(epi); 1921 1748 if (!res) 1922 1749 res = -EFAULT; 1923 1750 break; ··· 1916 1761 res++; 1917 1762 if (epi->event.events & EPOLLONESHOT) 1918 1763 epi->event.events &= EP_PRIVATE_BITS; 1919 - __llist_add(n, &txlist); 1920 - } 1921 - 1922 - llist_for_each_entry_safe(epi, tmp, txlist.first, rdllink) { 1923 - init_llist_node(&epi->rdllink); 1924 - 1925 - if (!(epi->event.events & EPOLLET)) { 1764 + else if (!(epi->event.events & EPOLLET)) { 1926 1765 /* 1927 - * If this file has been added with Level Trigger mode, we need to insert 1928 - * back inside the ready list, so that the next call to epoll_wait() will 1929 - * check again the events availability. 1766 + * If this file has been added with Level 1767 + * Trigger mode, we need to insert back inside 1768 + * the ready list, so that the next call to 1769 + * epoll_wait() will check again the events 1770 + * availability. At this point, no one can insert 1771 + * into ep->rdllist besides us. The epoll_ctl() 1772 + * callers are locked out by 1773 + * ep_send_events() holding "mtx" and the 1774 + * poll callback will queue them in ep->ovflist. 
1930 1775 */ 1776 + list_add_tail(&epi->rdllink, &ep->rdllist); 1931 1777 ep_pm_stay_awake(epi); 1932 - epitem_ready(epi); 1933 1778 } 1934 1779 } 1935 - 1936 - __pm_relax(ep->ws); 1780 + ep_done_scan(ep, &txlist); 1937 1781 mutex_unlock(&ep->mtx); 1938 - 1939 - if (!llist_empty(&ep->rdllist)) { 1940 - if (waitqueue_active(&ep->wq)) 1941 - wake_up(&ep->wq); 1942 - } 1943 1782 1944 1783 return res; 1945 1784 } ··· 2027 1878 wait_queue_entry_t wait; 2028 1879 ktime_t expires, *to = NULL; 2029 1880 1881 + lockdep_assert_irqs_enabled(); 1882 + 2030 1883 if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { 2031 1884 slack = select_estimate_accuracy(timeout); 2032 1885 to = &expires; ··· 2088 1937 init_wait(&wait); 2089 1938 wait.func = ep_autoremove_wake_function; 2090 1939 2091 - prepare_to_wait_exclusive(&ep->wq, &wait, TASK_INTERRUPTIBLE); 1940 + write_lock_irq(&ep->lock); 1941 + /* 1942 + * Barrierless variant, waitqueue_active() is called under 1943 + * the same lock on wakeup ep_poll_callback() side, so it 1944 + * is safe to avoid an explicit barrier. 1945 + */ 1946 + __set_current_state(TASK_INTERRUPTIBLE); 2092 1947 2093 - if (!ep_events_available(ep)) 1948 + /* 1949 + * Do the final check under the lock. ep_start/done_scan() 1950 + * plays with two lists (->rdllist and ->ovflist) and there 1951 + * is always a race when both lists are empty for short 1952 + * period of time although events are pending, so lock is 1953 + * important. 1954 + */ 1955 + eavail = ep_events_available(ep); 1956 + if (!eavail) 1957 + __add_wait_queue_exclusive(&ep->wq, &wait); 1958 + 1959 + write_unlock_irq(&ep->lock); 1960 + 1961 + if (!eavail) 2094 1962 timed_out = !ep_schedule_timeout(to) || 2095 1963 !schedule_hrtimeout_range(to, slack, 2096 1964 HRTIMER_MODE_ABS); 1965 + __set_current_state(TASK_RUNNING); 2097 1966 2098 - finish_wait(&ep->wq, &wait); 2099 - eavail = ep_events_available(ep); 1967 + /* 1968 + * We were woken up, thus go and try to harvest some events. 
1969 + * If timed out and still on the wait queue, recheck eavail 1970 + * carefully under lock, below. 1971 + */ 1972 + eavail = 1; 1973 + 1974 + if (!list_empty_careful(&wait.entry)) { 1975 + write_lock_irq(&ep->lock); 1976 + /* 1977 + * If the thread timed out and is not on the wait queue, 1978 + * it means that the thread was woken up after its 1979 + * timeout expired before it could reacquire the lock. 1980 + * Thus, when wait.entry is empty, it needs to harvest 1981 + * events. 1982 + */ 1983 + if (timed_out) 1984 + eavail = list_empty(&wait.entry); 1985 + __remove_wait_queue(&ep->wq, &wait); 1986 + write_unlock_irq(&ep->lock); 1987 + } 2100 1988 } 2101 1989 } 2102 1990