Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Fix inotify watch removal/umount races

Inotify watch removals suck violently.

To kick the watch out we need (in this order) inode->inotify_mutex and
ih->mutex. That's fine if we have a hold on inode; however, for all
other cases we need to make damn sure we don't race with umount. We can
*NOT* just grab a reference to a watch - inotify_unmount_inodes() will
happily sail past it and we'll end with reference to inode potentially
outliving its superblock.

Ideally we just want to grab an active reference to superblock if we
can; that will make sure we won't go into inotify_unmount_inodes() until
we are done. Cleanup is just deactivate_super().

However, that leaves a messy case - what if we *are* racing with
umount() and active references to superblock can't be acquired anymore?
We can bump ->s_count, grab ->s_umount, which will almost certainly wait
until the superblock is shut down and the watch in question is pining
for fjords. That's fine, but there is a problem - we might have hit the
window between ->s_active reaching 0 / ->s_count dropping below S_BIAS
(i.e. the moment when the superblock is past the point of no return and
is heading for shutdown) and the moment when deactivate_super() acquires
->s_umount.

We could just do drop_super(), yield() and retry, but that's rather
antisocial and this stuff is luser-triggerable. OTOH, having grabbed
->s_umount and having found that we'd got there first (i.e. that
->s_root is non-NULL) we know that we won't race with
inotify_unmount_inodes().

So we could grab a reference to watch and do the rest as above, just
with drop_super() instead of deactivate_super(), right? Wrong. We had
to drop ih->mutex before we could grab ->s_umount. So the watch
could've been gone already.

That still can be dealt with - we need to save watch->wd, do idr_find()
and compare its result with our pointer. If they match, we either have
the damn thing still alive or we'd lost not one but two races at once,
the watch had been killed and a new one got created with the same ->wd
at the same address. That couldn't have happened in inotify_destroy(),
but inotify_rm_wd() could run into that. Still, "new one got created"
is not a problem - we have every right to kill it or leave it alone,
whatever's more convenient.

So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
"grab it and kill it" check. If it's been our original watch, we are
fine, if it's a newcomer - nevermind, just pretend that we'd won the
race and kill the fscker anyway; we are safe since we know that its
superblock won't be going away.

And yes, this is far beyond mere "not very pretty"; so's the entire
concept of inotify to start with.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Al Viro, committed by Linus Torvalds
8f7b0ba1 0d3b7100

4 files changed, +220 -50

fs/inotify.c (+144 -6)
```diff
 }
 EXPORT_SYMBOL_GPL(get_inotify_watch);
 
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		atomic_inc(&watch->count);
+		return 1;
+	}
+	spin_unlock(&sb_lock);
+	return 0;
+}
+
 /**
  * put_inotify_watch - decrements the ref count on a given watch. cleans up
  * watch references if the count reaches zero. inotify_watch is freed by
···
 	}
 }
 EXPORT_SYMBOL_GPL(put_inotify_watch);
+
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	deactivate_super(sb);
+}
 
 /*
  * inotify_handle_get_wd - returns the next WD for use by the given handle
···
 }
 EXPORT_SYMBOL_GPL(inotify_init_watch);
 
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_unmount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super() yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_unmount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	s32 wd = watch->wd;
+
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		get_inotify_watch(watch);
+		mutex_unlock(&ih->mutex);
+		return 1;	/* the best outcome */
+	}
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+	mutex_unlock(&ih->mutex);	/* can't grab ->s_umount under it */
+	down_read(&sb->s_umount);
+	if (likely(!sb->s_root)) {
+		/* fs is already shut down; the watch is dead */
+		drop_super(sb);
+		return 0;
+	}
+	/* raced with the final deactivate_super() */
+	mutex_lock(&ih->mutex);
+	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+		/* the watch is dead */
+		mutex_unlock(&ih->mutex);
+		drop_super(sb);
+		return 0;
+	}
+	/* still alive or freed and reused with the same sb and wd; kill */
+	get_inotify_watch(watch);
+	mutex_unlock(&ih->mutex);
+	return 2;
+}
+
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	switch (how) {
+	case 1:
+		deactivate_super(sb);
+		break;
+	case 2:
+		drop_super(sb);
+	}
+}
+
 /**
  * inotify_destroy - clean up and destroy an inotify instance
  * @ih: inotify handle
···
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
 	 * hold inode->inotify_mutex before ih->mutex.  The following works.
+	 *
+	 * AV: it had to become even uglier to start working ;-/
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
+		struct super_block *sb;
 		struct inode *inode;
+		int how;
 
 		mutex_lock(&ih->mutex);
 		watches = &ih->watches;
···
 			break;
 		}
 		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
+		sb = watch->inode->i_sb;
+		how = pin_to_kill(ih, watch);
+		if (!how)
+			continue;
 
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
···
 
 		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(watch);
+		unpin_and_kill(watch, how);
 	}
 
 	/* free this handle: the put matching the get in inotify_init() */
···
 int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
 	struct inotify_watch *watch;
+	struct super_block *sb;
 	struct inode *inode;
+	int how;
 
 	mutex_lock(&ih->mutex);
 	watch = idr_find(&ih->idr, wd);
···
 		mutex_unlock(&ih->mutex);
 		return -EINVAL;
 	}
-	get_inotify_watch(watch);
+	sb = watch->inode->i_sb;
+	how = pin_to_kill(ih, watch);
+	if (!how)
+		return 0;
+
 	inode = watch->inode;
-	mutex_unlock(&ih->mutex);
 
 	mutex_lock(&inode->inotify_mutex);
 	mutex_lock(&ih->mutex);
···
 
 	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(watch);
+	unpin_and_kill(watch, how);
 
 	return 0;
 }
```
include/linux/inotify.h (+11)
```diff
 					     struct inotify_watch *);
 extern void get_inotify_watch(struct inotify_watch *);
 extern void put_inotify_watch(struct inotify_watch *);
+extern int pin_inotify_watch(struct inotify_watch *);
+extern void unpin_inotify_watch(struct inotify_watch *);
 
 #else
 
···
 }
 
 static inline void put_inotify_watch(struct inotify_watch *watch)
+{
+}
+
+extern inline int pin_inotify_watch(struct inotify_watch *watch)
+{
+	return 0;
+}
+
+extern inline void unpin_inotify_watch(struct inotify_watch *watch)
 {
 }
 
```
kernel/audit_tree.c (+56 -39)
```diff
 	struct list_head trees;	/* with root here */
 	int dead;
 	int count;
+	atomic_long_t refs;
 	struct rcu_head head;
 	struct node {
 		struct list_head list;
···
  * tree is refcounted; one reference for "some rules on rules_list refer to
  * it", one for each chunk with pointer to it.
  *
- * chunk is refcounted by embedded inotify_watch.
+ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
  *
  * node.index allows to get from node.list to containing chunk.
  * MSB of that sucker is stolen to mark taggings that we might have to
···
 	INIT_LIST_HEAD(&chunk->hash);
 	INIT_LIST_HEAD(&chunk->trees);
 	chunk->count = count;
+	atomic_long_set(&chunk->refs, 1);
 	for (i = 0; i < count; i++) {
 		INIT_LIST_HEAD(&chunk->owners[i].list);
 		chunk->owners[i].index = i;
···
 	return chunk;
 }
 
-static void __free_chunk(struct rcu_head *rcu)
+static void free_chunk(struct audit_chunk *chunk)
 {
-	struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
 	int i;
 
 	for (i = 0; i < chunk->count; i++) {
···
 	kfree(chunk);
 }
 
-static inline void free_chunk(struct audit_chunk *chunk)
-{
-	call_rcu(&chunk->head, __free_chunk);
-}
-
 void audit_put_chunk(struct audit_chunk *chunk)
 {
-	put_inotify_watch(&chunk->watch);
+	if (atomic_long_dec_and_test(&chunk->refs))
+		free_chunk(chunk);
+}
+
+static void __put_chunk(struct rcu_head *rcu)
+{
+	struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+	audit_put_chunk(chunk);
 }
 
 enum {HASH_SIZE = 128};
···
 
 	list_for_each_entry_rcu(p, list, hash) {
 		if (p->watch.inode == inode) {
-			get_inotify_watch(&p->watch);
+			atomic_long_inc(&p->refs);
 			return p;
 		}
 	}
···
 
 /* tagging and untagging inodes with trees */
 
-static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+static struct audit_chunk *find_chunk(struct node *p)
 {
+	int index = p->index & ~(1U<<31);
+	p -= index;
+	return container_of(p, struct audit_chunk, owners[0]);
+}
+
+static void untag_chunk(struct node *p)
+{
+	struct audit_chunk *chunk = find_chunk(p);
 	struct audit_chunk *new;
 	struct audit_tree *owner;
 	int size = chunk->count - 1;
 	int i, j;
 
+	if (!pin_inotify_watch(&chunk->watch)) {
+		/*
+		 * Filesystem is shutting down; all watches are getting
+		 * evicted, just take it off the node list for this
+		 * tree and let the eviction logics take care of the
+		 * rest.
+		 */
+		owner = p->owner;
+		if (owner->root == chunk) {
+			list_del_init(&owner->same_root);
+			owner->root = NULL;
+		}
+		list_del_init(&p->list);
+		p->owner = NULL;
+		put_tree(owner);
+		return;
+	}
+
+	spin_unlock(&hash_lock);
+
+	/*
+	 * pin_inotify_watch() succeeded, so the watch won't go away
+	 * from under us.
+	 */
 	mutex_lock(&chunk->watch.inode->inotify_mutex);
 	if (chunk->dead) {
 		mutex_unlock(&chunk->watch.inode->inotify_mutex);
-		return;
+		goto out;
 	}
 
 	owner = p->owner;
···
 		inotify_evict_watch(&chunk->watch);
 		mutex_unlock(&chunk->watch.inode->inotify_mutex);
 		put_inotify_watch(&chunk->watch);
-		return;
+		goto out;
 	}
 
 	new = alloc_chunk(size);
···
 	inotify_evict_watch(&chunk->watch);
 	mutex_unlock(&chunk->watch.inode->inotify_mutex);
 	put_inotify_watch(&chunk->watch);
-	return;
+	goto out;
 
 Fallback:
 	// do the best we can
···
 	put_tree(owner);
 	spin_unlock(&hash_lock);
 	mutex_unlock(&chunk->watch.inode->inotify_mutex);
+out:
+	unpin_inotify_watch(&chunk->watch);
+	spin_lock(&hash_lock);
 }
 
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
···
 	return 0;
 }
 
-static struct audit_chunk *find_chunk(struct node *p)
-{
-	int index = p->index & ~(1U<<31);
-	p -= index;
-	return container_of(p, struct audit_chunk, owners[0]);
-}
-
 static void kill_rules(struct audit_tree *tree)
 {
 	struct audit_krule *rule, *next;
···
 	spin_lock(&hash_lock);
 	while (!list_empty(&victim->chunks)) {
 		struct node *p;
-		struct audit_chunk *chunk;
 
 		p = list_entry(victim->chunks.next, struct node, list);
-		chunk = find_chunk(p);
-		get_inotify_watch(&chunk->watch);
-		spin_unlock(&hash_lock);
 
-		untag_chunk(chunk, p);
-
-		put_inotify_watch(&chunk->watch);
-		spin_lock(&hash_lock);
+		untag_chunk(p);
 	}
 	spin_unlock(&hash_lock);
 	put_tree(victim);
···
 
 	while (!list_empty(&tree->chunks)) {
 		struct node *node;
-		struct audit_chunk *chunk;
 
 		node = list_entry(tree->chunks.next, struct node, list);
 
···
 		if (!(node->index & (1U<<31)))
 			break;
 
-		chunk = find_chunk(node);
-		get_inotify_watch(&chunk->watch);
-		spin_unlock(&hash_lock);
-
-		untag_chunk(chunk, node);
-
-		put_inotify_watch(&chunk->watch);
-		spin_lock(&hash_lock);
+		untag_chunk(node);
 	}
 	if (!tree->root && !tree->goner) {
 		tree->goner = 1;
···
 static void destroy_watch(struct inotify_watch *watch)
 {
 	struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-	free_chunk(chunk);
+	call_rcu(&chunk->head, __put_chunk);
 }
 
 static const struct inotify_operations rtree_inotify_ops = {
```
kernel/auditfilter.c (+9 -5)
```diff
 	list_for_each_entry_safe(p, n, in_list, ilist) {
 		list_del(&p->ilist);
 		inotify_rm_watch(audit_ih, &p->wdata);
-		/* the put matching the get in audit_do_del_rule() */
-		put_inotify_watch(&p->wdata);
+		/* the unpin matching the pin in audit_do_del_rule() */
+		unpin_inotify_watch(&p->wdata);
 	}
 }
 
···
 				/* Put parent on the inotify un-registration
 				 * list.  Grab a reference before releasing
 				 * audit_filter_mutex, to be released in
-				 * audit_inotify_unregister(). */
-				list_add(&parent->ilist, &inotify_list);
-				get_inotify_watch(&parent->wdata);
+				 * audit_inotify_unregister().
+				 * If filesystem is going away, just leave
+				 * the sucker alone, eviction will take
+				 * care of it.
+				 */
+				if (pin_inotify_watch(&parent->wdata))
+					list_add(&parent->ilist, &inotify_list);
 			}
 		}
 	}
```