Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Florian Westphal says:

====================
netfilter: updates for net-next

The following patchset contains Netfilter updates for *net-next*,
including changes to IPv6 stack and updates to IPVS from Julian Anastasov.

1) ipv6: export fib6_lookup for nft_fib_ipv6 module
2) factor out ipv6_anycast_destination logic so it's usable without
dst_entry. These are dependencies for patch 3.
3) switch nft_fib_ipv6 module to no longer need temporary dst_entry
object allocations by using fib6_lookup() + RCU.
This gets us ~13% higher packet rate in my tests.

Patches 4 to 8, from Eric Dumazet, zap sk_callback_lock usage in
netfilter. Patch 9 removes another sk_callback_lock instance.

Remaining patches, from Julian Anastasov, improve IPVS. Quoting Julian:
* Add infrastructure for resizable hash tables based on hlist_bl.
* Change the 256-bucket service hash table to be resizable.
* Change the global connection table to be per-net and resizable.
* Make connection hashing more secure for setups with multiple services.

netfilter pull request nf-next-26-03-04

* tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
ipvs: use more keys for connection hashing
ipvs: switch to per-net connection table
ipvs: use resizable hash table for services
ipvs: add resizable hash tables
rculist_bl: add hlist_bl_for_each_entry_continue_rcu
netfilter: nfnetlink_queue: remove locking in nfqnl_get_sk_secctx
netfilter: nfnetlink_queue: no longer acquire sk_callback_lock
netfilter: nfnetlink_log: no longer acquire sk_callback_lock
netfilter: nft_meta: no longer acquire sk_callback_lock in nft_meta_get_eval_skugid()
netfilter: xt_owner: no longer acquire sk_callback_lock in mt_owner()
netfilter: nf_log_syslog: no longer acquire sk_callback_lock in nf_log_dump_sk_uid_gid()
netfilter: nft_fib_ipv6: switch to fib6_lookup
ipv6: make ipv6_anycast_destination logic usable without dst_entry
ipv6: export fib6_lookup for nft_fib_ipv6
====================

Link: https://patch.msgid.link/20260304114921.31042-1-fw@strlen.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+2012 -514
+40 -9
include/linux/rculist_bl.h
··· 8 8 #include <linux/list_bl.h> 9 9 #include <linux/rcupdate.h> 10 10 11 + /* return the first ptr or next element in an RCU protected list */ 12 + #define hlist_bl_first_rcu(head) \ 13 + (*((struct hlist_bl_node __rcu **)(&(head)->first))) 14 + #define hlist_bl_next_rcu(node) \ 15 + (*((struct hlist_bl_node __rcu **)(&(node)->next))) 16 + 11 17 static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, 12 18 struct hlist_bl_node *n) 13 19 { 14 20 LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); 15 21 LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != 16 22 LIST_BL_LOCKMASK); 17 - rcu_assign_pointer(h->first, 23 + rcu_assign_pointer(hlist_bl_first_rcu(h), 18 24 (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); 19 25 } 20 26 21 - static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) 22 - { 23 - return (struct hlist_bl_node *) 24 - ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); 25 - } 27 + #define hlist_bl_first_rcu_dereference(head) \ 28 + ({ \ 29 + struct hlist_bl_head *__head = (head); \ 30 + \ 31 + (struct hlist_bl_node *) \ 32 + ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ 33 + hlist_bl_is_locked(__head)) & \ 34 + ~LIST_BL_LOCKMASK); \ 35 + }) 26 36 27 37 /** 28 38 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization ··· 83 73 { 84 74 struct hlist_bl_node *first; 85 75 86 - /* don't need hlist_bl_first_rcu because we're under lock */ 76 + /* don't need hlist_bl_first_rcu* because we're under lock */ 87 77 first = hlist_bl_first(h); 88 78 89 79 n->next = first; ··· 103 93 * 104 94 */ 105 95 #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ 106 - for (pos = hlist_bl_first_rcu(head); \ 96 + for (pos = hlist_bl_first_rcu_dereference(head); \ 107 97 pos && \ 108 98 ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ 109 - pos = rcu_dereference_raw(pos->next)) 99 + pos = 
rcu_dereference_raw(hlist_bl_next_rcu(pos))) 100 + 101 + /** 102 + * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given 103 + * type 104 + * @tpos: the type * to use as a loop cursor. 105 + * @pos: the &struct hlist_bl_node to use as a loop cursor. 106 + * @member: the name of the hlist_bl_node within the struct. 107 + * 108 + * Continue to iterate over list of given type, continuing after 109 + * the current position which must have been in the list when the RCU read 110 + * lock was taken. 111 + * This would typically require either that you obtained the node from a 112 + * previous walk of the list in the same RCU read-side critical section, or 113 + * that you held some sort of non-RCU reference (such as a reference count) 114 + * to keep the node alive *and* in the list. 115 + */ 116 + #define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ 117 + for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ 118 + pos && \ 119 + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ 120 + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) 110 121 111 122 #endif
+11 -4
include/net/ip6_route.h
··· 252 252 return rt->rt6i_flags & RTF_LOCAL; 253 253 } 254 254 255 + static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst, 256 + u32 rt6i_flags, 257 + const struct in6_addr *daddr) 258 + { 259 + return rt6i_flags & RTF_ANYCAST || 260 + (rt6i_dst->plen < 127 && 261 + !(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && 262 + ipv6_addr_equal(&rt6i_dst->addr, daddr)); 263 + } 264 + 255 265 static inline bool ipv6_anycast_destination(const struct dst_entry *dst, 256 266 const struct in6_addr *daddr) 257 267 { 258 268 const struct rt6_info *rt = dst_rt6_info(dst); 259 269 260 - return rt->rt6i_flags & RTF_ANYCAST || 261 - (rt->rt6i_dst.plen < 127 && 262 - !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && 263 - ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); 270 + return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr); 264 271 } 265 272 266 273 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+330 -55
include/net/ip_vs.h
··· 11 11 #include <asm/types.h> /* for __uXX types */ 12 12 13 13 #include <linux/list.h> /* for struct list_head */ 14 + #include <linux/rculist_bl.h> /* for struct hlist_bl_head */ 14 15 #include <linux/spinlock.h> /* for struct rwlock_t */ 15 16 #include <linux/atomic.h> /* for struct atomic_t */ 16 17 #include <linux/refcount.h> /* for struct refcount_t */ ··· 31 30 #endif 32 31 #include <net/net_namespace.h> /* Netw namespace */ 33 32 #include <linux/sched/isolation.h> 33 + #include <linux/siphash.h> 34 34 35 35 #define IP_VS_HDR_INVERSE 1 36 36 #define IP_VS_HDR_ICMP 2 37 - /* 38 - * Hash table: for virtual service lookups 39 - */ 40 - #define IP_VS_SVC_TAB_BITS 8 41 - #define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) 42 - #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) 37 + 38 + /* conn_tab limits (as per Kconfig) */ 39 + #define IP_VS_CONN_TAB_MIN_BITS 8 40 + #if BITS_PER_LONG > 32 41 + #define IP_VS_CONN_TAB_MAX_BITS 27 42 + #else 43 + #define IP_VS_CONN_TAB_MAX_BITS 20 44 + #endif 45 + 46 + /* svc_table limits */ 47 + #define IP_VS_SVC_TAB_MIN_BITS 4 48 + #define IP_VS_SVC_TAB_MAX_BITS 20 43 49 44 50 /* Generic access of ipvs struct */ 45 51 static inline struct netns_ipvs *net_ipvs(struct net* net) ··· 56 48 57 49 /* Connections' size value needed by ip_vs_ctl.c */ 58 50 extern int ip_vs_conn_tab_size; 59 - 60 - extern struct mutex __ip_vs_mutex; 61 51 62 52 struct ip_vs_iphdr { 63 53 int hdr_flags; /* ipvs flags */ ··· 277 271 pr_err(msg, ##__VA_ARGS__); \ 278 272 } while (0) 279 273 274 + struct ip_vs_aligned_lock { 275 + spinlock_t l; /* Protect buckets */ 276 + } ____cacheline_aligned_in_smp; 277 + 280 278 /* For arrays per family */ 281 279 enum { 282 280 IP_VS_AF_INET, ··· 292 282 { 293 283 return af == AF_INET6 ? 
IP_VS_AF_INET6 : IP_VS_AF_INET; 294 284 } 285 + 286 + /* work_flags */ 287 + enum { 288 + IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ 289 + IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ 290 + IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */ 291 + }; 295 292 296 293 /* The port number of FTP service (in network order). */ 297 294 #define FTPPORT cpu_to_be16(21) ··· 501 484 int est_row; /* estimated row */ 502 485 }; 503 486 487 + /* IPVS resizable hash tables */ 488 + struct ip_vs_rht { 489 + struct hlist_bl_head *buckets; 490 + struct ip_vs_rht __rcu *new_tbl; /* New/Same table */ 491 + seqcount_t *seqc; /* Protects moves */ 492 + struct ip_vs_aligned_lock *lock; /* Protect seqc */ 493 + int mask; /* Buckets mask */ 494 + int size; /* Buckets */ 495 + int seqc_mask; /* seqc mask */ 496 + int lock_mask; /* lock mask */ 497 + u32 table_id; 498 + int u_thresh; /* upper threshold */ 499 + int l_thresh; /* lower threshold */ 500 + int lfactor; /* Load Factor (shift)*/ 501 + int bits; /* size = 1 << bits */ 502 + siphash_key_t hash_key; 503 + struct rcu_head rcu_head; 504 + }; 505 + 506 + /** 507 + * ip_vs_rht_for_each_table() - Walk the hash tables 508 + * @table: struct ip_vs_rht __rcu *table 509 + * @t: current table, used as cursor, struct ip_vs_rht *var 510 + * @p: previous table, temp struct ip_vs_rht *var 511 + * 512 + * Walk tables assuming others can not change the installed tables 513 + */ 514 + #define ip_vs_rht_for_each_table(table, t, p) \ 515 + for (p = NULL, t = rcu_dereference_protected(table, 1); \ 516 + t != p; \ 517 + p = t, t = rcu_dereference_protected(t->new_tbl, 1)) 518 + 519 + /** 520 + * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock 521 + * @table: struct ip_vs_rht __rcu *table 522 + * @t: current table, used as cursor, struct ip_vs_rht *var 523 + * @p: previous table, temp struct ip_vs_rht *var 524 + * 525 + * We usually search in one table and also in second table on resizing 526 + */ 527 
+ #define ip_vs_rht_for_each_table_rcu(table, t, p) \ 528 + for (p = NULL, t = rcu_dereference(table); \ 529 + t != p; \ 530 + p = t, t = rcu_dereference(t->new_tbl)) 531 + 532 + /** 533 + * ip_vs_rht_for_each_bucket() - Walk all table buckets 534 + * @t: current table, used as cursor, struct ip_vs_rht *var 535 + * @bucket: bucket index, used as cursor, u32 var 536 + * @head: bucket address, used as cursor, struct hlist_bl_head *var 537 + */ 538 + #define ip_vs_rht_for_each_bucket(t, bucket, head) \ 539 + for (bucket = 0, head = (t)->buckets; \ 540 + bucket < t->size; bucket++, head++) 541 + 542 + /** 543 + * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved 544 + * @t: current table, used as cursor, struct ip_vs_rht *var 545 + * @bucket: index of current bucket or hash key 546 + * @sc: temp seqcount_t *var 547 + * @seq: temp unsigned int var for sequence count 548 + * @retry: temp int var 549 + */ 550 + #define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \ 551 + for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \ 552 + retry && ({ seq = read_seqcount_begin(sc); 1; }); \ 553 + retry = read_seqcount_retry(sc, seq)) 554 + 555 + /** 556 + * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables 557 + * 558 + * Variables for ip_vs_rht_walk_buckets_rcu 559 + */ 560 + #define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \ 561 + struct ip_vs_rht *_t, *_p; \ 562 + unsigned int _seq; \ 563 + seqcount_t *_sc; \ 564 + u32 _bucket; \ 565 + int _retry 566 + /** 567 + * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock 568 + * @table: struct ip_vs_rht __rcu *table 569 + * @head: bucket address, used as cursor, struct hlist_bl_head *var 570 + * 571 + * Can be used while others add/delete/move entries 572 + * Not suitable if duplicates are not desired 573 + * Possible cases for reader that uses cond_resched_rcu() in the loop: 574 + * - new table can not be installed, no need to repeat 575 + * - new table can be installed => check and 
repeat if new table is 576 + * installed, needed for !PREEMPT_RCU 577 + */ 578 + #define ip_vs_rht_walk_buckets_rcu(table, head) \ 579 + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ 580 + ip_vs_rht_for_each_bucket(_t, _bucket, head) \ 581 + ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \ 582 + _seq, _retry) 583 + 584 + /** 585 + * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables 586 + * 587 + * Variables for ip_vs_rht_walk_bucket_rcu 588 + */ 589 + #define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \ 590 + unsigned int _seq; \ 591 + seqcount_t *_sc; \ 592 + int _retry 593 + /** 594 + * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock 595 + * @t: current table, struct ip_vs_rht *var 596 + * @bucket: index of current bucket or hash key 597 + * @head: bucket address, used as cursor, struct hlist_bl_head *var 598 + * 599 + * Can be used while others add/delete/move entries 600 + * Not suitable if duplicates are not desired 601 + * Possible cases for reader that uses cond_resched_rcu() in the loop: 602 + * - new table can not be installed, no need to repeat 603 + * - new table can be installed => check and repeat if new table is 604 + * installed, needed for !PREEMPT_RCU 605 + */ 606 + #define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \ 607 + if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \ 608 + {} \ 609 + else \ 610 + ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry) 611 + 612 + /** 613 + * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables 614 + * 615 + * Variables for ip_vs_rht_walk_buckets_safe_rcu 616 + */ 617 + #define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \ 618 + struct ip_vs_rht *_t, *_p; \ 619 + u32 _bucket 620 + /** 621 + * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock 622 + * @table: struct ip_vs_rht __rcu *table 623 + * @head: bucket address, used as cursor, struct hlist_bl_head *var 624 + * 625 + * Can be used while others add/delete entries but moving is disabled 626 + * Using 
cond_resched_rcu() should be safe if tables do not change 627 + */ 628 + #define ip_vs_rht_walk_buckets_safe_rcu(table, head) \ 629 + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ 630 + ip_vs_rht_for_each_bucket(_t, _bucket, head) 631 + 632 + /** 633 + * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables 634 + * 635 + * Variables for ip_vs_rht_walk_buckets 636 + */ 637 + #define DECLARE_IP_VS_RHT_WALK_BUCKETS() \ 638 + struct ip_vs_rht *_t, *_p; \ 639 + u32 _bucket 640 + 641 + /** 642 + * ip_vs_rht_walk_buckets() - Walk all buckets 643 + * @table: struct ip_vs_rht __rcu *table 644 + * @head: bucket address, used as cursor, struct hlist_bl_head *var 645 + * 646 + * Use if others can not add/delete/move entries 647 + */ 648 + #define ip_vs_rht_walk_buckets(table, head) \ 649 + ip_vs_rht_for_each_table(table, _t, _p) \ 650 + ip_vs_rht_for_each_bucket(_t, _bucket, head) 651 + 652 + /* Entries can be in one of two tables, so we flip bit when new table is 653 + * created and store it as highest bit in hash keys 654 + */ 655 + #define IP_VS_RHT_TABLE_ID_MASK BIT(31) 656 + 657 + /* Check if hash key is from this table */ 658 + static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key) 659 + { 660 + return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK); 661 + } 662 + 663 + /* Build per-table hash key from hash value */ 664 + static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash) 665 + { 666 + return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK); 667 + } 668 + 669 + void ip_vs_rht_free(struct ip_vs_rht *t); 670 + void ip_vs_rht_rcu_free(struct rcu_head *head); 671 + struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks); 672 + int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, 673 + int lfactor, int min_bits, int max_bits); 674 + void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, 675 + int min_bits, int max_bits); 676 + u32 ip_vs_rht_hash_linfo(struct ip_vs_rht 
*t, int af, 677 + const union nf_inet_addr *addr, u32 v1, u32 v2); 678 + 504 679 struct dst_entry; 505 680 struct iphdr; 506 681 struct ip_vs_conn; ··· 786 577 __u8 pe_data_len; 787 578 }; 788 579 580 + /* Hash node in conn_tab */ 581 + struct ip_vs_conn_hnode { 582 + struct hlist_bl_node node; /* node in conn_tab */ 583 + u32 hash_key; /* Key for the hash table */ 584 + u8 dir; /* 0=out->in, 1=in->out */ 585 + } __packed; 586 + 789 587 /* IP_VS structure allocated for each dynamically scheduled connection */ 790 588 struct ip_vs_conn { 791 - struct hlist_node c_list; /* hashed list heads */ 792 - /* Protocol, addresses and port numbers */ 589 + /* Cacheline for hash table nodes - rarely modified */ 590 + 591 + struct ip_vs_conn_hnode hn0; /* Original direction */ 592 + u8 af; /* address family */ 793 593 __be16 cport; 594 + struct ip_vs_conn_hnode hn1; /* Reply direction */ 595 + u8 daf; /* Address family of the dest */ 794 596 __be16 dport; 795 - __be16 vport; 796 - u16 af; /* address family */ 797 - union nf_inet_addr caddr; /* client address */ 798 - union nf_inet_addr vaddr; /* virtual address */ 799 - union nf_inet_addr daddr; /* destination address */ 597 + struct ip_vs_dest *dest; /* real server */ 598 + atomic_t n_control; /* Number of controlled ones */ 800 599 volatile __u32 flags; /* status flags */ 801 - __u16 protocol; /* Which protocol (TCP/UDP) */ 802 - __u16 daf; /* Address family of the dest */ 803 - struct netns_ipvs *ipvs; 600 + /* 44/64 */ 804 601 805 - /* counter and timer */ 806 - refcount_t refcnt; /* reference count */ 807 - struct timer_list timer; /* Expiration timer */ 808 - volatile unsigned long timeout; /* timeout */ 809 - 810 - /* Flags and state transition */ 811 - spinlock_t lock; /* lock for state transition */ 602 + struct ip_vs_conn *control; /* Master control connection */ 603 + const struct ip_vs_pe *pe; 604 + char *pe_data; 605 + __u8 pe_data_len; 812 606 volatile __u16 state; /* state info */ 813 607 volatile __u16 
old_state; /* old state, to be used for 814 608 * state transition triggered 815 609 * synchronization 816 610 */ 817 - __u32 fwmark; /* Fire wall mark from skb */ 818 - unsigned long sync_endtime; /* jiffies + sent_retries */ 611 + /* 2-byte hole */ 612 + /* 64/96 */ 819 613 820 - /* Control members */ 821 - struct ip_vs_conn *control; /* Master control connection */ 822 - atomic_t n_control; /* Number of controlled ones */ 823 - struct ip_vs_dest *dest; /* real server */ 614 + union nf_inet_addr caddr; /* client address */ 615 + union nf_inet_addr vaddr; /* virtual address */ 616 + /* 96/128 */ 617 + 618 + union nf_inet_addr daddr; /* destination address */ 619 + __u32 fwmark; /* Fire wall mark from skb */ 620 + __be16 vport; 621 + __u16 protocol; /* Which protocol (TCP/UDP) */ 622 + 623 + /* Note: we can group the following members into a structure, 624 + * in order to save more space, and the following members are 625 + * only used in VS/NAT anyway 626 + */ 627 + struct ip_vs_app *app; /* bound ip_vs_app object */ 628 + void *app_data; /* Application private data */ 629 + /* 128/168 */ 630 + struct_group(sync_conn_opt, 631 + struct ip_vs_seq in_seq; /* incoming seq. struct */ 632 + struct ip_vs_seq out_seq; /* outgoing seq. struct */ 633 + ); 634 + /* 152/192 */ 635 + 636 + struct timer_list timer; /* Expiration timer */ 637 + volatile unsigned long timeout; /* timeout */ 638 + spinlock_t lock; /* lock for state transition */ 639 + refcount_t refcnt; /* reference count */ 824 640 atomic_t in_pkts; /* incoming packet counter */ 641 + /* 64-bit: 4-byte gap */ 642 + 643 + /* 188/256 */ 644 + unsigned long sync_endtime; /* jiffies + sent_retries */ 645 + struct netns_ipvs *ipvs; 825 646 826 647 /* Packet transmitter for different forwarding methods. 
If it 827 648 * mangles the packet, it must return NF_DROP or better NF_STOLEN, ··· 860 621 */ 861 622 int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, 862 623 struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); 863 - 864 - /* Note: we can group the following members into a structure, 865 - * in order to save more space, and the following members are 866 - * only used in VS/NAT anyway 867 - */ 868 - struct ip_vs_app *app; /* bound ip_vs_app object */ 869 - void *app_data; /* Application private data */ 870 - struct_group(sync_conn_opt, 871 - struct ip_vs_seq in_seq; /* incoming seq. struct */ 872 - struct ip_vs_seq out_seq; /* outgoing seq. struct */ 873 - ); 874 - 875 - const struct ip_vs_pe *pe; 876 - char *pe_data; 877 - __u8 pe_data_len; 878 624 879 625 struct rcu_head rcu_head; 880 626 }; ··· 915 691 * forwarding entries. 916 692 */ 917 693 struct ip_vs_service { 918 - struct hlist_node s_list; /* node in service table */ 919 - atomic_t refcnt; /* reference counter */ 920 - 694 + struct hlist_bl_node s_list; /* node in service table */ 695 + u32 hash_key; /* Key for the hash table */ 921 696 u16 af; /* address family */ 922 697 __u16 protocol; /* which protocol (TCP/UDP) */ 698 + 923 699 union nf_inet_addr addr; /* IP address for virtual service */ 924 - __be16 port; /* port number for the service */ 925 700 __u32 fwmark; /* firewall mark of the service */ 701 + atomic_t refcnt; /* reference counter */ 702 + __be16 port; /* port number for the service */ 926 703 unsigned int flags; /* service status flags */ 927 704 unsigned int timeout; /* persistent timeout in ticks */ 928 705 __be32 netmask; /* grouping granularity, mask/plen */ ··· 1033 808 int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); 1034 809 bool (*ct_match)(const struct ip_vs_conn_param *p, 1035 810 struct ip_vs_conn *ct); 1036 - u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, 1037 - bool inverse); 811 + u32 (*hashkey_raw)(const struct 
ip_vs_conn_param *p, 812 + struct ip_vs_rht *t, bool inverse); 1038 813 int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); 1039 814 /* create connections for real-server outgoing packets */ 1040 815 struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, ··· 1174 949 /* ip_vs_conn */ 1175 950 atomic_t conn_count; /* connection counter */ 1176 951 atomic_t no_cport_conns[IP_VS_AF_MAX]; 952 + struct delayed_work conn_resize_work;/* resize conn_tab */ 1177 953 1178 954 /* ip_vs_ctl */ 1179 955 struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ ··· 1183 957 struct list_head dest_trash; 1184 958 spinlock_t dest_trash_lock; 1185 959 struct timer_list dest_trash_timer; /* expiration timer */ 960 + struct mutex service_mutex; /* service reconfig */ 961 + struct rw_semaphore svc_resize_sem; /* svc_table resizing */ 962 + struct delayed_work svc_resize_work; /* resize svc_table */ 963 + atomic_t svc_table_changes;/* ++ on new table */ 1186 964 /* Service counters */ 1187 965 atomic_t num_services[IP_VS_AF_MAX]; /* Services */ 1188 966 atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ ··· 1251 1021 int sysctl_est_nice; /* kthread nice */ 1252 1022 int est_stopped; /* stop tasks */ 1253 1023 #endif 1024 + int sysctl_conn_lfactor; 1025 + int sysctl_svc_lfactor; 1254 1026 1255 1027 /* ip_vs_lblc */ 1256 1028 int sysctl_lblc_expiration; ··· 1262 1030 int sysctl_lblcr_expiration; 1263 1031 struct ctl_table_header *lblcr_ctl_header; 1264 1032 struct ctl_table *lblcr_ctl_table; 1033 + unsigned long work_flags; /* IP_VS_WORK_* flags */ 1265 1034 /* ip_vs_est */ 1266 1035 struct delayed_work est_reload_work;/* Reload kthread tasks */ 1267 1036 struct mutex est_mutex; /* protect kthread tasks */ ··· 1294 1061 unsigned int mixed_address_family_dests; 1295 1062 unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ 1296 1063 1297 - /* the service mutex that protect svc_table and svc_fwm_table */ 1298 - struct mutex service_mutex; 1299 - struct hlist_head 
svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ 1064 + struct ip_vs_rht __rcu *svc_table; /* Services */ 1065 + struct ip_vs_rht __rcu *conn_tab; /* Connections */ 1066 + atomic_t conn_tab_changes;/* ++ on new table */ 1300 1067 }; 1301 1068 1302 1069 #define DEFAULT_SYNC_THRESHOLD 3 ··· 1546 1313 1547 1314 #endif 1548 1315 1316 + /* Get load factor to map conn_count/u_thresh to t->size */ 1317 + static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs) 1318 + { 1319 + return READ_ONCE(ipvs->sysctl_conn_lfactor); 1320 + } 1321 + 1322 + /* Get load factor to map num_services/u_thresh to t->size 1323 + * Smaller value decreases u_thresh to reduce collisions but increases 1324 + * the table size 1325 + * Returns factor where: 1326 + * - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load 1327 + * - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load 1328 + */ 1329 + static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) 1330 + { 1331 + return READ_ONCE(ipvs->sysctl_svc_lfactor); 1332 + } 1333 + 1549 1334 /* IPVS core functions 1550 1335 * (from ip_vs_core.c) 1551 1336 */ ··· 1637 1386 } 1638 1387 void ip_vs_conn_put(struct ip_vs_conn *cp); 1639 1388 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); 1389 + int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, 1390 + int lfactor); 1391 + struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, 1392 + int lfactor); 1393 + 1394 + static inline struct ip_vs_conn * 1395 + ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn) 1396 + { 1397 + return container_of(hn, struct ip_vs_conn, hn0); 1398 + } 1399 + 1400 + static inline struct ip_vs_conn * 1401 + ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn) 1402 + { 1403 + return hn->dir ? 
container_of(hn, struct ip_vs_conn, hn1) : 1404 + container_of(hn, struct ip_vs_conn, hn0); 1405 + } 1640 1406 1641 1407 struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, 1642 1408 const union nf_inet_addr *daddr, ··· 2005 1737 fwd = '?'; break; 2006 1738 } 2007 1739 return fwd; 1740 + } 1741 + 1742 + /* Check if connection uses double hashing */ 1743 + static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp) 1744 + { 1745 + return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ && 1746 + !(cp->flags & IP_VS_CONN_F_TEMPLATE); 2008 1747 } 2009 1748 2010 1749 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+3
net/ipv6/fib6_rules.c
··· 92 92 93 93 return err; 94 94 } 95 + #if IS_MODULE(CONFIG_NFT_FIB_IPV6) 96 + EXPORT_SYMBOL_GPL(fib6_lookup); 97 + #endif 95 98 96 99 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, 97 100 const struct sk_buff *skb,
+3
net/ipv6/ip6_fib.c
··· 342 342 return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, 343 343 res, flags); 344 344 } 345 + #if IS_MODULE(CONFIG_NFT_FIB_IPV6) 346 + EXPORT_SYMBOL_GPL(fib6_lookup); 347 + #endif 345 348 346 349 static void __net_init fib6_tables_init(struct net *net) 347 350 {
+49 -30
net/ipv6/netfilter/nft_fib_ipv6.c
··· 52 52 fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; 53 53 fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); 54 54 55 - return lookup_flags; 55 + return lookup_flags | RT6_LOOKUP_F_DST_NOREF; 56 + } 57 + 58 + static int nft_fib6_lookup(struct net *net, struct flowi6 *fl6, 59 + struct fib6_result *res, int flags) 60 + { 61 + return fib6_lookup(net, fl6->flowi6_oif, fl6, res, flags); 56 62 } 57 63 58 64 static u32 __nft_fib6_eval_type(const struct nft_fib *priv, ··· 66 60 struct ipv6hdr *iph) 67 61 { 68 62 const struct net_device *dev = NULL; 63 + struct fib6_result res = {}; 69 64 int route_err, addrtype; 70 - struct rt6_info *rt; 71 65 struct flowi6 fl6 = { 72 66 .flowi6_iif = LOOPBACK_IFINDEX, 73 67 .flowi6_proto = pkt->tprot, 74 68 .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), 75 69 }; 70 + int lookup_flags; 76 71 u32 ret = 0; 77 72 78 73 if (priv->flags & NFTA_FIB_F_IIF) ··· 81 74 else if (priv->flags & NFTA_FIB_F_OIF) 82 75 dev = nft_out(pkt); 83 76 84 - nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); 77 + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); 85 78 86 79 if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) 87 80 ret = RTN_LOCAL; 88 81 89 - route_err = nf_ip6_route(nft_net(pkt), (struct dst_entry **)&rt, 90 - flowi6_to_flowi(&fl6), false); 82 + route_err = nft_fib6_lookup(nft_net(pkt), &fl6, &res, lookup_flags); 91 83 if (route_err) 92 84 goto err; 93 85 94 - if (rt->rt6i_flags & RTF_REJECT) { 95 - route_err = rt->dst.error; 96 - dst_release(&rt->dst); 97 - goto err; 98 - } 86 + if (res.fib6_flags & RTF_REJECT) 87 + return res.fib6_type; 99 88 100 - if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr)) 89 + if (__ipv6_anycast_destination(&res.f6i->fib6_dst, res.fib6_flags, &fl6.daddr)) 101 90 ret = RTN_ANYCAST; 102 - else if (!dev && rt->rt6i_flags & RTF_LOCAL) 91 + else if (!dev && res.fib6_flags & RTF_LOCAL) 103 92 ret = RTN_LOCAL; 104 - 105 - dst_release(&rt->dst); 106 93 107 94 
if (ret) 108 95 return ret; ··· 153 152 return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL; 154 153 } 155 154 155 + static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev, 156 + const struct net_device *dev) 157 + { 158 + return nh_dev == dev || 159 + l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex; 160 + } 161 + 162 + static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt, 163 + const struct net_device *dev) 164 + { 165 + const struct net_device *nh_dev; 166 + struct fib6_info *iter; 167 + 168 + nh_dev = fib6_info_nh_dev(rt); 169 + if (nft_fib6_info_nh_dev_match(nh_dev, dev)) 170 + return true; 171 + 172 + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { 173 + nh_dev = fib6_info_nh_dev(iter); 174 + 175 + if (nft_fib6_info_nh_dev_match(nh_dev, dev)) 176 + return true; 177 + } 178 + 179 + return false; 180 + } 181 + 156 182 void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, 157 183 const struct nft_pktinfo *pkt) 158 184 { ··· 188 160 const struct net_device *found = NULL; 189 161 const struct net_device *oif = NULL; 190 162 u32 *dest = &regs->data[priv->dreg]; 163 + struct fib6_result res = {}; 191 164 struct ipv6hdr *iph, _iph; 192 165 struct flowi6 fl6 = { 193 166 .flowi6_iif = LOOPBACK_IFINDEX, 194 167 .flowi6_proto = pkt->tprot, 195 168 .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), 196 169 }; 197 - struct rt6_info *rt; 198 - int lookup_flags; 170 + int lookup_flags, ret; 199 171 200 172 if (nft_fib_can_skip(pkt)) { 201 173 nft_fib_store_result(dest, priv, nft_in(pkt)); ··· 221 193 lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); 222 194 223 195 *dest = 0; 224 - rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, 225 - lookup_flags); 226 - if (rt->dst.error) 227 - goto put_rt_err; 228 - 229 - /* Should not see RTF_LOCAL here */ 230 - if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) 231 - goto put_rt_err; 196 + ret = nft_fib6_lookup(nft_net(pkt), &fl6, &res, 
lookup_flags); 197 + if (ret || res.fib6_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) 198 + return; 232 199 233 200 if (!oif) { 234 - found = rt->rt6i_idev->dev; 201 + found = fib6_info_nh_dev(res.f6i); 235 202 } else { 236 - if (oif == rt->rt6i_idev->dev || 237 - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) 203 + if (nft_fib6_info_nh_uses_dev(res.f6i, oif)) 238 204 found = oif; 239 205 } 240 - 241 206 nft_fib_store_result(dest, priv, found); 242 - put_rt_err: 243 - ip6_rt_put(rt); 244 207 } 245 208 EXPORT_SYMBOL_GPL(nft_fib6_eval); 246 209
+736 -259
net/netfilter/ipvs/ip_vs_conn.c
··· 47 47 module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); 48 48 MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); 49 49 50 - /* size and mask values */ 50 + /* Max table size */ 51 51 int ip_vs_conn_tab_size __read_mostly; 52 - static int ip_vs_conn_tab_mask __read_mostly; 53 - 54 - /* 55 - * Connection hash table: for input and output packets lookups of IPVS 56 - */ 57 - static struct hlist_head *ip_vs_conn_tab __read_mostly; 58 52 59 53 /* SLAB cache for IPVS connections */ 60 54 static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 61 - 62 - /* random value for IPVS connection hash */ 63 - static unsigned int ip_vs_conn_rnd __read_mostly; 64 - 65 - /* 66 - * Fine locking granularity for big connection hash table 67 - */ 68 - #define CT_LOCKARRAY_BITS 5 69 - #define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 70 - #define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 71 55 72 56 /* We need an addrstrlen that works with or without v6 */ 73 57 #ifdef CONFIG_IP_VS_IPV6 ··· 60 76 #define IP_VS_ADDRSTRLEN (8+1) 61 77 #endif 62 78 63 - struct ip_vs_aligned_lock 64 - { 65 - spinlock_t l; 66 - } __attribute__((__aligned__(SMP_CACHE_BYTES))); 79 + /* Connection hashing: 80 + * - hash (add conn) and unhash (del conn) are safe for RCU readers walking 81 + * the bucket, they will not jump to another bucket or hash table and to miss 82 + * conns 83 + * - rehash (fill cport) hashes the conn to new bucket or even new table, 84 + * so we use seqcount to retry lookups on buckets where we delete 85 + * conns (unhash) because after hashing their next ptr can point to another 86 + * bucket or hash table 87 + * - hash table resize works like rehash but always rehashes into new table 88 + * - bit lock on bucket serializes all operations that modify the chain 89 + * - cp->lock protects conn fields like cp->flags, cp->dest 90 + */ 67 91 68 - /* lock array for conn table */ 69 - static struct ip_vs_aligned_lock 70 - 
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 71 - 72 - static inline void ct_write_lock_bh(unsigned int key) 92 + /* Lock conn_tab bucket for conn hash/unhash, not for rehash */ 93 + static __always_inline void 94 + conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, 95 + u32 hash_key2, bool use2, bool new_hash, 96 + struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret) 73 97 { 74 - spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 98 + struct hlist_bl_head *head, *head2; 99 + u32 hash_key_new, hash_key_new2; 100 + struct ip_vs_rht *t2 = t; 101 + u32 idx, idx2; 102 + 103 + idx = hash_key & t->mask; 104 + if (use2) 105 + idx2 = hash_key2 & t->mask; 106 + else 107 + idx2 = idx; 108 + if (!new_hash) { 109 + /* We need to lock the bucket in the right table */ 110 + 111 + retry: 112 + if (!ip_vs_rht_same_table(t, hash_key)) { 113 + /* It is already moved to new table */ 114 + t = rcu_dereference(t->new_tbl); 115 + /* Rehashing works in two steps and we may detect 116 + * both nodes in different tables, use idx/idx2 117 + * for proper lock ordering for heads. 118 + */ 119 + idx = hash_key & t->mask; 120 + idx |= IP_VS_RHT_TABLE_ID_MASK; 121 + } 122 + if (use2) { 123 + if (!ip_vs_rht_same_table(t2, hash_key2)) { 124 + /* It is already moved to new table */ 125 + t2 = rcu_dereference(t2->new_tbl); 126 + idx2 = hash_key2 & t2->mask; 127 + idx2 |= IP_VS_RHT_TABLE_ID_MASK; 128 + } 129 + } else { 130 + idx2 = idx; 131 + } 132 + } 133 + 134 + head = t->buckets + (hash_key & t->mask); 135 + head2 = use2 ? 
t2->buckets + (hash_key2 & t2->mask) : head; 136 + 137 + local_bh_disable(); 138 + /* Do not touch seqcount, this is a safe operation */ 139 + 140 + if (idx <= idx2) { 141 + hlist_bl_lock(head); 142 + if (head != head2) 143 + hlist_bl_lock(head2); 144 + } else { 145 + hlist_bl_lock(head2); 146 + hlist_bl_lock(head); 147 + } 148 + if (!new_hash) { 149 + /* Ensure hash_key is read under lock */ 150 + hash_key_new = READ_ONCE(cp->hn0.hash_key); 151 + hash_key_new2 = READ_ONCE(cp->hn1.hash_key); 152 + /* Hash changed ? */ 153 + if (hash_key != hash_key_new || 154 + (hash_key2 != hash_key_new2 && use2)) { 155 + if (head != head2) 156 + hlist_bl_unlock(head2); 157 + hlist_bl_unlock(head); 158 + local_bh_enable(); 159 + hash_key = hash_key_new; 160 + hash_key2 = hash_key_new2; 161 + goto retry; 162 + } 163 + } 164 + *head_ret = head; 165 + *head2_ret = head2; 75 166 } 76 167 77 - static inline void ct_write_unlock_bh(unsigned int key) 168 + static inline void conn_tab_unlock(struct hlist_bl_head *head, 169 + struct hlist_bl_head *head2) 78 170 { 79 - spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 171 + if (head != head2) 172 + hlist_bl_unlock(head2); 173 + hlist_bl_unlock(head); 174 + local_bh_enable(); 80 175 } 81 176 82 177 static void ip_vs_conn_expire(struct timer_list *t); ··· 163 100 /* 164 101 * Returns hash value for IPVS connection entry 165 102 */ 166 - static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, 167 - const union nf_inet_addr *addr, 168 - __be16 port) 103 + static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto, 104 + const union nf_inet_addr *addr, __be16 port, 105 + const union nf_inet_addr *laddr, __be16 lport) 169 106 { 107 + u64 a = (u32)proto << 16 | (__force u32)port; 108 + u64 d; 109 + 170 110 #ifdef CONFIG_IP_VS_IPV6 171 - if (af == AF_INET6) 172 - return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), 173 - (__force u32)port, proto, ip_vs_conn_rnd) ^ 174 - 
((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; 111 + if (af == AF_INET6) { 112 + u64 b = (u64)addr->all[0] << 32 | addr->all[1]; 113 + u64 c = (u64)addr->all[2] << 32 | addr->all[3]; 114 + 115 + a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport; 116 + c ^= laddr->all[1]; 117 + d = (u64)laddr->all[0] << 32 | laddr->all[3]; 118 + return (u32)siphash_4u64(a, b, c, d, &t->hash_key); 119 + } 175 120 #endif 176 - return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, 177 - ip_vs_conn_rnd) ^ 178 - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; 121 + a |= (u64)addr->all[0] << 32; 122 + d = (u64)laddr->all[0] << 32 | (__force u32)lport; 123 + return (u32)siphash_2u64(a, d, &t->hash_key); 179 124 } 180 125 181 126 static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, 182 - bool inverse) 127 + struct ip_vs_rht *t, bool inverse) 183 128 { 129 + const union nf_inet_addr *laddr; 184 130 const union nf_inet_addr *addr; 131 + __be16 lport; 185 132 __be16 port; 186 133 187 134 if (p->pe_data && p->pe->hashkey_raw) 188 - return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & 189 - ip_vs_conn_tab_mask; 135 + return p->pe->hashkey_raw(p, t, inverse); 190 136 191 137 if (likely(!inverse)) { 192 138 addr = p->caddr; 193 139 port = p->cport; 140 + laddr = p->vaddr; 141 + lport = p->vport; 194 142 } else { 195 143 addr = p->vaddr; 196 144 port = p->vport; 145 + laddr = p->caddr; 146 + lport = p->cport; 197 147 } 198 148 199 - return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); 149 + return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr, 150 + lport); 200 151 } 201 152 202 - static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) 153 + static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, 154 + const struct ip_vs_conn *cp, 155 + bool out) 203 156 { 204 157 struct ip_vs_conn_param p; 205 158 206 - ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, 207 - &cp->caddr, cp->cport, NULL, 0, &p); 159 + if 
(!out) 160 + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, 161 + &cp->caddr, cp->cport, &cp->vaddr, 162 + cp->vport, &p); 163 + else 164 + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, 165 + &cp->daddr, cp->dport, &cp->caddr, 166 + cp->cport, &p); 208 167 209 168 if (cp->pe) { 210 169 p.pe = cp->pe; ··· 234 149 p.pe_data_len = cp->pe_data_len; 235 150 } 236 151 237 - return ip_vs_conn_hashkey_param(&p, false); 152 + return ip_vs_conn_hashkey_param(&p, t, out); 238 153 } 239 154 240 - /* 241 - * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. 155 + /* Hashes ip_vs_conn in conn_tab 242 156 * returns bool success. 243 157 */ 244 158 static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 245 159 { 246 - unsigned int hash; 160 + struct netns_ipvs *ipvs = cp->ipvs; 161 + struct hlist_bl_head *head, *head2; 162 + u32 hash_key, hash_key2; 163 + struct ip_vs_rht *t; 164 + u32 hash, hash2; 165 + bool use2; 247 166 int ret; 248 167 249 168 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 250 169 return 0; 251 170 252 - /* Hash by protocol, client address and port */ 253 - hash = ip_vs_conn_hashkey_conn(cp); 171 + /* New entries go into recent table */ 172 + t = rcu_dereference(ipvs->conn_tab); 173 + t = rcu_dereference(t->new_tbl); 254 174 255 - ct_write_lock_bh(hash); 175 + hash = ip_vs_conn_hashkey_conn(t, cp, false); 176 + hash_key = ip_vs_rht_build_hash_key(t, hash); 177 + if (ip_vs_conn_use_hash2(cp)) { 178 + hash2 = ip_vs_conn_hashkey_conn(t, cp, true); 179 + hash_key2 = ip_vs_rht_build_hash_key(t, hash2); 180 + use2 = true; 181 + } else { 182 + hash_key2 = hash_key; 183 + use2 = false; 184 + } 185 + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, 186 + &head, &head2); 256 187 spin_lock(&cp->lock); 257 188 258 189 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 259 190 cp->flags |= IP_VS_CONN_F_HASHED; 191 + WRITE_ONCE(cp->hn0.hash_key, hash_key); 192 + WRITE_ONCE(cp->hn1.hash_key, hash_key2); 260 193 refcount_inc(&cp->refcnt); 261 
- hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); 194 + hlist_bl_add_head_rcu(&cp->hn0.node, head); 195 + if (use2) 196 + hlist_bl_add_head_rcu(&cp->hn1.node, head2); 262 197 ret = 1; 263 198 } else { 264 199 pr_err("%s(): request for already hashed, called from %pS\n", ··· 287 182 } 288 183 289 184 spin_unlock(&cp->lock); 290 - ct_write_unlock_bh(hash); 185 + conn_tab_unlock(head, head2); 186 + 187 + /* Schedule resizing if load increases */ 188 + if (atomic_read(&ipvs->conn_count) > t->u_thresh && 189 + !test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags)) 190 + mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); 291 191 292 192 return ret; 293 193 } 294 194 295 - 296 - /* 297 - * UNhashes ip_vs_conn from ip_vs_conn_tab. 298 - * returns bool success. Caller should hold conn reference. 299 - */ 300 - static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 301 - { 302 - unsigned int hash; 303 - int ret; 304 - 305 - /* unhash it and decrease its reference counter */ 306 - hash = ip_vs_conn_hashkey_conn(cp); 307 - 308 - ct_write_lock_bh(hash); 309 - spin_lock(&cp->lock); 310 - 311 - if (cp->flags & IP_VS_CONN_F_HASHED) { 312 - hlist_del_rcu(&cp->c_list); 313 - cp->flags &= ~IP_VS_CONN_F_HASHED; 314 - refcount_dec(&cp->refcnt); 315 - ret = 1; 316 - } else 317 - ret = 0; 318 - 319 - spin_unlock(&cp->lock); 320 - ct_write_unlock_bh(hash); 321 - 322 - return ret; 323 - } 324 - 325 - /* Try to unlink ip_vs_conn from ip_vs_conn_tab. 195 + /* Try to unlink ip_vs_conn from conn_tab. 326 196 * returns bool success. 
327 197 */ 328 198 static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) 329 199 { 330 - unsigned int hash; 200 + struct netns_ipvs *ipvs = cp->ipvs; 201 + struct hlist_bl_head *head, *head2; 202 + u32 hash_key, hash_key2; 203 + struct ip_vs_rht *t; 331 204 bool ret = false; 205 + bool use2; 332 206 333 207 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 334 208 return refcount_dec_if_one(&cp->refcnt); 335 209 336 - hash = ip_vs_conn_hashkey_conn(cp); 210 + rcu_read_lock(); 337 211 338 - ct_write_lock_bh(hash); 212 + t = rcu_dereference(ipvs->conn_tab); 213 + hash_key = READ_ONCE(cp->hn0.hash_key); 214 + hash_key2 = READ_ONCE(cp->hn1.hash_key); 215 + use2 = ip_vs_conn_use_hash2(cp); 216 + 217 + conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, 218 + &head, &head2); 339 219 spin_lock(&cp->lock); 340 220 341 221 if (cp->flags & IP_VS_CONN_F_HASHED) { 342 222 /* Decrease refcnt and unlink conn only if we are last user */ 343 223 if (refcount_dec_if_one(&cp->refcnt)) { 344 - hlist_del_rcu(&cp->c_list); 224 + hlist_bl_del_rcu(&cp->hn0.node); 225 + if (use2) 226 + hlist_bl_del_rcu(&cp->hn1.node); 345 227 cp->flags &= ~IP_VS_CONN_F_HASHED; 346 228 ret = true; 347 229 } 348 230 } 349 231 350 232 spin_unlock(&cp->lock); 351 - ct_write_unlock_bh(hash); 233 + conn_tab_unlock(head, head2); 234 + 235 + rcu_read_unlock(); 352 236 353 237 return ret; 354 238 } 355 239 356 240 357 241 /* 358 - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 242 + * Gets ip_vs_conn associated with supplied parameters in the conn_tab. 359 243 * Called for pkts coming from OUTside-to-INside. 
360 244 * p->caddr, p->cport: pkt source address (foreign host) 361 245 * p->vaddr, p->vport: pkt dest address (load balancer) ··· 352 258 static inline struct ip_vs_conn * 353 259 __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) 354 260 { 355 - unsigned int hash; 261 + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 262 + struct netns_ipvs *ipvs = p->ipvs; 263 + struct ip_vs_conn_hnode *hn; 264 + struct hlist_bl_head *head; 265 + struct ip_vs_rht *t, *pt; 266 + struct hlist_bl_node *e; 356 267 struct ip_vs_conn *cp; 357 - 358 - hash = ip_vs_conn_hashkey_param(p, false); 268 + u32 hash, hash_key; 359 269 360 270 rcu_read_lock(); 361 271 362 - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 363 - if (p->cport == cp->cport && p->vport == cp->vport && 364 - cp->af == p->af && 365 - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 366 - ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && 367 - ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 368 - p->protocol == cp->protocol && 369 - cp->ipvs == p->ipvs) { 370 - if (!__ip_vs_conn_get(cp)) 371 - continue; 372 - /* HIT */ 373 - rcu_read_unlock(); 374 - return cp; 272 + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { 273 + hash = ip_vs_conn_hashkey_param(p, t, false); 274 + hash_key = ip_vs_rht_build_hash_key(t, hash); 275 + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 276 + hlist_bl_for_each_entry_rcu(hn, e, head, node) { 277 + if (READ_ONCE(hn->hash_key) != hash_key || 278 + hn->dir != 0) 279 + continue; 280 + cp = ip_vs_hn0_to_conn(hn); 281 + if (p->cport == cp->cport && 282 + p->vport == cp->vport && cp->af == p->af && 283 + ip_vs_addr_equal(p->af, p->caddr, 284 + &cp->caddr) && 285 + ip_vs_addr_equal(p->af, p->vaddr, 286 + &cp->vaddr) && 287 + (!p->cport ^ 288 + (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 289 + p->protocol == cp->protocol) { 290 + if (__ip_vs_conn_get(cp)) { 291 + /* HIT */ 292 + rcu_read_unlock(); 293 + return cp; 294 + } 295 + } 296 + } 375 297 } 376 298 } 377 299 ··· 460 350 
/* Get reference to connection template */ 461 351 struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) 462 352 { 463 - unsigned int hash; 353 + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 354 + struct netns_ipvs *ipvs = p->ipvs; 355 + struct ip_vs_conn_hnode *hn; 356 + struct hlist_bl_head *head; 357 + struct ip_vs_rht *t, *pt; 358 + struct hlist_bl_node *e; 464 359 struct ip_vs_conn *cp; 465 - 466 - hash = ip_vs_conn_hashkey_param(p, false); 360 + u32 hash, hash_key; 467 361 468 362 rcu_read_lock(); 469 363 470 - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 471 - if (unlikely(p->pe_data && p->pe->ct_match)) { 472 - if (cp->ipvs != p->ipvs) 473 - continue; 474 - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { 475 - if (__ip_vs_conn_get(cp)) 476 - goto out; 364 + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { 365 + hash = ip_vs_conn_hashkey_param(p, t, false); 366 + hash_key = ip_vs_rht_build_hash_key(t, hash); 367 + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 368 + hlist_bl_for_each_entry_rcu(hn, e, head, node) { 369 + if (READ_ONCE(hn->hash_key) != hash_key || 370 + hn->dir != 0) 371 + continue; 372 + cp = ip_vs_hn0_to_conn(hn); 373 + if (unlikely(p->pe_data && p->pe->ct_match)) { 374 + if (p->pe == cp->pe && 375 + p->pe->ct_match(p, cp) && 376 + __ip_vs_conn_get(cp)) 377 + goto out; 378 + continue; 379 + } 380 + if (cp->af == p->af && 381 + ip_vs_addr_equal(p->af, p->caddr, 382 + &cp->caddr) && 383 + /* protocol should only be IPPROTO_IP if 384 + * p->vaddr is a fwmark 385 + */ 386 + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? 
387 + AF_UNSPEC : p->af, 388 + p->vaddr, &cp->vaddr) && 389 + p->vport == cp->vport && 390 + p->cport == cp->cport && 391 + cp->flags & IP_VS_CONN_F_TEMPLATE && 392 + p->protocol == cp->protocol && 393 + cp->dport != htons(0xffff)) { 394 + if (__ip_vs_conn_get(cp)) 395 + goto out; 396 + } 477 397 } 478 - continue; 479 398 } 480 399 481 - if (cp->af == p->af && 482 - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 483 - /* protocol should only be IPPROTO_IP if 484 - * p->vaddr is a fwmark */ 485 - ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : 486 - p->af, p->vaddr, &cp->vaddr) && 487 - p->vport == cp->vport && p->cport == cp->cport && 488 - cp->flags & IP_VS_CONN_F_TEMPLATE && 489 - p->protocol == cp->protocol && 490 - cp->ipvs == p->ipvs) { 491 - if (__ip_vs_conn_get(cp)) 492 - goto out; 493 - } 494 400 } 495 401 cp = NULL; 496 402 ··· 522 396 return cp; 523 397 } 524 398 525 - /* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 399 + /* Gets ip_vs_conn associated with supplied parameters in the conn_tab. 526 400 * Called for pkts coming from inside-to-OUTside. 
527 401 * p->caddr, p->cport: pkt source address (inside host) 528 402 * p->vaddr, p->vport: pkt dest address (foreign host) */ 529 403 struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) 530 404 { 531 - unsigned int hash; 532 - struct ip_vs_conn *cp, *ret=NULL; 405 + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 406 + struct netns_ipvs *ipvs = p->ipvs; 533 407 const union nf_inet_addr *saddr; 408 + struct ip_vs_conn_hnode *hn; 409 + struct hlist_bl_head *head; 410 + struct ip_vs_rht *t, *pt; 411 + struct hlist_bl_node *e; 412 + struct ip_vs_conn *cp; 413 + u32 hash, hash_key; 534 414 __be16 sport; 535 - 536 - /* 537 - * Check for "full" addressed entries 538 - */ 539 - hash = ip_vs_conn_hashkey_param(p, true); 540 415 541 416 rcu_read_lock(); 542 417 543 - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 544 - if (p->vport != cp->cport) 545 - continue; 418 + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { 419 + hash = ip_vs_conn_hashkey_param(p, t, true); 420 + hash_key = ip_vs_rht_build_hash_key(t, hash); 421 + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 422 + hlist_bl_for_each_entry_rcu(hn, e, head, node) { 423 + /* dir can be 0 for DR/TUN */ 424 + if (READ_ONCE(hn->hash_key) != hash_key) 425 + continue; 426 + cp = ip_vs_hn_to_conn(hn); 427 + if (p->vport != cp->cport) 428 + continue; 546 429 547 - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 548 - sport = cp->vport; 549 - saddr = &cp->vaddr; 550 - } else { 551 - sport = cp->dport; 552 - saddr = &cp->daddr; 553 - } 430 + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 431 + sport = cp->vport; 432 + saddr = &cp->vaddr; 433 + } else { 434 + sport = cp->dport; 435 + saddr = &cp->daddr; 436 + } 554 437 555 - if (p->cport == sport && cp->af == p->af && 556 - ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && 557 - ip_vs_addr_equal(p->af, p->caddr, saddr) && 558 - p->protocol == cp->protocol && 559 - cp->ipvs == p->ipvs) { 560 - if (!__ip_vs_conn_get(cp)) 561 - continue; 562 - /* 
HIT */ 563 - ret = cp; 564 - break; 438 + if (p->cport == sport && cp->af == p->af && 439 + ip_vs_addr_equal(p->af, p->vaddr, 440 + &cp->caddr) && 441 + ip_vs_addr_equal(p->af, p->caddr, saddr) && 442 + p->protocol == cp->protocol) { 443 + if (__ip_vs_conn_get(cp)) 444 + goto out; 445 + } 446 + } 565 447 } 566 448 } 449 + cp = NULL; 567 450 451 + out: 568 452 rcu_read_unlock(); 569 453 570 454 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", 571 455 ip_vs_proto_name(p->protocol), 572 456 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 573 457 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 574 - ret ? "hit" : "not hit"); 458 + cp ? "hit" : "not hit"); 575 459 576 - return ret; 460 + return cp; 577 461 } 578 462 579 463 struct ip_vs_conn * ··· 628 492 */ 629 493 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 630 494 { 631 - if (ip_vs_conn_unhash(cp)) { 632 - struct netns_ipvs *ipvs = cp->ipvs; 633 - int af_id = ip_vs_af_index(cp->af); 495 + struct hlist_bl_head *head, *head2, *head_new; 496 + bool use2 = ip_vs_conn_use_hash2(cp); 497 + struct netns_ipvs *ipvs = cp->ipvs; 498 + int af_id = ip_vs_af_index(cp->af); 499 + u32 hash_r = 0, hash_key_r = 0; 500 + struct ip_vs_rht *t, *tp, *t2; 501 + struct ip_vs_conn_hnode *hn; 502 + u32 hash_key, hash_key_new; 503 + struct ip_vs_conn_param p; 504 + int ntbl; 505 + int dir; 634 506 635 - spin_lock_bh(&cp->lock); 636 - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 507 + /* No packets from inside, so we can do it in 2 steps. */ 508 + dir = use2 ? 1 : 0; 509 + 510 + next_dir: 511 + if (dir) 512 + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr, 513 + cp->dport, &cp->caddr, cport, &p); 514 + else 515 + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, 516 + cport, &cp->vaddr, cp->vport, &p); 517 + hn = dir ? 
&cp->hn1 : &cp->hn0; 518 + ntbl = 0; 519 + 520 + /* Attempt to rehash cp safely, by informing seqcount readers */ 521 + t = rcu_dereference(ipvs->conn_tab); 522 + hash_key = READ_ONCE(hn->hash_key); 523 + tp = NULL; 524 + 525 + retry: 526 + /* Moved to new table ? */ 527 + if (!ip_vs_rht_same_table(t, hash_key)) { 528 + t = rcu_dereference(t->new_tbl); 529 + ntbl++; 530 + /* We are lost? */ 531 + if (ntbl >= 2) 532 + return; 533 + } 534 + 535 + /* Rehashing during resize? Use the recent table for adds */ 536 + t2 = rcu_dereference(t->new_tbl); 537 + /* Calc new hash once per table */ 538 + if (tp != t2) { 539 + hash_r = ip_vs_conn_hashkey_param(&p, t2, dir); 540 + hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r); 541 + tp = t2; 542 + } 543 + head = t->buckets + (hash_key & t->mask); 544 + head2 = t2->buckets + (hash_key_r & t2->mask); 545 + head_new = head2; 546 + 547 + if (head > head2 && t == t2) 548 + swap(head, head2); 549 + 550 + /* Lock seqcount only for the old bucket, even if we are on new table 551 + * because it affects the del operation, not the adding. 552 + */ 553 + spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); 554 + preempt_disable_nested(); 555 + write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); 556 + 557 + /* Lock buckets in same (increasing) order */ 558 + hlist_bl_lock(head); 559 + if (head != head2) 560 + hlist_bl_lock(head2); 561 + 562 + /* Ensure hash_key is read under lock */ 563 + hash_key_new = READ_ONCE(hn->hash_key); 564 + /* Racing with another rehashing ? 
*/ 565 + if (unlikely(hash_key != hash_key_new)) { 566 + if (head != head2) 567 + hlist_bl_unlock(head2); 568 + hlist_bl_unlock(head); 569 + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); 570 + preempt_enable_nested(); 571 + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); 572 + hash_key = hash_key_new; 573 + goto retry; 574 + } 575 + 576 + spin_lock(&cp->lock); 577 + if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && 578 + (cp->flags & IP_VS_CONN_F_HASHED)) { 579 + /* We do not recalc hash_key_r under lock, we assume the 580 + * parameters in cp do not change, i.e. cport is 581 + * the only possible change. 582 + */ 583 + WRITE_ONCE(hn->hash_key, hash_key_r); 584 + if (!use2) 585 + WRITE_ONCE(cp->hn1.hash_key, hash_key_r); 586 + /* For dir=1 we do not check in flags if hn is already 587 + * rehashed but this check will do it. 588 + */ 589 + if (head != head2) { 590 + hlist_bl_del_rcu(&hn->node); 591 + hlist_bl_add_head_rcu(&hn->node, head_new); 592 + } 593 + if (!dir) { 637 594 atomic_dec(&ipvs->no_cport_conns[af_id]); 638 595 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 639 596 cp->cport = cport; 640 597 } 641 - spin_unlock_bh(&cp->lock); 642 - 643 - /* hash on new dport */ 644 - ip_vs_conn_hash(cp); 645 598 } 599 + spin_unlock(&cp->lock); 600 + 601 + if (head != head2) 602 + hlist_bl_unlock(head2); 603 + hlist_bl_unlock(head); 604 + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); 605 + preempt_enable_nested(); 606 + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); 607 + if (dir--) 608 + goto next_dir; 646 609 } 647 610 611 + /* Get default load factor to map conn_count/u_thresh to t->size */ 612 + static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs) 613 + { 614 + int factor; 615 + 616 + if (net_eq(ipvs->net, &init_net)) 617 + factor = -3; 618 + else 619 + factor = -1; 620 + /* Double hashing adds twice more nodes for NAT */ 621 + factor--; 622 + return factor; 623 + } 624 + 625 + /* Get the desired conn_tab size */ 626 + int 
ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, 627 + int lfactor) 628 + { 629 + return ip_vs_rht_desired_size(ipvs, t, atomic_read(&ipvs->conn_count), 630 + lfactor, IP_VS_CONN_TAB_MIN_BITS, 631 + ip_vs_conn_tab_bits); 632 + } 633 + 634 + /* Allocate conn_tab */ 635 + struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, 636 + int lfactor) 637 + { 638 + struct ip_vs_rht *t; 639 + int scounts, locks; 640 + 641 + /* scounts: affects readers during resize */ 642 + scounts = clamp(buckets >> 6, 1, 256); 643 + /* locks: based on parallel IP_VS_CONN_F_NO_CPORT operations + resize */ 644 + locks = clamp(8, 1, scounts); 645 + 646 + t = ip_vs_rht_alloc(buckets, scounts, locks); 647 + if (!t) 648 + return NULL; 649 + t->lfactor = lfactor; 650 + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_CONN_TAB_MIN_BITS, 651 + ip_vs_conn_tab_bits); 652 + return t; 653 + } 654 + 655 + /* conn_tab resizer work */ 656 + static void conn_resize_work_handler(struct work_struct *work) 657 + { 658 + struct hlist_bl_head *head, *head2; 659 + unsigned int resched_score = 0; 660 + struct hlist_bl_node *cn, *nn; 661 + struct ip_vs_rht *t, *t_new; 662 + struct ip_vs_conn_hnode *hn; 663 + struct netns_ipvs *ipvs; 664 + struct ip_vs_conn *cp; 665 + bool more_work = false; 666 + u32 hash, hash_key; 667 + int limit = 0; 668 + int new_size; 669 + int lfactor; 670 + u32 bucket; 671 + 672 + ipvs = container_of(work, struct netns_ipvs, conn_resize_work.work); 673 + 674 + /* Allow work to be queued again */ 675 + clear_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags); 676 + t = rcu_dereference_protected(ipvs->conn_tab, 1); 677 + /* Do nothing if table is removed */ 678 + if (!t) 679 + goto out; 680 + /* New table needs to be registered? BUG! */ 681 + if (t != rcu_dereference_protected(t->new_tbl, 1)) 682 + goto out; 683 + 684 + lfactor = sysctl_conn_lfactor(ipvs); 685 + /* Should we resize ? 
*/ 686 + new_size = ip_vs_conn_desired_size(ipvs, t, lfactor); 687 + if (new_size == t->size && lfactor == t->lfactor) 688 + goto out; 689 + 690 + t_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); 691 + if (!t_new) { 692 + more_work = true; 693 + goto out; 694 + } 695 + /* Flip the table_id */ 696 + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; 697 + 698 + rcu_assign_pointer(t->new_tbl, t_new); 699 + 700 + /* Wait RCU readers to see the new table, we do not want new 701 + * conns to go into old table and to be left there. 702 + */ 703 + synchronize_rcu(); 704 + 705 + ip_vs_rht_for_each_bucket(t, bucket, head) { 706 + same_bucket: 707 + if (++limit >= 16) { 708 + if (resched_score >= 100) { 709 + resched_score = 0; 710 + cond_resched(); 711 + } 712 + limit = 0; 713 + } 714 + if (hlist_bl_empty(head)) { 715 + resched_score++; 716 + continue; 717 + } 718 + /* Preemption calls ahead... */ 719 + resched_score = 0; 720 + 721 + /* seqcount_t usage considering PREEMPT_RT rules: 722 + * - other writers (SoftIRQ) => serialize with spin_lock_bh 723 + * - readers (SoftIRQ) => disable BHs 724 + * - readers (processes) => preemption should be disabled 725 + */ 726 + spin_lock_bh(&t->lock[bucket & t->lock_mask].l); 727 + preempt_disable_nested(); 728 + write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]); 729 + hlist_bl_lock(head); 730 + 731 + hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) { 732 + cp = ip_vs_hn_to_conn(hn); 733 + hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir); 734 + hash_key = ip_vs_rht_build_hash_key(t_new, hash); 735 + 736 + head2 = t_new->buckets + (hash & t_new->mask); 737 + hlist_bl_lock(head2); 738 + /* t_new->seqc are not used at this stage, we race 739 + * only with add/del, so only lock the bucket. 
740 + */ 741 + hlist_bl_del_rcu(&hn->node); 742 + WRITE_ONCE(hn->hash_key, hash_key); 743 + /* Keep both hash keys in sync if no double hashing */ 744 + if (!ip_vs_conn_use_hash2(cp)) 745 + WRITE_ONCE(cp->hn1.hash_key, hash_key); 746 + hlist_bl_add_head_rcu(&hn->node, head2); 747 + hlist_bl_unlock(head2); 748 + /* Too long chain? Do it in steps */ 749 + if (++limit >= 64) 750 + break; 751 + } 752 + 753 + hlist_bl_unlock(head); 754 + write_seqcount_end(&t->seqc[bucket & t->seqc_mask]); 755 + preempt_enable_nested(); 756 + spin_unlock_bh(&t->lock[bucket & t->lock_mask].l); 757 + if (limit >= 64) 758 + goto same_bucket; 759 + } 760 + 761 + rcu_assign_pointer(ipvs->conn_tab, t_new); 762 + /* Inform readers that new table is installed */ 763 + smp_mb__before_atomic(); 764 + atomic_inc(&ipvs->conn_tab_changes); 765 + 766 + /* RCU readers should not see more than two tables in chain. 767 + * To prevent new table to be attached wait here instead of 768 + * freeing the old table in RCU callback. 769 + */ 770 + synchronize_rcu(); 771 + ip_vs_rht_free(t); 772 + 773 + out: 774 + /* Monitor if we need to shrink table */ 775 + queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 776 + more_work ? 1 : 2 * HZ); 777 + } 648 778 649 779 /* 650 780 * Bind a connection entry with the corresponding packet_xmit. ··· 1194 792 IP_VS_DBG_ADDR(ct->daf, &ct->daddr), 1195 793 ntohs(ct->dport)); 1196 794 1197 - /* 1198 - * Invalidate the connection template 795 + /* Invalidate the connection template. Prefer to avoid 796 + * rehashing, it will move it as first in chain, so use 797 + * only dport as indication, it is not a hash key. 
1199 798 */ 1200 - if (ct->vport != htons(0xffff)) { 1201 - if (ip_vs_conn_unhash(ct)) { 1202 - ct->dport = htons(0xffff); 1203 - ct->vport = htons(0xffff); 1204 - ct->cport = 0; 1205 - ip_vs_conn_hash(ct); 1206 - } 1207 - } 799 + ct->dport = htons(0xffff); 1208 800 1209 801 /* 1210 802 * Simply decrease the refcnt of the template, ··· 1339 943 1340 944 1341 945 /* 1342 - * Create a new connection entry and hash it into the ip_vs_conn_tab 946 + * Create a new connection entry and hash it into the conn_tab 1343 947 */ 1344 948 struct ip_vs_conn * 1345 949 ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, ··· 1357 961 return NULL; 1358 962 } 1359 963 1360 - INIT_HLIST_NODE(&cp->c_list); 964 + INIT_HLIST_BL_NODE(&cp->hn0.node); 965 + INIT_HLIST_BL_NODE(&cp->hn1.node); 1361 966 timer_setup(&cp->timer, ip_vs_conn_expire, 0); 1362 967 cp->ipvs = ipvs; 968 + cp->hn0.dir = 0; 1363 969 cp->af = p->af; 970 + cp->hn1.dir = 1; 1364 971 cp->daf = dest_af; 1365 972 cp->protocol = p->protocol; 1366 973 ip_vs_addr_set(p->af, &cp->caddr, p->caddr); ··· 1444 1045 if (ip_vs_conntrack_enabled(ipvs)) 1445 1046 cp->flags |= IP_VS_CONN_F_NFCT; 1446 1047 1447 - /* Hash it in the ip_vs_conn_tab finally */ 1048 + /* Hash it in the conn_tab finally */ 1448 1049 ip_vs_conn_hash(cp); 1449 1050 1450 1051 return cp; ··· 1456 1057 #ifdef CONFIG_PROC_FS 1457 1058 struct ip_vs_iter_state { 1458 1059 struct seq_net_private p; 1459 - unsigned int bucket; 1060 + struct ip_vs_rht *t; 1061 + int gen; 1062 + u32 bucket; 1460 1063 unsigned int skip_elems; 1461 1064 }; 1462 1065 1463 - static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) 1066 + static void *ip_vs_conn_array(struct seq_file *seq) 1464 1067 { 1068 + struct ip_vs_iter_state *iter = seq->private; 1069 + struct net *net = seq_file_net(seq); 1070 + struct netns_ipvs *ipvs = net_ipvs(net); 1071 + struct ip_vs_rht *t = iter->t; 1072 + struct ip_vs_conn_hnode *hn; 1073 + struct hlist_bl_node *e; 1465 1074 int idx; 1466 - struct 
ip_vs_conn *cp; 1467 1075 1468 - for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { 1076 + if (!t) 1077 + return NULL; 1078 + for (idx = iter->bucket; idx < t->size; idx++) { 1469 1079 unsigned int skip = 0; 1470 1080 1471 - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1081 + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) { 1472 1082 /* __ip_vs_conn_get() is not needed by 1473 1083 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show 1474 1084 */ 1085 + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) 1086 + break; 1087 + if (hn->dir != 0) 1088 + continue; 1475 1089 if (skip >= iter->skip_elems) { 1476 1090 iter->bucket = idx; 1477 - return cp; 1091 + return hn; 1478 1092 } 1479 1093 1480 1094 ++skip; 1481 1095 } 1482 1096 1097 + if (!(idx & 31)) { 1098 + cond_resched_rcu(); 1099 + /* New table installed ? */ 1100 + if (iter->gen != atomic_read(&ipvs->conn_tab_changes)) 1101 + break; 1102 + } 1483 1103 iter->skip_elems = 0; 1484 - cond_resched_rcu(); 1485 1104 } 1486 1105 1487 1106 iter->bucket = idx; ··· 1510 1093 __acquires(RCU) 1511 1094 { 1512 1095 struct ip_vs_iter_state *iter = seq->private; 1096 + struct net *net = seq_file_net(seq); 1097 + struct netns_ipvs *ipvs = net_ipvs(net); 1513 1098 1514 1099 rcu_read_lock(); 1100 + iter->gen = atomic_read(&ipvs->conn_tab_changes); 1101 + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ 1102 + iter->t = rcu_dereference(ipvs->conn_tab); 1515 1103 if (*pos == 0) { 1516 1104 iter->skip_elems = 0; 1517 1105 iter->bucket = 0; 1518 1106 return SEQ_START_TOKEN; 1519 1107 } 1520 1108 1521 - return ip_vs_conn_array(iter); 1109 + return ip_vs_conn_array(seq); 1522 1110 } 1523 1111 1524 1112 static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1525 1113 { 1526 - struct ip_vs_conn *cp = v; 1527 1114 struct ip_vs_iter_state *iter = seq->private; 1528 - struct hlist_node *e; 1115 + struct ip_vs_conn_hnode *hn = v; 1116 + struct hlist_bl_node *e; 1117 + struct 
ip_vs_rht *t; 1529 1118 1530 1119 ++*pos; 1531 1120 if (v == SEQ_START_TOKEN) 1532 - return ip_vs_conn_array(iter); 1121 + return ip_vs_conn_array(seq); 1122 + 1123 + t = iter->t; 1124 + if (!t) 1125 + return NULL; 1533 1126 1534 1127 /* more on same hash chain? */ 1535 - e = rcu_dereference(hlist_next_rcu(&cp->c_list)); 1536 - if (e) { 1128 + hlist_bl_for_each_entry_continue_rcu(hn, e, node) { 1129 + /* Our cursor was moved to new table ? */ 1130 + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) 1131 + break; 1132 + if (hn->dir != 0) 1133 + continue; 1537 1134 iter->skip_elems++; 1538 - return hlist_entry(e, struct ip_vs_conn, c_list); 1135 + return hn; 1539 1136 } 1540 1137 1541 1138 iter->skip_elems = 0; 1542 1139 iter->bucket++; 1543 1140 1544 - return ip_vs_conn_array(iter); 1141 + return ip_vs_conn_array(seq); 1545 1142 } 1546 1143 1547 1144 static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) ··· 1571 1140 seq_puts(seq, 1572 1141 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); 1573 1142 else { 1574 - const struct ip_vs_conn *cp = v; 1575 - struct net *net = seq_file_net(seq); 1143 + struct ip_vs_conn_hnode *hn = v; 1144 + const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn); 1576 1145 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; 1577 1146 size_t len = 0; 1578 1147 char dbuf[IP_VS_ADDRSTRLEN]; 1579 1148 1580 - if (!net_eq(cp->ipvs->net, net)) 1581 - return 0; 1582 1149 if (cp->pe_data) { 1583 1150 pe_data[0] = ' '; 1584 1151 len = strlen(cp->pe->name); ··· 1648 1219 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 1649 1220 else { 1650 1221 const struct ip_vs_conn *cp = v; 1651 - struct net *net = seq_file_net(seq); 1652 - 1653 - if (!net_eq(cp->ipvs->net, net)) 1654 - return 0; 1655 1222 1656 1223 #ifdef CONFIG_IP_VS_IPV6 1657 1224 if (cp->daf == AF_INET6) ··· 1737 1312 return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); 1738 1313 } 1739 1314 1740 - /* Called from keventd and must 
protect itself from softirqs */ 1741 1315 void ip_vs_random_dropentry(struct netns_ipvs *ipvs) 1742 1316 { 1743 - int idx; 1317 + struct ip_vs_conn_hnode *hn; 1318 + struct hlist_bl_node *e; 1744 1319 struct ip_vs_conn *cp; 1320 + struct ip_vs_rht *t; 1321 + unsigned int r; 1322 + int idx; 1745 1323 1324 + r = get_random_u32(); 1746 1325 rcu_read_lock(); 1326 + t = rcu_dereference(ipvs->conn_tab); 1327 + if (!t) 1328 + goto out; 1747 1329 /* 1748 1330 * Randomly scan 1/32 of the whole table every second 1749 1331 */ 1750 - for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { 1751 - unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; 1332 + for (idx = 0; idx < (t->size >> 5); idx++) { 1333 + unsigned int hash = (r + idx) & t->mask; 1752 1334 1753 - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 1754 - if (cp->ipvs != ipvs) 1335 + /* Don't care if due to moved entry we jump to another bucket 1336 + * and even to new table 1337 + */ 1338 + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], node) { 1339 + if (hn->dir != 0) 1755 1340 continue; 1341 + cp = ip_vs_hn0_to_conn(hn); 1756 1342 if (atomic_read(&cp->n_control)) 1757 1343 continue; 1758 1344 if (cp->flags & IP_VS_CONN_F_TEMPLATE) { ··· 1810 1374 IP_VS_DBG(4, "drop connection\n"); 1811 1375 ip_vs_conn_del(cp); 1812 1376 } 1813 - cond_resched_rcu(); 1377 + if (!(idx & 31)) { 1378 + cond_resched_rcu(); 1379 + t = rcu_dereference(ipvs->conn_tab); 1380 + if (!t) 1381 + goto out; 1382 + } 1814 1383 } 1384 + 1385 + out: 1815 1386 rcu_read_unlock(); 1816 1387 } 1817 1388 #endif 1818 1389 1819 - /* 1820 - * Flush all the connection entries in the ip_vs_conn_tab 1821 - */ 1390 + /* Flush all the connection entries in the conn_tab */ 1822 1391 static void ip_vs_conn_flush(struct netns_ipvs *ipvs) 1823 1392 { 1824 - int idx; 1393 + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); 1825 1394 struct ip_vs_conn *cp, *cp_c; 1395 + struct ip_vs_conn_hnode *hn; 1396 + struct hlist_bl_head *head; 1397 + struct 
ip_vs_rht *t, *p; 1398 + struct hlist_bl_node *e; 1399 + 1400 + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) 1401 + return; 1402 + cancel_delayed_work_sync(&ipvs->conn_resize_work); 1403 + if (!atomic_read(&ipvs->conn_count)) 1404 + goto unreg; 1826 1405 1827 1406 flush_again: 1407 + /* Rely on RCU grace period while accessing cp after ip_vs_conn_del */ 1828 1408 rcu_read_lock(); 1829 - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1830 - 1831 - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1832 - if (cp->ipvs != ipvs) 1409 + ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) { 1410 + hlist_bl_for_each_entry_rcu(hn, e, head, node) { 1411 + if (hn->dir != 0) 1833 1412 continue; 1413 + cp = ip_vs_hn0_to_conn(hn); 1834 1414 if (atomic_read(&cp->n_control)) 1835 1415 continue; 1836 1416 cp_c = cp->control; ··· 1867 1415 schedule(); 1868 1416 goto flush_again; 1869 1417 } 1418 + 1419 + unreg: 1420 + /* Unregister the hash table and release it after RCU grace period. 1421 + * This is needed because other works may not be stopped yet and 1422 + * they may walk the tables. 
1423 + */ 1424 + t = rcu_dereference_protected(ipvs->conn_tab, 1); 1425 + rcu_assign_pointer(ipvs->conn_tab, NULL); 1426 + /* Inform readers that conn_tab is changed */ 1427 + smp_mb__before_atomic(); 1428 + atomic_inc(&ipvs->conn_tab_changes); 1429 + while (1) { 1430 + p = rcu_dereference_protected(t->new_tbl, 1); 1431 + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 1432 + if (p == t) 1433 + break; 1434 + t = p; 1435 + } 1870 1436 } 1871 1437 1872 1438 #ifdef CONFIG_SYSCTL 1873 1439 void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) 1874 1440 { 1875 - int idx; 1441 + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 1442 + unsigned int resched_score = 0; 1876 1443 struct ip_vs_conn *cp, *cp_c; 1444 + struct ip_vs_conn_hnode *hn; 1445 + struct hlist_bl_head *head; 1877 1446 struct ip_vs_dest *dest; 1447 + struct hlist_bl_node *e; 1448 + int old_gen, new_gen; 1878 1449 1450 + if (!atomic_read(&ipvs->conn_count)) 1451 + return; 1452 + old_gen = atomic_read(&ipvs->conn_tab_changes); 1879 1453 rcu_read_lock(); 1880 - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1881 - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1882 - if (cp->ipvs != ipvs) 1883 - continue; 1884 1454 1455 + repeat: 1456 + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ 1457 + ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) { 1458 + hlist_bl_for_each_entry_rcu(hn, e, head, node) { 1459 + if (hn->dir != 0) 1460 + continue; 1461 + cp = ip_vs_hn0_to_conn(hn); 1462 + resched_score++; 1885 1463 dest = cp->dest; 1886 1464 if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) 1887 1465 continue; ··· 1926 1444 IP_VS_DBG(4, "del controlling connection\n"); 1927 1445 ip_vs_conn_del(cp_c); 1928 1446 } 1447 + resched_score += 10; 1929 1448 } 1930 - cond_resched_rcu(); 1931 - 1932 - /* netns clean up started, abort delayed work */ 1933 - if (!READ_ONCE(ipvs->enable)) 1934 - break; 1449 + resched_score++; 1450 + if (resched_score >= 100) { 1451 + resched_score = 0; 1452 + cond_resched_rcu(); 
1453 + /* netns clean up started, abort delayed work */ 1454 + if (!READ_ONCE(ipvs->enable)) 1455 + goto out; 1456 + new_gen = atomic_read(&ipvs->conn_tab_changes); 1457 + /* New table installed ? */ 1458 + if (old_gen != new_gen) { 1459 + old_gen = new_gen; 1460 + goto repeat; 1461 + } 1462 + } 1935 1463 } 1464 + 1465 + out: 1936 1466 rcu_read_unlock(); 1937 1467 } 1938 1468 #endif ··· 1959 1465 atomic_set(&ipvs->conn_count, 0); 1960 1466 for (idx = 0; idx < IP_VS_AF_MAX; idx++) 1961 1467 atomic_set(&ipvs->no_cport_conns[idx], 0); 1468 + INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler); 1469 + RCU_INIT_POINTER(ipvs->conn_tab, NULL); 1470 + atomic_set(&ipvs->conn_tab_changes, 0); 1471 + ipvs->sysctl_conn_lfactor = ip_vs_conn_default_load_factor(ipvs); 1962 1472 1963 1473 #ifdef CONFIG_PROC_FS 1964 1474 if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, ··· 1998 1500 1999 1501 int __init ip_vs_conn_init(void) 2000 1502 { 1503 + int min = IP_VS_CONN_TAB_MIN_BITS; 1504 + int max = IP_VS_CONN_TAB_MAX_BITS; 2001 1505 size_t tab_array_size; 2002 1506 int max_avail; 2003 - #if BITS_PER_LONG > 32 2004 - int max = 27; 2005 - #else 2006 - int max = 20; 2007 - #endif 2008 - int min = 8; 2009 - int idx; 2010 1507 2011 1508 max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; 2012 - max_avail -= 2; /* ~4 in hash row */ 1509 + /* 64-bit: 27 bits at 64GB, 32-bit: 20 bits at 512MB */ 1510 + max_avail += 1; /* hash table loaded at 50% */ 2013 1511 max_avail -= 1; /* IPVS up to 1/2 of mem */ 2014 1512 max_avail -= order_base_2(sizeof(struct ip_vs_conn)); 2015 1513 max = clamp(max_avail, min, max); 2016 1514 ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); 2017 1515 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 2018 - ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; 2019 1516 2020 1517 /* 2021 1518 * Allocate the connection hash table and initialize its list heads 2022 1519 */ 2023 1520 tab_array_size = array_size(ip_vs_conn_tab_size, 2024 - 
sizeof(*ip_vs_conn_tab)); 2025 - ip_vs_conn_tab = kvmalloc_objs(*ip_vs_conn_tab, ip_vs_conn_tab_size); 2026 - if (!ip_vs_conn_tab) 2027 - return -ENOMEM; 1521 + sizeof(struct hlist_bl_head)); 2028 1522 2029 1523 /* Allocate ip_vs_conn slab cache */ 2030 1524 ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); 2031 - if (!ip_vs_conn_cachep) { 2032 - kvfree(ip_vs_conn_tab); 1525 + if (!ip_vs_conn_cachep) 2033 1526 return -ENOMEM; 2034 - } 2035 1527 2036 1528 pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n", 2037 1529 ip_vs_conn_tab_size, tab_array_size / 1024); 2038 1530 IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", 2039 1531 sizeof(struct ip_vs_conn)); 2040 - 2041 - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) 2042 - INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); 2043 - 2044 - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 2045 - spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); 2046 - } 2047 - 2048 - /* calculate the random value for connection hash */ 2049 - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 2050 1532 2051 1533 return 0; 2052 1534 } ··· 2037 1559 rcu_barrier(); 2038 1560 /* Release the empty cache */ 2039 1561 kmem_cache_destroy(ip_vs_conn_cachep); 2040 - kvfree(ip_vs_conn_tab); 2041 1562 }
+179
net/netfilter/ipvs/ip_vs_core.c
··· 117 117 INIT_LIST_HEAD(&table[rows]); 118 118 } 119 119 120 + /* IPVS Resizable Hash Tables: 121 + * - list_bl buckets with bit lock 122 + * 123 + * Goals: 124 + * - RCU lookup for entry can run in parallel with add/del/move operations 125 + * - hash keys can be on non-contiguous memory 126 + * - support entries with duplicate keys 127 + * - unlink entries without lookup, use the saved table and bucket id 128 + * - resizing can trigger on load change or depending on key refresh period 129 + * - customizable load factor to balance between speed and memory usage 130 + * - add/del/move operations should be allowed for any context 131 + * 132 + * Resizing: 133 + * - new table is attached to the current table and all entries are moved 134 + * with new hash key. Finally, the new table is installed as current one and 135 + * the old table is released after RCU grace period. 136 + * - RCU read-side critical sections will walk two tables while resizing is 137 + * in progress 138 + * - new entries are added to the new table 139 + * - entries will be deleted from the old or from the new table, the table_id 140 + * can be saved into entry as part of the hash key to know where the entry is 141 + * hashed 142 + * - move operations may delay readers or to cause retry for the modified 143 + * bucket. As result, searched entry will be found but walkers that operate 144 + * on multiple entries may see same entry twice if bucket walking is retried. 145 + * - for fast path the number of entries (load) can be compared to u_thresh 146 + * and l_thresh to decide when to trigger table growing/shrinking. 
They 147 + * are calculated based on load factor (shift count), negative value allows 148 + * load to be below 100% to reduce collisions by maintaining larger table 149 + * while positive value tolerates collisions by using smaller table and load 150 + * above 100%: u_thresh(load) = size * (2 ^ lfactor) 151 + * 152 + * Locking: 153 + * - lock: protect seqc if other context except resizer can move entries 154 + * - seqc: seqcount_t, delay/retry readers while entries are moved to 155 + * new table on resizing 156 + * - bit lock: serialize bucket modifications 157 + * - writers may use other locking mechanisms to serialize operations for 158 + * resizing, moving and installing new tables 159 + */ 160 + 161 + void ip_vs_rht_free(struct ip_vs_rht *t) 162 + { 163 + kvfree(t->buckets); 164 + kvfree(t->seqc); 165 + kvfree(t->lock); 166 + kfree(t); 167 + } 168 + 169 + void ip_vs_rht_rcu_free(struct rcu_head *head) 170 + { 171 + struct ip_vs_rht *t; 172 + 173 + t = container_of(head, struct ip_vs_rht, rcu_head); 174 + ip_vs_rht_free(t); 175 + } 176 + 177 + struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks) 178 + { 179 + struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL); 180 + int i; 181 + 182 + if (!t) 183 + return NULL; 184 + if (scounts) { 185 + int ml = roundup_pow_of_two(nr_cpu_ids); 186 + 187 + scounts = min(scounts, buckets); 188 + scounts = min(scounts, ml); 189 + t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL); 190 + if (!t->seqc) 191 + goto err; 192 + for (i = 0; i < scounts; i++) 193 + seqcount_init(&t->seqc[i]); 194 + 195 + if (locks) { 196 + locks = min(locks, scounts); 197 + t->lock = kvmalloc_array(locks, sizeof(*t->lock), 198 + GFP_KERNEL); 199 + if (!t->lock) 200 + goto err; 201 + for (i = 0; i < locks; i++) 202 + spin_lock_init(&t->lock[i].l); 203 + } 204 + } 205 + 206 + t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL); 207 + if (!t->buckets) 208 + goto err; 209 + for (i = 0; i < buckets; i++) 210 
+ INIT_HLIST_BL_HEAD(&t->buckets[i]); 211 + t->mask = buckets - 1; 212 + t->size = buckets; 213 + t->seqc_mask = scounts - 1; 214 + t->lock_mask = locks - 1; 215 + t->u_thresh = buckets; 216 + t->l_thresh = buckets >> 4; 217 + t->bits = order_base_2(buckets); 218 + /* new_tbl points to self if no new table is filled */ 219 + RCU_INIT_POINTER(t->new_tbl, t); 220 + get_random_bytes(&t->hash_key, sizeof(t->hash_key)); 221 + return t; 222 + 223 + err: 224 + ip_vs_rht_free(t); 225 + return NULL; 226 + } 227 + 228 + /* Get the desired table size for n entries based on current table size and 229 + * by using the formula size = n / (2^lfactor) 230 + * lfactor: shift value for the load factor: 231 + * - >0: u_thresh=size << lfactor, for load factor above 100% 232 + * - <0: u_thresh=size >> -lfactor, for load factor below 100% 233 + * - 0: for load factor of 100% 234 + */ 235 + int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, 236 + int lfactor, int min_bits, int max_bits) 237 + { 238 + if (!t) 239 + return 1 << min_bits; 240 + n = roundup_pow_of_two(n); 241 + if (lfactor < 0) { 242 + int factor = min(-lfactor, max_bits); 243 + 244 + n = min(n, 1 << (max_bits - factor)); 245 + n <<= factor; 246 + } else { 247 + n = min(n >> lfactor, 1 << max_bits); 248 + } 249 + if (lfactor != t->lfactor) 250 + return clamp(n, 1 << min_bits, 1 << max_bits); 251 + if (n > t->size) 252 + return n; 253 + if (n > t->size >> 4) 254 + return t->size; 255 + /* Shrink but keep it n * 2 to prevent frequent resizing */ 256 + return clamp(n << 1, 1 << min_bits, 1 << max_bits); 257 + } 258 + 259 + /* Set thresholds based on table size and load factor: 260 + * u_thresh = size * (2^lfactor) 261 + * l_thresh = u_thresh / 16 262 + * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink 263 + */ 264 + void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, 265 + int min_bits, int max_bits) 266 + { 267 + if (size >= 1 << max_bits) 268 + 
t->u_thresh = INT_MAX; /* stop growing */ 269 + else if (lfactor <= 0) 270 + t->u_thresh = size >> min(-lfactor, max_bits); 271 + else 272 + t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor; 273 + 274 + /* l_thresh: shrink when load is 16 times lower, can be 0 */ 275 + if (size >= 1 << max_bits) 276 + t->l_thresh = (1 << max_bits) >> 4; 277 + else if (size > 1 << min_bits) 278 + t->l_thresh = t->u_thresh >> 4; 279 + else 280 + t->l_thresh = 0; /* stop shrinking */ 281 + } 282 + 283 + /* Return hash value for local info (fast, insecure) */ 284 + u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, 285 + const union nf_inet_addr *addr, u32 v1, u32 v2) 286 + { 287 + u32 v3; 288 + 289 + #ifdef CONFIG_IP_VS_IPV6 290 + if (af == AF_INET6) 291 + v3 = ipv6_addr_hash(&addr->in6); 292 + else 293 + #endif 294 + v3 = addr->all[0]; 295 + 296 + return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]); 297 + } 298 + 120 299 static inline void 121 300 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 122 301 {
+572 -109
net/netfilter/ipvs/ip_vs_ctl.c
··· 29 29 #include <linux/netfilter.h> 30 30 #include <linux/netfilter_ipv4.h> 31 31 #include <linux/mutex.h> 32 + #include <linux/rcupdate_wait.h> 32 33 33 34 #include <net/net_namespace.h> 34 35 #include <linux/nsproxy.h> ··· 294 293 } 295 294 296 295 297 - 296 + /* Service hashing: 297 + * Operation Locking order 298 + * --------------------------------------------------------------------------- 299 + * add table service_mutex, svc_resize_sem(W) 300 + * del table service_mutex 301 + * move between tables svc_resize_sem(W), seqcount_t(W), bit lock 302 + * add/del service service_mutex, bit lock 303 + * find service RCU, seqcount_t(R) 304 + * walk services(blocking) service_mutex, svc_resize_sem(R) 305 + * walk services(non-blocking) RCU, seqcount_t(R) 306 + * 307 + * - new tables are linked/unlinked under service_mutex and svc_resize_sem 308 + * - new table is linked on resizing and all operations can run in parallel 309 + * in 2 tables until the new table is registered as current one 310 + * - two contexts can modify buckets: config and table resize, both in 311 + * process context 312 + * - only table resizer can move entries, so we do not protect t->seqc[] 313 + * items with t->lock[] 314 + * - lookups occur under RCU lock and seqcount reader lock to detect if 315 + * services are moved to new table 316 + * - move operations may disturb readers: find operation will not miss entries 317 + * but walkers may see same entry twice if they are forced to retry chains 318 + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold 319 + * service_mutex to disallow new tables to be installed or to check 320 + * svc_table_changes and repeat the RCU read section if new table is installed 321 + */ 298 322 299 323 /* 300 324 * Returns hash value for virtual service 301 325 */ 302 - static inline unsigned int 303 - ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, 326 + static inline u32 327 + ip_vs_svc_hashval(struct ip_vs_rht *t, int af, 
unsigned int proto, 304 328 const union nf_inet_addr *addr, __be16 port) 305 329 { 306 - unsigned int porth = ntohs(port); 307 - __be32 addr_fold = addr->ip; 308 - __u32 ahash; 309 - 310 - #ifdef CONFIG_IP_VS_IPV6 311 - if (af == AF_INET6) 312 - addr_fold = addr->ip6[0]^addr->ip6[1]^ 313 - addr->ip6[2]^addr->ip6[3]; 314 - #endif 315 - ahash = ntohl(addr_fold); 316 - ahash ^= ((size_t) ipvs >> 8); 317 - 318 - return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & 319 - IP_VS_SVC_TAB_MASK; 330 + return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto); 320 331 } 321 332 322 333 /* 323 334 * Returns hash value of fwmark for virtual service lookup 324 335 */ 325 - static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) 336 + static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af, 337 + __u32 fwmark) 326 338 { 327 - return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; 339 + return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]); 328 340 } 329 341 330 - /* 331 - * Hashes a service in the svc_table by <netns,proto,addr,port> 332 - * or by fwmark. 333 - * Should be called with locked tables. 
334 - */ 342 + /* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */ 335 343 static int ip_vs_svc_hash(struct ip_vs_service *svc) 336 344 { 337 - unsigned int hash; 345 + struct netns_ipvs *ipvs = svc->ipvs; 346 + struct hlist_bl_head *head; 347 + struct ip_vs_rht *t; 348 + u32 hash; 338 349 339 350 if (svc->flags & IP_VS_SVC_F_HASHED) { 340 351 pr_err("%s(): request for already hashed, called from %pS\n", ··· 354 341 return 0; 355 342 } 356 343 344 + /* increase its refcnt because it is referenced by the svc table */ 345 + atomic_inc(&svc->refcnt); 346 + 347 + /* New entries go into recent table */ 348 + t = rcu_dereference_protected(ipvs->svc_table, 1); 349 + t = rcu_dereference_protected(t->new_tbl, 1); 350 + 357 351 if (svc->fwmark == 0) { 358 352 /* 359 - * Hash it by <netns,protocol,addr,port> 353 + * Hash it by <protocol,addr,port> 360 354 */ 361 - hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, 355 + hash = ip_vs_svc_hashval(t, svc->af, svc->protocol, 362 356 &svc->addr, svc->port); 363 357 } else { 364 358 /* 365 359 * Hash it by fwmark 366 360 */ 367 - hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); 361 + hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark); 368 362 } 369 - hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); 370 - 363 + head = t->buckets + (hash & t->mask); 364 + hlist_bl_lock(head); 365 + WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash)); 371 366 svc->flags |= IP_VS_SVC_F_HASHED; 372 - /* increase its refcnt because it is referenced by the svc table */ 373 - atomic_inc(&svc->refcnt); 367 + hlist_bl_add_head_rcu(&svc->s_list, head); 368 + hlist_bl_unlock(head); 369 + 374 370 return 1; 375 371 } 376 372 ··· 390 368 */ 391 369 static int ip_vs_svc_unhash(struct ip_vs_service *svc) 392 370 { 371 + struct netns_ipvs *ipvs = svc->ipvs; 372 + struct hlist_bl_head *head; 373 + struct ip_vs_rht *t; 374 + u32 hash_key2; 375 + u32 hash_key; 376 + 393 377 if (!(svc->flags & 
IP_VS_SVC_F_HASHED)) { 394 378 pr_err("%s(): request for unhash flagged, called from %pS\n", 395 379 __func__, __builtin_return_address(0)); 396 380 return 0; 397 381 } 398 382 383 + t = rcu_dereference_protected(ipvs->svc_table, 1); 384 + hash_key = READ_ONCE(svc->hash_key); 385 + /* We need to lock the bucket in the right table */ 386 + if (ip_vs_rht_same_table(t, hash_key)) { 387 + head = t->buckets + (hash_key & t->mask); 388 + hlist_bl_lock(head); 389 + /* Ensure hash_key is read under lock */ 390 + hash_key2 = READ_ONCE(svc->hash_key); 391 + /* Moved to new table ? */ 392 + if (hash_key != hash_key2) { 393 + hlist_bl_unlock(head); 394 + t = rcu_dereference_protected(t->new_tbl, 1); 395 + head = t->buckets + (hash_key2 & t->mask); 396 + hlist_bl_lock(head); 397 + } 398 + } else { 399 + /* It is already moved to new table */ 400 + t = rcu_dereference_protected(t->new_tbl, 1); 401 + head = t->buckets + (hash_key & t->mask); 402 + hlist_bl_lock(head); 403 + } 399 404 /* Remove it from svc_table */ 400 - hlist_del_rcu(&svc->s_list); 405 + hlist_bl_del_rcu(&svc->s_list); 401 406 402 407 svc->flags &= ~IP_VS_SVC_F_HASHED; 403 408 atomic_dec(&svc->refcnt); 409 + hlist_bl_unlock(head); 404 410 return 1; 405 411 } 406 412 ··· 440 390 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, 441 391 const union nf_inet_addr *vaddr, __be16 vport) 442 392 { 443 - unsigned int hash; 393 + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 394 + struct hlist_bl_head *head; 444 395 struct ip_vs_service *svc; 396 + struct ip_vs_rht *t, *p; 397 + struct hlist_bl_node *e; 398 + u32 hash, hash_key; 445 399 446 - /* Check for "full" addressed entries */ 447 - hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); 400 + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { 401 + /* Check for "full" addressed entries */ 402 + hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport); 448 403 449 - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { 450 - if (svc->af == 
af && ip_vs_addr_equal(af, &svc->addr, vaddr) && 451 - svc->port == vport && svc->protocol == protocol && 452 - !svc->fwmark) { 453 - /* HIT */ 454 - return svc; 404 + hash_key = ip_vs_rht_build_hash_key(t, hash); 405 + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 406 + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 407 + if (READ_ONCE(svc->hash_key) == hash_key && 408 + svc->af == af && 409 + ip_vs_addr_equal(af, &svc->addr, vaddr) && 410 + svc->port == vport && 411 + svc->protocol == protocol && !svc->fwmark) { 412 + /* HIT */ 413 + return svc; 414 + } 415 + } 455 416 } 456 417 } 457 418 ··· 476 415 static inline struct ip_vs_service * 477 416 __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) 478 417 { 479 - unsigned int hash; 418 + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 419 + struct hlist_bl_head *head; 480 420 struct ip_vs_service *svc; 421 + struct ip_vs_rht *t, *p; 422 + struct hlist_bl_node *e; 423 + u32 hash, hash_key; 481 424 482 - /* Check for fwmark addressed entries */ 483 - hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); 425 + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { 426 + /* Check for fwmark addressed entries */ 427 + hash = ip_vs_svc_fwm_hashval(t, af, fwmark); 484 428 485 - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { 486 - if (svc->fwmark == fwmark && svc->af == af) { 487 - /* HIT */ 488 - return svc; 429 + hash_key = ip_vs_rht_build_hash_key(t, hash); 430 + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 431 + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 432 + if (READ_ONCE(svc->hash_key) == hash_key && 433 + svc->fwmark == fwmark && svc->af == af) { 434 + /* HIT */ 435 + return svc; 436 + } 437 + } 489 438 } 490 439 } 491 440 ··· 558 487 return svc; 559 488 } 560 489 490 + /* Return the number of registered services */ 491 + static int ip_vs_get_num_services(struct netns_ipvs *ipvs) 492 + { 493 + int ns = 0, ni = IP_VS_AF_MAX; 494 + 495 + while (--ni >= 0) 496 + ns += 
atomic_read(&ipvs->num_services[ni]); 497 + return ns; 498 + } 499 + 500 + /* Get default load factor to map num_services/u_thresh to t->size */ 501 + static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs) 502 + { 503 + int factor; 504 + 505 + if (net_eq(ipvs->net, &init_net)) 506 + factor = -3; /* grow if load is above 12.5% */ 507 + else 508 + factor = -2; /* grow if load is above 25% */ 509 + return factor; 510 + } 511 + 512 + /* Get the desired svc_table size */ 513 + static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, 514 + int lfactor) 515 + { 516 + return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs), 517 + lfactor, IP_VS_SVC_TAB_MIN_BITS, 518 + IP_VS_SVC_TAB_MAX_BITS); 519 + } 520 + 521 + /* Allocate svc_table */ 522 + static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs, 523 + int buckets, int lfactor) 524 + { 525 + struct ip_vs_rht *t; 526 + int scounts, locks; 527 + 528 + /* No frequent lookups to race with resizing, so use max of 64 529 + * seqcounts. Only resizer moves entries, so use 0 locks. 
530 + */ 531 + scounts = clamp(buckets >> 4, 1, 64); 532 + locks = 0; 533 + 534 + t = ip_vs_rht_alloc(buckets, scounts, locks); 535 + if (!t) 536 + return NULL; 537 + t->lfactor = lfactor; 538 + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS, 539 + IP_VS_SVC_TAB_MAX_BITS); 540 + return t; 541 + } 542 + 543 + /* svc_table resizer work */ 544 + static void svc_resize_work_handler(struct work_struct *work) 545 + { 546 + struct hlist_bl_head *head, *head2; 547 + struct ip_vs_rht *t_free = NULL; 548 + unsigned int resched_score = 0; 549 + struct hlist_bl_node *cn, *nn; 550 + struct ip_vs_rht *t, *t_new; 551 + struct ip_vs_service *svc; 552 + struct netns_ipvs *ipvs; 553 + bool more_work = true; 554 + seqcount_t *sc; 555 + int limit = 0; 556 + int new_size; 557 + int lfactor; 558 + u32 bucket; 559 + 560 + ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work); 561 + 562 + if (!down_write_trylock(&ipvs->svc_resize_sem)) 563 + goto out; 564 + if (!mutex_trylock(&ipvs->service_mutex)) 565 + goto unlock_sem; 566 + more_work = false; 567 + clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); 568 + if (!READ_ONCE(ipvs->enable) || 569 + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 570 + goto unlock_m; 571 + t = rcu_dereference_protected(ipvs->svc_table, 1); 572 + /* Do nothing if table is removed */ 573 + if (!t) 574 + goto unlock_m; 575 + /* New table needs to be registered? BUG! */ 576 + if (t != rcu_dereference_protected(t->new_tbl, 1)) 577 + goto unlock_m; 578 + 579 + lfactor = sysctl_svc_lfactor(ipvs); 580 + /* Should we resize ? 
*/ 581 + new_size = ip_vs_svc_desired_size(ipvs, t, lfactor); 582 + if (new_size == t->size && lfactor == t->lfactor) 583 + goto unlock_m; 584 + 585 + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); 586 + if (!t_new) { 587 + more_work = true; 588 + goto unlock_m; 589 + } 590 + /* Flip the table_id */ 591 + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; 592 + 593 + rcu_assign_pointer(t->new_tbl, t_new); 594 + /* Allow add/del to new_tbl while moving from old table */ 595 + mutex_unlock(&ipvs->service_mutex); 596 + 597 + ip_vs_rht_for_each_bucket(t, bucket, head) { 598 + same_bucket: 599 + if (++limit >= 16) { 600 + if (!READ_ONCE(ipvs->enable) || 601 + test_bit(IP_VS_WORK_SVC_NORESIZE, 602 + &ipvs->work_flags)) 603 + goto unlock_sem; 604 + if (resched_score >= 100) { 605 + resched_score = 0; 606 + cond_resched(); 607 + } 608 + limit = 0; 609 + } 610 + if (hlist_bl_empty(head)) { 611 + resched_score++; 612 + continue; 613 + } 614 + /* Preemption calls ahead... */ 615 + resched_score = 0; 616 + 617 + sc = &t->seqc[bucket & t->seqc_mask]; 618 + /* seqcount_t usage considering PREEMPT_RT rules: 619 + * - we are the only writer => preemption can be allowed 620 + * - readers (SoftIRQ) => disable BHs 621 + * - readers (processes) => preemption should be disabled 622 + */ 623 + local_bh_disable(); 624 + preempt_disable_nested(); 625 + write_seqcount_begin(sc); 626 + hlist_bl_lock(head); 627 + 628 + hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) { 629 + u32 hash; 630 + 631 + /* New hash for the new table */ 632 + if (svc->fwmark == 0) { 633 + /* Hash it by <protocol,addr,port> */ 634 + hash = ip_vs_svc_hashval(t_new, svc->af, 635 + svc->protocol, 636 + &svc->addr, svc->port); 637 + } else { 638 + /* Hash it by fwmark */ 639 + hash = ip_vs_svc_fwm_hashval(t_new, svc->af, 640 + svc->fwmark); 641 + } 642 + hlist_bl_del_rcu(&svc->s_list); 643 + head2 = t_new->buckets + (hash & t_new->mask); 644 + 645 + hlist_bl_lock(head2); 646 + 
WRITE_ONCE(svc->hash_key, 647 + ip_vs_rht_build_hash_key(t_new, hash)); 648 + /* t_new->seqc are not used at this stage, we race 649 + * only with add/del, so only lock the bucket. 650 + */ 651 + hlist_bl_add_head_rcu(&svc->s_list, head2); 652 + hlist_bl_unlock(head2); 653 + /* Too long chain? Do it in steps */ 654 + if (++limit >= 64) 655 + break; 656 + } 657 + 658 + hlist_bl_unlock(head); 659 + write_seqcount_end(sc); 660 + preempt_enable_nested(); 661 + local_bh_enable(); 662 + if (limit >= 64) 663 + goto same_bucket; 664 + } 665 + 666 + /* Tables can be switched only under service_mutex */ 667 + while (!mutex_trylock(&ipvs->service_mutex)) { 668 + cond_resched(); 669 + if (!READ_ONCE(ipvs->enable) || 670 + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 671 + goto unlock_sem; 672 + } 673 + if (!READ_ONCE(ipvs->enable) || 674 + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 675 + goto unlock_m; 676 + 677 + rcu_assign_pointer(ipvs->svc_table, t_new); 678 + /* Inform readers that new table is installed */ 679 + smp_mb__before_atomic(); 680 + atomic_inc(&ipvs->svc_table_changes); 681 + t_free = t; 682 + 683 + unlock_m: 684 + mutex_unlock(&ipvs->service_mutex); 685 + 686 + unlock_sem: 687 + up_write(&ipvs->svc_resize_sem); 688 + 689 + if (t_free) { 690 + /* RCU readers should not see more than two tables in chain. 691 + * To prevent new table to be attached wait here instead of 692 + * freeing the old table in RCU callback. 
693 + */ 694 + synchronize_rcu(); 695 + ip_vs_rht_free(t_free); 696 + } 697 + 698 + out: 699 + if (!READ_ONCE(ipvs->enable) || !more_work || 700 + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 701 + return; 702 + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); 703 + } 561 704 562 705 static inline void 563 706 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) ··· 1642 1357 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, 1643 1358 struct ip_vs_service **svc_p) 1644 1359 { 1645 - int ret = 0; 1646 1360 struct ip_vs_scheduler *sched = NULL; 1361 + struct ip_vs_rht *tc_new = NULL; 1362 + struct ip_vs_rht *t, *t_new = NULL; 1647 1363 int af_id = ip_vs_af_index(u->af); 1648 - struct ip_vs_pe *pe = NULL; 1649 1364 struct ip_vs_service *svc = NULL; 1365 + struct ip_vs_pe *pe = NULL; 1650 1366 int ret_hooks = -1; 1367 + int ret = 0; 1651 1368 1652 1369 /* increase the module use count */ 1653 1370 if (!ip_vs_use_count_inc()) ··· 1690 1403 goto out_err; 1691 1404 } 1692 1405 #endif 1406 + 1407 + t = rcu_dereference_protected(ipvs->svc_table, 1); 1408 + if (!t) { 1409 + int lfactor = sysctl_svc_lfactor(ipvs); 1410 + int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); 1411 + 1412 + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); 1413 + if (!t_new) { 1414 + ret = -ENOMEM; 1415 + goto out_err; 1416 + } 1417 + } 1418 + 1419 + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { 1420 + int lfactor = sysctl_conn_lfactor(ipvs); 1421 + int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor); 1422 + 1423 + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); 1424 + if (!tc_new) { 1425 + ret = -ENOMEM; 1426 + goto out_err; 1427 + } 1428 + } 1693 1429 1694 1430 if (!atomic_read(&ipvs->num_services[af_id])) { 1695 1431 ret = ip_vs_register_hooks(ipvs, u->af); ··· 1759 1449 if (ret < 0) 1760 1450 goto out_err; 1761 1451 1452 + if (t_new) { 1453 + clear_bit(IP_VS_WORK_SVC_NORESIZE, 
&ipvs->work_flags); 1454 + rcu_assign_pointer(ipvs->svc_table, t_new); 1455 + t_new = NULL; 1456 + } 1457 + if (tc_new) { 1458 + rcu_assign_pointer(ipvs->conn_tab, tc_new); 1459 + tc_new = NULL; 1460 + } 1461 + 1762 1462 /* Update the virtual service counters */ 1763 1463 if (svc->port == FTPPORT) 1764 1464 atomic_inc(&ipvs->ftpsvc_counter[af_id]); ··· 1790 1470 /* Hash the service into the service table */ 1791 1471 ip_vs_svc_hash(svc); 1792 1472 1473 + /* Schedule resize work */ 1474 + if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && 1475 + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) 1476 + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1477 + 1); 1478 + 1793 1479 *svc_p = svc; 1794 1480 1795 1481 if (!READ_ONCE(ipvs->enable)) { ··· 1810 1484 1811 1485 1812 1486 out_err: 1487 + if (tc_new) 1488 + ip_vs_rht_free(tc_new); 1489 + if (t_new) 1490 + ip_vs_rht_free(t_new); 1813 1491 if (ret_hooks >= 0) 1814 1492 ip_vs_unregister_hooks(ipvs, u->af); 1815 1493 if (svc != NULL) { ··· 2001 1671 */ 2002 1672 static int ip_vs_del_service(struct ip_vs_service *svc) 2003 1673 { 1674 + struct netns_ipvs *ipvs; 1675 + struct ip_vs_rht *t, *p; 1676 + int ns; 1677 + 2004 1678 if (svc == NULL) 2005 1679 return -EEXIST; 1680 + ipvs = svc->ipvs; 2006 1681 ip_vs_unlink_service(svc, false); 1682 + t = rcu_dereference_protected(ipvs->svc_table, 1); 2007 1683 1684 + /* Drop the table if no more services */ 1685 + ns = ip_vs_get_num_services(ipvs); 1686 + if (!ns) { 1687 + /* Stop the resizer and drop the tables */ 1688 + set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); 1689 + cancel_delayed_work_sync(&ipvs->svc_resize_work); 1690 + if (t) { 1691 + rcu_assign_pointer(ipvs->svc_table, NULL); 1692 + while (1) { 1693 + p = rcu_dereference_protected(t->new_tbl, 1); 1694 + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 1695 + if (p == t) 1696 + break; 1697 + t = p; 1698 + } 1699 + } 1700 + } else if (ns <= t->l_thresh && 1701 + 
!test_and_set_bit(IP_VS_WORK_SVC_RESIZE, 1702 + &ipvs->work_flags)) { 1703 + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1704 + 1); 1705 + } 2008 1706 return 0; 2009 1707 } 2010 1708 ··· 2042 1684 */ 2043 1685 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) 2044 1686 { 2045 - int idx; 1687 + DECLARE_IP_VS_RHT_WALK_BUCKETS(); 1688 + struct hlist_bl_head *head; 2046 1689 struct ip_vs_service *svc; 2047 - struct hlist_node *n; 1690 + struct hlist_bl_node *ne; 1691 + struct hlist_bl_node *e; 1692 + struct ip_vs_rht *t, *p; 2048 1693 2049 - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2050 - hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], 2051 - s_list) 2052 - ip_vs_unlink_service(svc, cleanup); 1694 + /* Stop the resizer and drop the tables */ 1695 + if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 1696 + cancel_delayed_work_sync(&ipvs->svc_resize_work); 1697 + /* No resizer, so now we have exclusive write access */ 1698 + 1699 + if (ip_vs_get_num_services(ipvs)) { 1700 + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 1701 + hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) 1702 + ip_vs_unlink_service(svc, cleanup); 1703 + } 1704 + } 1705 + 1706 + /* Unregister the hash table and release it after RCU grace period */ 1707 + t = rcu_dereference_protected(ipvs->svc_table, 1); 1708 + if (t) { 1709 + rcu_assign_pointer(ipvs->svc_table, NULL); 1710 + while (1) { 1711 + p = rcu_dereference_protected(t->new_tbl, 1); 1712 + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 1713 + if (p == t) 1714 + break; 1715 + t = p; 1716 + } 2053 1717 } 2054 1718 return 0; 2055 1719 } ··· 2122 1742 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 2123 1743 struct net *net = dev_net(dev); 2124 1744 struct netns_ipvs *ipvs = net_ipvs(net); 1745 + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 1746 + unsigned int resched_score = 0; 1747 + struct hlist_bl_head *head; 2125 1748 struct ip_vs_service *svc; 1749 + struct hlist_bl_node 
*e; 2126 1750 struct ip_vs_dest *dest; 2127 - unsigned int idx; 1751 + int old_gen, new_gen; 2128 1752 2129 1753 if (event != NETDEV_DOWN || !ipvs) 2130 1754 return NOTIFY_DONE; 2131 1755 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); 1756 + 1757 + old_gen = atomic_read(&ipvs->svc_table_changes); 1758 + 2132 1759 rcu_read_lock(); 2133 - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2134 - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) 1760 + 1761 + repeat: 1762 + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 1763 + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 1764 + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2135 1765 list_for_each_entry_rcu(dest, &svc->destinations, 2136 - n_list) 1766 + n_list) { 2137 1767 ip_vs_forget_dev(dest, dev); 1768 + resched_score += 10; 1769 + } 1770 + resched_score++; 1771 + } 1772 + resched_score++; 1773 + if (resched_score >= 100) { 1774 + resched_score = 0; 1775 + cond_resched_rcu(); 1776 + new_gen = atomic_read(&ipvs->svc_table_changes); 1777 + /* New table installed ? 
*/ 1778 + if (old_gen != new_gen) { 1779 + old_gen = new_gen; 1780 + goto repeat; 1781 + } 1782 + } 2138 1783 } 2139 1784 rcu_read_unlock(); 2140 1785 ··· 2182 1777 2183 1778 static int ip_vs_zero_all(struct netns_ipvs *ipvs) 2184 1779 { 2185 - int idx; 1780 + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 1781 + unsigned int resched_score = 0; 1782 + struct hlist_bl_head *head; 2186 1783 struct ip_vs_service *svc; 1784 + struct hlist_bl_node *e; 2187 1785 2188 - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2189 - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) 1786 + rcu_read_lock(); 1787 + 1788 + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 1789 + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2190 1790 ip_vs_zero_service(svc); 1791 + resched_score += 10; 1792 + } 1793 + resched_score++; 1794 + if (resched_score >= 100) { 1795 + resched_score = 0; 1796 + cond_resched_rcu(); 1797 + } 2191 1798 } 1799 + 1800 + rcu_read_unlock(); 2192 1801 2193 1802 ip_vs_zero_stats(&ipvs->tot_stats->s); 2194 1803 return 0; ··· 2637 2218 2638 2219 struct ip_vs_iter { 2639 2220 struct seq_net_private p; /* Do not move this, netns depends upon it*/ 2640 - int bucket; 2221 + struct ip_vs_rht *t; 2222 + u32 bucket; 2641 2223 }; 2642 2224 2643 2225 /* ··· 2659 2239 } 2660 2240 } 2661 2241 2662 - 2242 + /* Do not expect consistent view during add, del and move(table resize). 2243 + * We may miss entries and even show duplicates. 
2244 + */ 2663 2245 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) 2664 2246 { 2665 - struct net *net = seq_file_net(seq); 2666 - struct netns_ipvs *ipvs = net_ipvs(net); 2667 2247 struct ip_vs_iter *iter = seq->private; 2668 - int idx; 2248 + struct ip_vs_rht *t = iter->t; 2669 2249 struct ip_vs_service *svc; 2250 + struct hlist_bl_node *e; 2251 + int idx; 2670 2252 2671 - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2672 - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { 2253 + if (!t) 2254 + return NULL; 2255 + for (idx = 0; idx < t->size; idx++) { 2256 + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { 2257 + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2258 + break; 2673 2259 if (pos-- == 0) { 2674 2260 iter->bucket = idx; 2675 2261 return svc; ··· 2688 2262 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 2689 2263 __acquires(RCU) 2690 2264 { 2265 + struct ip_vs_iter *iter = seq->private; 2266 + struct net *net = seq_file_net(seq); 2267 + struct netns_ipvs *ipvs = net_ipvs(net); 2268 + 2691 2269 rcu_read_lock(); 2270 + iter->t = rcu_dereference(ipvs->svc_table); 2692 2271 return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; 2693 2272 } 2694 2273 2695 2274 2696 2275 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2697 2276 { 2698 - struct hlist_node *e; 2699 - struct ip_vs_iter *iter; 2700 2277 struct ip_vs_service *svc; 2701 - struct net *net = seq_file_net(seq); 2702 - struct netns_ipvs *ipvs = net_ipvs(net); 2278 + struct ip_vs_iter *iter; 2279 + struct hlist_bl_node *e; 2280 + struct ip_vs_rht *t; 2703 2281 2704 2282 ++*pos; 2705 2283 if (v == SEQ_START_TOKEN) ··· 2711 2281 2712 2282 svc = v; 2713 2283 iter = seq->private; 2284 + t = iter->t; 2285 + if (!t) 2286 + return NULL; 2714 2287 2715 - e = rcu_dereference(hlist_next_rcu(&svc->s_list)); 2716 - if (e) 2717 - return hlist_entry(e, struct ip_vs_service, s_list); 2288 + hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { 2289 + /* Our cursor was moved to new table ? */ 2290 + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2291 + break; 2292 + return svc; 2293 + } 2718 2294 2719 - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { 2720 - hlist_for_each_entry_rcu(svc, 2721 - &ipvs->svc_table[iter->bucket], 2722 - s_list) { 2295 + while (++iter->bucket < t->size) { 2296 + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], 2297 + s_list) { 2298 + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2299 + break; 2723 2300 return svc; 2724 2301 } 2725 2302 } ··· 3207 2770 const struct ip_vs_get_services *get, 3208 2771 struct ip_vs_get_services __user *uptr) 3209 2772 { 3210 - int idx, count=0; 3211 - struct ip_vs_service *svc; 3212 2773 struct ip_vs_service_entry entry; 2774 + DECLARE_IP_VS_RHT_WALK_BUCKETS(); 2775 + struct hlist_bl_head *head; 2776 + struct ip_vs_service *svc; 2777 + struct hlist_bl_node *e; 2778 + int count = 0; 3213 2779 int ret = 0; 3214 2780 3215 - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 3216 - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { 2781 + lockdep_assert_held(&ipvs->svc_resize_sem); 
2782 + /* All service modifications are disabled, go ahead */ 2783 + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 2784 + hlist_bl_for_each_entry(svc, e, head, s_list) { 3217 2785 /* Only expose IPv4 entries to old interface */ 3218 2786 if (svc->af != AF_INET) 3219 2787 continue; ··· 3390 2948 return ret; 3391 2949 } 3392 2950 2951 + if (cmd == IP_VS_SO_GET_SERVICES) { 2952 + struct ip_vs_get_services *get; 2953 + size_t size; 2954 + 2955 + get = (struct ip_vs_get_services *)arg; 2956 + size = struct_size(get, entrytable, get->num_services); 2957 + if (*len != size) { 2958 + pr_err("length: %u != %zu\n", *len, size); 2959 + return -EINVAL; 2960 + } 2961 + /* Protect against table resizer moving the entries. 2962 + * Try reverse locking, so that we do not hold the mutex 2963 + * while waiting for semaphore. 2964 + */ 2965 + while (1) { 2966 + ret = down_read_killable(&ipvs->svc_resize_sem); 2967 + if (ret < 0) 2968 + return ret; 2969 + if (mutex_trylock(&ipvs->service_mutex)) 2970 + break; 2971 + up_read(&ipvs->svc_resize_sem); 2972 + cond_resched(); 2973 + } 2974 + ret = __ip_vs_get_service_entries(ipvs, get, user); 2975 + up_read(&ipvs->svc_resize_sem); 2976 + mutex_unlock(&ipvs->service_mutex); 2977 + return ret; 2978 + } 2979 + 3393 2980 mutex_lock(&ipvs->service_mutex); 3394 2981 switch (cmd) { 3395 2982 case IP_VS_SO_GET_VERSION: ··· 3444 2973 atomic_read(&ipvs->num_services[IP_VS_AF_INET]); 3445 2974 if (copy_to_user(user, &info, sizeof(info)) != 0) 3446 2975 ret = -EFAULT; 3447 - } 3448 - break; 3449 - 3450 - case IP_VS_SO_GET_SERVICES: 3451 - { 3452 - struct ip_vs_get_services *get; 3453 - size_t size; 3454 - 3455 - get = (struct ip_vs_get_services *)arg; 3456 - size = struct_size(get, entrytable, get->num_services); 3457 - if (*len != size) { 3458 - pr_err("length: %u != %zu\n", *len, size); 3459 - ret = -EINVAL; 3460 - goto out; 3461 - } 3462 - ret = __ip_vs_get_service_entries(ipvs, get, user); 3463 2976 } 3464 2977 break; 3465 2978 ··· 3732 3277 
static int ip_vs_genl_dump_services(struct sk_buff *skb, 3733 3278 struct netlink_callback *cb) 3734 3279 { 3735 - int idx = 0, i; 3736 - int start = cb->args[0]; 3737 - struct ip_vs_service *svc; 3280 + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); 3738 3281 struct net *net = sock_net(skb->sk); 3739 3282 struct netns_ipvs *ipvs = net_ipvs(net); 3283 + struct hlist_bl_head *head; 3284 + struct ip_vs_service *svc; 3285 + struct hlist_bl_node *e; 3286 + int start = cb->args[0]; 3287 + int idx = 0; 3740 3288 3289 + down_read(&ipvs->svc_resize_sem); 3741 3290 rcu_read_lock(); 3742 - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { 3743 - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[i], s_list) { 3291 + ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { 3292 + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 3744 3293 if (++idx <= start) 3745 3294 continue; 3746 3295 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { ··· 3756 3297 3757 3298 nla_put_failure: 3758 3299 rcu_read_unlock(); 3300 + up_read(&ipvs->svc_resize_sem); 3759 3301 cb->args[0] = idx; 3760 3302 3761 3303 return skb->len; ··· 4766 4306 4767 4307 /* Initialize service_mutex, svc_table per netns */ 4768 4308 __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); 4769 - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) 4770 - INIT_HLIST_HEAD(&ipvs->svc_table[idx]); 4309 + init_rwsem(&ipvs->svc_resize_sem); 4310 + INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); 4311 + atomic_set(&ipvs->svc_table_changes, 0); 4312 + RCU_INIT_POINTER(ipvs->svc_table, NULL); 4771 4313 4772 4314 /* Initialize rs_table */ 4773 4315 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) ··· 4788 4326 } 4789 4327 4790 4328 INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); 4329 + ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs); 4791 4330 4792 4331 /* procfs stats */ 4793 4332 ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
+2 -2
net/netfilter/ipvs/ip_vs_pe_sip.c
··· 132 132 } 133 133 134 134 static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, 135 - u32 initval, bool inverse) 135 + struct ip_vs_rht *t, bool inverse) 136 136 { 137 - return jhash(p->pe_data, p->pe_data_len, initval); 137 + return jhash(p->pe_data, p->pe_data_len, (u32)t->hash_key.key[0]); 138 138 } 139 139 140 140 static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+23
net/netfilter/ipvs/ip_vs_sync.c
··· 1755 1755 if (!ip_vs_use_count_inc()) 1756 1756 return -ENOPROTOOPT; 1757 1757 1758 + /* Backup server can be started without services just to sync conns, 1759 + * make sure conn_tab is created even if ipvs->enable is 0. 1760 + */ 1761 + if (state == IP_VS_STATE_BACKUP) { 1762 + mutex_lock(&ipvs->service_mutex); 1763 + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { 1764 + int lfactor = sysctl_conn_lfactor(ipvs); 1765 + int new_size = ip_vs_conn_desired_size(ipvs, NULL, 1766 + lfactor); 1767 + struct ip_vs_rht *tc_new; 1768 + 1769 + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); 1770 + if (!tc_new) { 1771 + mutex_unlock(&ipvs->service_mutex); 1772 + result = -ENOMEM; 1773 + goto out_module; 1774 + } 1775 + rcu_assign_pointer(ipvs->conn_tab, tc_new); 1776 + } 1777 + mutex_unlock(&ipvs->service_mutex); 1778 + } 1779 + 1758 1780 /* Do not hold one mutex and then to block on another */ 1759 1781 for (;;) { 1760 1782 rtnl_lock(); ··· 1944 1922 mutex_unlock(&ipvs->sync_mutex); 1945 1923 rtnl_unlock(); 1946 1924 1925 + out_module: 1947 1926 /* decrease the module use count */ 1948 1927 ip_vs_use_count_dec(); 1949 1928 return result;
+12 -4
net/netfilter/nf_log_syslog.c
··· 165 165 static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, 166 166 struct sock *sk) 167 167 { 168 + const struct socket *sock; 169 + const struct file *file; 170 + 168 171 if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) 169 172 return; 170 173 171 - read_lock_bh(&sk->sk_callback_lock); 172 - if (sk->sk_socket && sk->sk_socket->file) { 173 - const struct cred *cred = sk->sk_socket->file->f_cred; 174 + /* The sk pointer remains valid as long as the skb is. The sk_socket and 175 + * file pointer may become NULL if the socket is closed. Both structures 176 + * (including file->cred) are RCU freed which means they can be accessed 177 + * within a RCU read section. 178 + */ 179 + sock = READ_ONCE(sk->sk_socket); 180 + file = sock ? READ_ONCE(sock->file) : NULL; 181 + if (file) { 182 + const struct cred *cred = file->f_cred; 174 183 175 184 nf_log_buf_add(m, "UID=%u GID=%u ", 176 185 from_kuid_munged(&init_user_ns, cred->fsuid), 177 186 from_kgid_munged(&init_user_ns, cred->fsgid)); 178 187 } 179 - read_unlock_bh(&sk->sk_callback_lock); 180 188 } 181 189 182 190 static noinline_for_stack int
+13 -10
net/netfilter/nft_meta.c
··· 131 131 u32 *dest, 132 132 const struct nft_pktinfo *pkt) 133 133 { 134 - struct sock *sk = skb_to_full_sk(pkt->skb); 135 - struct socket *sock; 134 + const struct sock *sk = skb_to_full_sk(pkt->skb); 135 + const struct socket *sock; 136 + const struct file *file; 136 137 137 138 if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) 138 139 return false; 139 140 140 - read_lock_bh(&sk->sk_callback_lock); 141 - sock = sk->sk_socket; 142 - if (!sock || !sock->file) { 143 - read_unlock_bh(&sk->sk_callback_lock); 141 + /* The sk pointer remains valid as long as the skb is. The sk_socket and 142 + * file pointer may become NULL if the socket is closed. Both structures 143 + * (including file->cred) are RCU freed which means they can be accessed 144 + * within a RCU read section. 145 + */ 146 + sock = READ_ONCE(sk->sk_socket); 147 + file = sock ? READ_ONCE(sock->file) : NULL; 148 + if (!file) 144 149 return false; 145 - } 146 150 147 151 switch (key) { 148 152 case NFT_META_SKUID: 149 153 *dest = from_kuid_munged(sock_net(sk)->user_ns, 150 - sock->file->f_cred->fsuid); 154 + file->f_cred->fsuid); 151 155 break; 152 156 case NFT_META_SKGID: 153 157 *dest = from_kgid_munged(sock_net(sk)->user_ns, 154 - sock->file->f_cred->fsgid); 158 + file->f_cred->fsgid); 155 159 break; 156 160 default: 157 161 break; 158 162 } 159 163 160 - read_unlock_bh(&sk->sk_callback_lock); 161 164 return true; 162 165 } 163 166
+14 -14
net/netfilter/xt_owner.c
··· 63 63 owner_mt(const struct sk_buff *skb, struct xt_action_param *par) 64 64 { 65 65 const struct xt_owner_match_info *info = par->matchinfo; 66 - const struct file *filp; 67 66 struct sock *sk = skb_to_full_sk(skb); 68 67 struct net *net = xt_net(par); 68 + const struct socket *sock; 69 + const struct file *filp; 69 70 70 - if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) 71 + if (!sk || !READ_ONCE(sk->sk_socket) || !net_eq(net, sock_net(sk))) 71 72 return (info->match ^ info->invert) == 0; 72 73 else if (info->match & info->invert & XT_OWNER_SOCKET) 73 74 /* ··· 77 76 */ 78 77 return false; 79 78 80 - read_lock_bh(&sk->sk_callback_lock); 81 - filp = sk->sk_socket ? sk->sk_socket->file : NULL; 82 - if (filp == NULL) { 83 - read_unlock_bh(&sk->sk_callback_lock); 79 + /* The sk pointer remains valid as long as the skb is. The sk_socket and 80 + * file pointer may become NULL if the socket is closed. Both structures 81 + * (including file->cred) are RCU freed which means they can be accessed 82 + * within a RCU read section. 83 + */ 84 + sock = READ_ONCE(sk->sk_socket); 85 + filp = sock ? 
READ_ONCE(sock->file) : NULL; 86 + if (filp == NULL) 84 87 return ((info->match ^ info->invert) & 85 88 (XT_OWNER_UID | XT_OWNER_GID)) == 0; 86 - } 87 89 88 90 if (info->match & XT_OWNER_UID) { 89 91 kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); 90 92 kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); 93 + 91 94 if ((uid_gte(filp->f_cred->fsuid, uid_min) && 92 95 uid_lte(filp->f_cred->fsuid, uid_max)) ^ 93 - !(info->invert & XT_OWNER_UID)) { 94 - read_unlock_bh(&sk->sk_callback_lock); 96 + !(info->invert & XT_OWNER_UID)) 95 97 return false; 96 - } 97 98 } 98 99 99 100 if (info->match & XT_OWNER_GID) { ··· 120 117 } 121 118 } 122 119 123 - if (match ^ !(info->invert & XT_OWNER_GID)) { 124 - read_unlock_bh(&sk->sk_callback_lock); 120 + if (match ^ !(info->invert & XT_OWNER_GID)) 125 121 return false; 126 - } 127 122 } 128 123 129 - read_unlock_bh(&sk->sk_callback_lock); 130 124 return true; 131 125 } 132 126