Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

netfilter: nft_set_rbtree: revisit array resize logic

Chris Arges reports high memory consumption with thousands of
containers, this patch revisits the array allocation logic.

For anonymous sets, start with 16 slots (which take 256 bytes on x86_64).
Expand the array by x2 until a threshold of 512 slots is reached; over that
threshold, expand it by x1.5.

For non-anonymous sets, start with 1024 slots in the array (which take 16
Kbytes initially on x86_64). Expand it by x1.5.

Use set->ndeact to subtract deactivated elements when calculating the
number of slots in the array, otherwise the array size gets
increased artificially. Add special-case shrink logic to deal with set
flush too.

The shrink logic is skipped for anonymous sets.

Use check_add_overflow() to calculate the new array size.

Add a WARN_ON_ONCE check to make sure elements fit into the new array
size.

Reported-by: Chris Arges <carges@cloudflare.com>
Fixes: 7e43e0a1141d ("netfilter: nft_set_rbtree: translate rbtree to array for binary search")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

+75 -17
+75 -17
net/netfilter/nft_set_rbtree.c
··· 572 572 return array; 573 573 } 574 574 575 - #define NFT_ARRAY_EXTRA_SIZE 10240 576 - 577 575 /* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider 578 576 * packed representation coming from userspace for anonymous sets too. 579 577 */ 580 578 static u32 nft_array_elems(const struct nft_set *set) 581 579 { 582 - u32 nelems = atomic_read(&set->nelems); 580 + u32 nelems = atomic_read(&set->nelems) - set->ndeact; 583 581 584 582 /* Adjacent intervals are represented with a single start element in 585 583 * anonymous sets, use the current element counter as is. ··· 593 595 return (nelems / 2) + 2; 594 596 } 595 597 596 - static int nft_array_may_resize(const struct nft_set *set) 598 + #define NFT_ARRAY_INITIAL_SIZE 1024 599 + #define NFT_ARRAY_INITIAL_ANON_SIZE 16 600 + #define NFT_ARRAY_INITIAL_ANON_THRESH (8192U / sizeof(struct nft_array_interval)) 601 + 602 + static int nft_array_may_resize(const struct nft_set *set, bool flush) 597 603 { 598 - u32 nelems = nft_array_elems(set), new_max_intervals; 604 + u32 initial_intervals, max_intervals, new_max_intervals, delta; 605 + u32 shrinked_max_intervals, nelems = nft_array_elems(set); 599 606 struct nft_rbtree *priv = nft_set_priv(set); 600 607 struct nft_array *array; 601 608 602 - if (!priv->array_next) { 603 - array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE); 609 + if (nft_set_is_anonymous(set)) 610 + initial_intervals = NFT_ARRAY_INITIAL_ANON_SIZE; 611 + else 612 + initial_intervals = NFT_ARRAY_INITIAL_SIZE; 613 + 614 + if (priv->array_next) { 615 + max_intervals = priv->array_next->max_intervals; 616 + new_max_intervals = priv->array_next->max_intervals; 617 + } else { 618 + if (priv->array) { 619 + max_intervals = priv->array->max_intervals; 620 + new_max_intervals = priv->array->max_intervals; 621 + } else { 622 + max_intervals = 0; 623 + new_max_intervals = initial_intervals; 624 + } 625 + } 626 + 627 + if (nft_set_is_anonymous(set)) 628 + goto maybe_grow; 629 + 630 + if 
(flush) { 631 + /* Set flush just started, nelems still report elements.*/ 632 + nelems = 0; 633 + new_max_intervals = NFT_ARRAY_INITIAL_SIZE; 634 + goto realloc_array; 635 + } 636 + 637 + if (check_add_overflow(new_max_intervals, new_max_intervals, 638 + &shrinked_max_intervals)) 639 + return -EOVERFLOW; 640 + 641 + shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3); 642 + 643 + if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE && 644 + nelems < shrinked_max_intervals) { 645 + new_max_intervals = shrinked_max_intervals; 646 + goto realloc_array; 647 + } 648 + maybe_grow: 649 + if (nelems > new_max_intervals) { 650 + if (nft_set_is_anonymous(set) && 651 + new_max_intervals < NFT_ARRAY_INITIAL_ANON_THRESH) { 652 + new_max_intervals <<= 1; 653 + } else { 654 + delta = new_max_intervals >> 1; 655 + if (check_add_overflow(new_max_intervals, delta, 656 + &new_max_intervals)) 657 + return -EOVERFLOW; 658 + } 659 + } 660 + 661 + realloc_array: 662 + if (WARN_ON_ONCE(nelems > new_max_intervals)) 663 + return -ENOMEM; 664 + 665 + if (priv->array_next) { 666 + if (max_intervals == new_max_intervals) 667 + return 0; 668 + 669 + if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) 670 + return -ENOMEM; 671 + } else { 672 + array = nft_array_alloc(new_max_intervals); 604 673 if (!array) 605 674 return -ENOMEM; 606 675 607 676 priv->array_next = array; 608 677 } 609 - 610 - if (nelems < priv->array_next->max_intervals) 611 - return 0; 612 - 613 - new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE; 614 - if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) 615 - return -ENOMEM; 616 678 617 679 return 0; 618 680 } ··· 688 630 689 631 nft_rbtree_maybe_reset_start_cookie(priv, tstamp); 690 632 691 - if (nft_array_may_resize(set) < 0) 633 + if (nft_array_may_resize(set, false) < 0) 692 634 return -ENOMEM; 693 635 694 636 do { ··· 799 741 nft_rbtree_interval_null(set, this)) 800 742 priv->start_rbe_cookie = 
0; 801 743 802 - if (nft_array_may_resize(set) < 0) 744 + if (nft_array_may_resize(set, false) < 0) 803 745 return NULL; 804 746 805 747 while (parent != NULL) { ··· 869 811 870 812 switch (iter->type) { 871 813 case NFT_ITER_UPDATE_CLONE: 872 - if (nft_array_may_resize(set) < 0) { 814 + if (nft_array_may_resize(set, true) < 0) { 873 815 iter->err = -ENOMEM; 874 816 break; 875 817 }