Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

ipv6: icmp: Add RFC 5837 support

Add the ability to append the incoming IP interface information to
ICMPv6 error messages in accordance with RFC 5837 and RFC 4884. This is
required for more meaningful traceroute results in unnumbered networks.

The feature is disabled by default and controlled via a new sysctl
("net.ipv6.icmp.errors_extension_mask") which accepts a bitmask of ICMP
extensions to append to ICMP error messages. Currently, only a single
extension (one bit of the mask) is supported, but the interface and the
implementation should be able to support more extensions, if needed.

Clone the skb and copy the relevant data portions before modifying the
skb, as the caller of icmp6_send() still owns the skb after the function
returns. The overhead of the extra clone and copy should be acceptable
since by default ICMP error messages are rate limited to 1000 per second
and no more than 1 per second per specific host.

Trim or pad the packet to 128 bytes before appending the ICMP extension
structure in order to be compatible with legacy applications that assume
that the ICMP extension structure always starts at this offset (the
minimum length specified by RFC 4884).

Since commit 20e1954fe238 ("ipv6: RFC 4884 partial support for SIT/GRE
tunnels") it is possible for icmp6_send() to be called with an skb that
already contains ICMP extensions. This can happen when we receive an
ICMPv4 message with extensions from a tunnel and translate it to an
ICMPv6 message towards an IPv6 host in the overlay network. I could not
find an RFC that supports this behavior, but it makes sense to not
overwrite the original extensions that were appended to the packet.
Therefore, avoid appending extensions if the length field in the
provided ICMPv6 header is already filled.

Export netdev_copy_name() using EXPORT_IPV6_MOD_GPL() to make it
available to IPv6 when it is built as a module.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251027082232.232571-3-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Ido Schimmel and committed by
Jakub Kicinski
d12d04d2 f0e7036f

+232 -2
+17
Documentation/networking/ip-sysctl.rst
··· 3279 3279 3280 3280 Default: 0 (disabled) 3281 3281 3282 + errors_extension_mask - UNSIGNED INTEGER 3283 + Bitmask of ICMP extensions to append to ICMPv6 error messages 3284 + ("Destination Unreachable" and "Time Exceeded"). The original datagram 3285 + is trimmed / padded to 128 bytes in order to be compatible with 3286 + applications that do not comply with RFC 4884. 3287 + 3288 + Possible extensions are: 3289 + 3290 + ==== ============================================================== 3291 + 0x01 Incoming IP interface information according to RFC 5837. 3292 + Extension will include the index, IPv6 address (if present), 3293 + name and MTU of the IP interface that received the datagram 3294 + which elicited the ICMP error. 3295 + ==== ============================================================== 3296 + 3297 + Default: 0x00 (no extensions) 3298 + 3282 3299 xfrm6_gc_thresh - INTEGER 3283 3300 (Obsolete since linux-4.14) 3284 3301 The threshold at which we will start garbage collecting for IPv6
+1
include/net/netns/ipv6.h
··· 56 56 u8 skip_notify_on_dev_down; 57 57 u8 fib_notify_on_flag_change; 58 58 u8 icmpv6_error_anycast_as_unicast; 59 + u8 icmpv6_errors_extension_mask; 59 60 }; 60 61 61 62 struct netns_ipv6 {
+1
net/core/dev.c
··· 1163 1163 strscpy(name, dev->name, IFNAMSIZ); 1164 1164 } while (read_seqretry(&netdev_rename_lock, seq)); 1165 1165 } 1166 + EXPORT_IPV6_MOD_GPL(netdev_copy_name); 1166 1167 1167 1168 /** 1168 1169 * netdev_get_name - get a netdevice name, knowing its ifindex.
+1
net/ipv6/af_inet6.c
··· 960 960 net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; 961 961 net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; 962 962 net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0; 963 + net->ipv6.sysctl.icmpv6_errors_extension_mask = 0; 963 964 964 965 /* By default, rate limit error messages. 965 966 * Except for pmtu discovery, it would break it.
+212 -2
net/ipv6/icmp.c
··· 444 444 return icmp6_dev(skb)->ifindex; 445 445 } 446 446 447 + struct icmp6_ext_iio_addr6_subobj { 448 + __be16 afi; 449 + __be16 reserved; 450 + struct in6_addr addr6; 451 + }; 452 + 453 + static unsigned int icmp6_ext_iio_len(void) 454 + { 455 + return sizeof(struct icmp_extobj_hdr) + 456 + /* ifIndex */ 457 + sizeof(__be32) + 458 + /* Interface Address Sub-Object */ 459 + sizeof(struct icmp6_ext_iio_addr6_subobj) + 460 + /* Interface Name Sub-Object. Length must be a multiple of 4 461 + * bytes. 462 + */ 463 + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + 464 + /* MTU */ 465 + sizeof(__be32); 466 + } 467 + 468 + static unsigned int icmp6_ext_max_len(u8 ext_objs) 469 + { 470 + unsigned int ext_max_len; 471 + 472 + ext_max_len = sizeof(struct icmp_ext_hdr); 473 + 474 + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) 475 + ext_max_len += icmp6_ext_iio_len(); 476 + 477 + return ext_max_len; 478 + } 479 + 480 + static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev) 481 + { 482 + struct inet6_dev *in6_dev; 483 + struct inet6_ifaddr *ifa; 484 + 485 + in6_dev = __in6_dev_get(dev); 486 + if (!in6_dev) 487 + return NULL; 488 + 489 + /* It is unclear from RFC 5837 which IP address should be chosen, but 490 + * it makes sense to choose a global unicast address. 
491 + */ 492 + list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) { 493 + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED)) 494 + continue; 495 + if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST || 496 + ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL) 497 + continue; 498 + return &ifa->addr; 499 + } 500 + 501 + return NULL; 502 + } 503 + 504 + static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb, 505 + int iif) 506 + { 507 + struct icmp_ext_iio_name_subobj *name_subobj; 508 + struct icmp_extobj_hdr *objh; 509 + struct net_device *dev; 510 + struct in6_addr *addr6; 511 + __be32 data; 512 + 513 + if (!iif) 514 + return; 515 + 516 + /* Add the fields in the order specified by RFC 5837. */ 517 + objh = skb_put(skb, sizeof(*objh)); 518 + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; 519 + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); 520 + 521 + data = htonl(iif); 522 + skb_put_data(skb, &data, sizeof(__be32)); 523 + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; 524 + 525 + rcu_read_lock(); 526 + 527 + dev = dev_get_by_index_rcu(net, iif); 528 + if (!dev) 529 + goto out; 530 + 531 + addr6 = icmp6_ext_iio_addr6_find(dev); 532 + if (addr6) { 533 + struct icmp6_ext_iio_addr6_subobj *addr6_subobj; 534 + 535 + addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj)); 536 + addr6_subobj->afi = htons(ICMP_AFI_IP6); 537 + addr6_subobj->addr6 = *addr6; 538 + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; 539 + } 540 + 541 + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); 542 + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); 543 + netdev_copy_name(dev, name_subobj->name); 544 + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; 545 + 546 + data = htonl(READ_ONCE(dev->mtu)); 547 + skb_put_data(skb, &data, sizeof(__be32)); 548 + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; 549 + 550 + out: 551 + rcu_read_unlock(); 552 + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); 553 + } 554 
+ 555 + static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb, 556 + u8 ext_objs, int iif) 557 + { 558 + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) 559 + icmp6_ext_iio_iif_append(net, skb, iif); 560 + } 561 + 562 + static struct sk_buff * 563 + icmp6_ext_append(struct net *net, struct sk_buff *skb_in, 564 + struct icmp6hdr *icmp6h, unsigned int room, int iif) 565 + { 566 + unsigned int payload_len, ext_max_len, ext_len; 567 + struct icmp_ext_hdr *ext_hdr; 568 + struct sk_buff *skb; 569 + u8 ext_objs; 570 + int nhoff; 571 + 572 + switch (icmp6h->icmp6_type) { 573 + case ICMPV6_DEST_UNREACH: 574 + case ICMPV6_TIME_EXCEED: 575 + break; 576 + default: 577 + return NULL; 578 + } 579 + 580 + /* Do not overwrite existing extensions. This can happen when we 581 + * receive an ICMPv4 message with extensions from a tunnel and 582 + * translate it to an ICMPv6 message towards an IPv6 host in the 583 + * overlay network. 584 + */ 585 + if (icmp6h->icmp6_datagram_len) 586 + return NULL; 587 + 588 + ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask); 589 + if (!ext_objs) 590 + return NULL; 591 + 592 + ext_max_len = icmp6_ext_max_len(ext_objs); 593 + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) 594 + return NULL; 595 + 596 + skb = skb_clone(skb_in, GFP_ATOMIC); 597 + if (!skb) 598 + return NULL; 599 + 600 + nhoff = skb_network_offset(skb); 601 + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); 602 + 603 + if (!pskb_network_may_pull(skb, payload_len)) 604 + goto free_skb; 605 + 606 + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || 607 + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) 608 + goto free_skb; 609 + 610 + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) 611 + goto free_skb; 612 + 613 + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); 614 + ext_hdr->version = ICMP_EXT_VERSION_2; 615 + 616 + icmp6_ext_objs_append(net, skb, ext_objs, iif); 617 + 618 + /* Do not send an empty 
extension structure. */ 619 + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; 620 + if (ext_len == sizeof(*ext_hdr)) 621 + goto free_skb; 622 + 623 + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); 624 + /* The length of the original datagram in 64-bit words (RFC 4884). */ 625 + icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64); 626 + 627 + return skb; 628 + 629 + free_skb: 630 + consume_skb(skb); 631 + return NULL; 632 + } 633 + 447 634 /* 448 635 * Send an ICMP message in response to a packet in error 449 636 */ ··· 645 458 struct ipv6_pinfo *np; 646 459 const struct in6_addr *saddr = NULL; 647 460 bool apply_ratelimit = false; 461 + struct sk_buff *ext_skb; 648 462 struct dst_entry *dst; 463 + unsigned int room; 649 464 struct icmp6hdr tmp_hdr; 650 465 struct flowi6 fl6; 651 466 struct icmpv6_msg msg; ··· 801 612 msg.offset = skb_network_offset(skb); 802 613 msg.type = type; 803 614 804 - len = skb->len - msg.offset; 805 - len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); 615 + room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr); 616 + ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif); 617 + if (ext_skb) 618 + msg.skb = ext_skb; 619 + 620 + len = msg.skb->len - msg.offset; 621 + len = min_t(unsigned int, len, room); 806 622 if (len < 0) { 807 623 net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", 808 624 &hdr->saddr, &hdr->daddr); ··· 829 635 } 830 636 831 637 out_dst_release: 638 + if (ext_skb) 639 + consume_skb(ext_skb); 832 640 dst_release(dst); 833 641 out_unlock: 834 642 icmpv6_xmit_unlock(sk); ··· 1367 1171 EXPORT_SYMBOL(icmpv6_err_convert); 1368 1172 1369 1173 #ifdef CONFIG_SYSCTL 1174 + 1175 + static u32 icmpv6_errors_extension_mask_all = 1176 + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); 1177 + 1370 1178 static struct ctl_table ipv6_icmp_table_template[] = { 1371 1179 { 1372 1180 .procname = "ratelimit", ··· 1416 1216 .extra1 = 
SYSCTL_ZERO, 1417 1217 .extra2 = SYSCTL_ONE, 1418 1218 }, 1219 + { 1220 + .procname = "errors_extension_mask", 1221 + .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask, 1222 + .maxlen = sizeof(u8), 1223 + .mode = 0644, 1224 + .proc_handler = proc_dou8vec_minmax, 1225 + .extra1 = SYSCTL_ZERO, 1226 + .extra2 = &icmpv6_errors_extension_mask_all, 1227 + }, 1419 1228 }; 1420 1229 1421 1230 struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) ··· 1442 1233 table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; 1443 1234 table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; 1444 1235 table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast; 1236 + table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask; 1445 1237 } 1446 1238 return table; 1447 1239 }