Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net: ipv4: Add ip_mr_output()

Multicast routing is currently handled only in the input path: locally
generated MC packets never reach the IPMR code. Thus, if a VXLAN remote
address is multicast, the driver needs to set an OIF during route lookup,
and, as a consequence, the MC routing configuration needs to be kept in
sync with the VXLAN FDB and MDB. Ideally, the VXLAN packets would be routed
by the MC routing code instead.

To that end, this patch adds support to route locally generated multicast
packets. The newly-added routines do largely what ip_mr_input() and
ip_mr_forward() do: make an MR cache lookup to find where to send the
packets, and use ip_mc_output() to send each of them. When no cache entry
is found, the packet is punted to the daemon for resolution.

However, an installation that uses a VXLAN underlay netdevice for which it
also has matching MC routes would see its routing change with this patch.
Previously, the MC packets would be delivered directly to the underlay
port, whereas now they would be MC-routed. In order to avoid this change in
behavior, introduce an IPCB flag, IPSKB_MCROUTE. Only if the flag is set
will ip_mr_output() actually engage; otherwise it falls back to
ip_mc_output().

This code is based on work by Roopa Prabhu and Nikolay Aleksandrov.

Signed-off-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/0aadbd49330471c0f758d54afb05eb3b6e3a6b65.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Petr Machata and committed by
Jakub Kicinski
35bec72a b2e653bc

+120 -1
+2
include/net/ip.h
··· 59 59 #define IPSKB_L3SLAVE BIT(7) 60 60 #define IPSKB_NOPOLICY BIT(8) 61 61 #define IPSKB_MULTIPATH BIT(9) 62 + #define IPSKB_MCROUTE BIT(10) 62 63 63 64 u16 frag_max_size; 64 65 }; ··· 168 167 int ip_local_deliver(struct sk_buff *skb); 169 168 void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); 170 169 int ip_mr_input(struct sk_buff *skb); 170 + int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); 171 171 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); 172 172 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); 173 173 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+117
net/ipv4/ipmr.c
··· 1965 1965 kfree_skb(skb); 1966 1966 } 1967 1967 1968 + static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt, 1969 + struct sk_buff *skb, int vifi) 1970 + { 1971 + if (ipmr_prepare_xmit(net, mrt, skb, vifi)) 1972 + goto out_free; 1973 + 1974 + ip_mc_output(net, NULL, skb); 1975 + return; 1976 + 1977 + out_free: 1978 + kfree_skb(skb); 1979 + } 1980 + 1968 1981 /* Called with mrt_lock or rcu_read_lock() */ 1969 1982 static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) 1970 1983 { ··· 2235 2222 return ip_local_deliver(skb); 2236 2223 kfree_skb(skb); 2237 2224 return 0; 2225 + } 2226 + 2227 + static void ip_mr_output_finish(struct net *net, struct mr_table *mrt, 2228 + struct net_device *dev, struct sk_buff *skb, 2229 + struct mfc_cache *c) 2230 + { 2231 + int psend = -1; 2232 + int ct; 2233 + 2234 + atomic_long_inc(&c->_c.mfc_un.res.pkt); 2235 + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); 2236 + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); 2237 + 2238 + /* Forward the frame */ 2239 + if (c->mfc_origin == htonl(INADDR_ANY) && 2240 + c->mfc_mcastgrp == htonl(INADDR_ANY)) { 2241 + if (ip_hdr(skb)->ttl > 2242 + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { 2243 + /* It's an (*,*) entry and the packet is not coming from 2244 + * the upstream: forward the packet to the upstream 2245 + * only. 
2246 + */ 2247 + psend = c->_c.mfc_parent; 2248 + goto last_xmit; 2249 + } 2250 + goto dont_xmit; 2251 + } 2252 + 2253 + for (ct = c->_c.mfc_un.res.maxvif - 1; 2254 + ct >= c->_c.mfc_un.res.minvif; ct--) { 2255 + if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { 2256 + if (psend != -1) { 2257 + struct sk_buff *skb2; 2258 + 2259 + skb2 = skb_clone(skb, GFP_ATOMIC); 2260 + if (skb2) 2261 + ipmr_queue_output_xmit(net, mrt, 2262 + skb2, psend); 2263 + } 2264 + psend = ct; 2265 + } 2266 + } 2267 + 2268 + last_xmit: 2269 + if (psend != -1) { 2270 + ipmr_queue_output_xmit(net, mrt, skb, psend); 2271 + return; 2272 + } 2273 + 2274 + dont_xmit: 2275 + kfree_skb(skb); 2276 + } 2277 + 2278 + /* Multicast packets for forwarding arrive here 2279 + * Called with rcu_read_lock(); 2280 + */ 2281 + int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) 2282 + { 2283 + struct rtable *rt = skb_rtable(skb); 2284 + struct mfc_cache *cache; 2285 + struct net_device *dev; 2286 + struct mr_table *mrt; 2287 + int vif; 2288 + 2289 + WARN_ON_ONCE(!rcu_read_lock_held()); 2290 + dev = rt->dst.dev; 2291 + 2292 + if (IPCB(skb)->flags & IPSKB_FORWARDED) 2293 + goto mc_output; 2294 + if (!(IPCB(skb)->flags & IPSKB_MCROUTE)) 2295 + goto mc_output; 2296 + 2297 + skb->dev = dev; 2298 + 2299 + mrt = ipmr_rt_fib_lookup(net, skb); 2300 + if (IS_ERR(mrt)) 2301 + goto mc_output; 2302 + 2303 + /* already under rcu_read_lock() */ 2304 + cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 2305 + if (!cache) { 2306 + vif = ipmr_find_vif(mrt, dev); 2307 + if (vif >= 0) 2308 + cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, 2309 + vif); 2310 + } 2311 + 2312 + /* No usable cache entry */ 2313 + if (!cache) { 2314 + vif = ipmr_find_vif(mrt, dev); 2315 + if (vif >= 0) 2316 + return ipmr_cache_unresolved(mrt, vif, skb, dev); 2317 + goto mc_output; 2318 + } 2319 + 2320 + vif = cache->_c.mfc_parent; 2321 + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) 2322 + goto 
mc_output; 2323 + 2324 + ip_mr_output_finish(net, mrt, dev, skb, cache); 2325 + return 0; 2326 + 2327 + mc_output: 2328 + return ip_mc_output(net, sk, skb); 2238 2329 } 2239 2330 2240 2331 #ifdef CONFIG_IP_PIMSM_V1
+1 -1
net/ipv4/route.c
··· 2660 2660 if (IN_DEV_MFORWARD(in_dev) && 2661 2661 !ipv4_is_local_multicast(fl4->daddr)) { 2662 2662 rth->dst.input = ip_mr_input; 2663 - rth->dst.output = ip_mc_output; 2663 + rth->dst.output = ip_mr_output; 2664 2664 } 2665 2665 } 2666 2666 #endif