diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9b710abac4253..d41a4996213f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2897,8 +2897,34 @@ union bpf_attr { * * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: - * Indicate the new IP header version after decapsulating the outer - * IP header. Used when the inner and outer IP versions are different. + * Indicate the new IP header version after decapsulating the + * outer IP header. Used when the inner and outer IP versions + * are different. These flags only trigger a protocol change + * without clearing any tunnel-specific GSO flags. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_GRE**: + * Clear GRE tunnel GSO flags (SKB_GSO_GRE and SKB_GSO_GRE_CSUM) + * when decapsulating a GRE tunnel. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_UDP**: + * Clear UDP tunnel GSO flags (SKB_GSO_UDP_TUNNEL and + * SKB_GSO_UDP_TUNNEL_CSUM) when decapsulating a UDP tunnel. + * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP4**: + * Clear IPIP/SIT tunnel GSO flag (SKB_GSO_IPXIP4) when decapsulating + * a tunnel with an outer IPv4 header (IPv4-in-IPv4 or IPv6-in-IPv4). + * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP6**: + * Clear IPv6 encapsulation tunnel GSO flag (SKB_GSO_IPXIP6) when + * decapsulating a tunnel with an outer IPv6 header (IPv6-in-IPv6 + * or IPv4-in-IPv6). + * + * When using the decapsulation flags above, the skb->encapsulation + * flag is automatically cleared if all tunnel-specific GSO flags + * (SKB_GSO_UDP_TUNNEL, SKB_GSO_UDP_TUNNEL_CSUM, SKB_GSO_GRE, + * SKB_GSO_GRE_CSUM, SKB_GSO_IPXIP4, SKB_GSO_IPXIP6, SKB_GSO_ESP) + * have been removed from the packet. This handles cases where + * all tunnel layers have been decapsulated. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers @@ -6087,7 +6113,7 @@ enum { }; /* BPF_FUNC_skb_adjust_room flags. 
*/ -enum { +enum bpf_adj_room_flags { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), @@ -6097,6 +6123,10 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), + BPF_F_ADJ_ROOM_DECAP_L4_GRE = (1ULL << 9), + BPF_F_ADJ_ROOM_DECAP_L4_UDP = (1ULL << 10), + BPF_F_ADJ_ROOM_DECAP_IPXIP4 = (1ULL << 11), + BPF_F_ADJ_ROOM_DECAP_IPXIP6 = (1ULL << 12), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index 2d6f978ba9c96..7bc9176402df7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -3420,14 +3421,27 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_DECAP_L3_IPV6) -#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ - BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ +#define BPF_F_ADJ_ROOM_DECAP_L4_MASK (BPF_F_ADJ_ROOM_DECAP_L4_UDP | \ + BPF_F_ADJ_ROOM_DECAP_L4_GRE) + +#define BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK (BPF_F_ADJ_ROOM_DECAP_IPXIP4 | \ + BPF_F_ADJ_ROOM_DECAP_IPXIP6) + +#define BPF_F_ADJ_ROOM_ENCAP_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ - BPF_F_ADJ_ROOM_DECAP_L3_MASK) + BPF_ADJ_ROOM_ENCAP_L2_MASK)) + +#define BPF_F_ADJ_ROOM_DECAP_MASK (BPF_F_ADJ_ROOM_DECAP_L3_MASK | \ + BPF_F_ADJ_ROOM_DECAP_L4_MASK | \ + BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_MASK | \ + BPF_F_ADJ_ROOM_DECAP_MASK | \ + BPF_F_ADJ_ROOM_NO_CSUM_RESET) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3547,8 +3561,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { 
int ret; - if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | - BPF_F_ADJ_ROOM_DECAP_L3_MASK | + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_DECAP_MASK | + BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; @@ -3582,9 +3596,48 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) skb_increase_gso_size(shinfo, len_diff); + /* Selective GSO flag clearing based on decap type. + * Only clear the flags for the tunnel layer being removed. + */ + if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) && + (shinfo->gso_type & (SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM))) + shinfo->gso_type &= ~(SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM); + if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_GRE) && + (shinfo->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) + shinfo->gso_type &= ~(SKB_GSO_GRE | + SKB_GSO_GRE_CSUM); + if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP4) && + (shinfo->gso_type & SKB_GSO_IPXIP4)) + shinfo->gso_type &= ~SKB_GSO_IPXIP4; + if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP6) && + (shinfo->gso_type & SKB_GSO_IPXIP6)) + shinfo->gso_type &= ~SKB_GSO_IPXIP6; + + /* Clear encapsulation flag only when no tunnel GSO flags remain */ + if (flags & (BPF_F_ADJ_ROOM_DECAP_L4_MASK | + BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK)) { + if (!(shinfo->gso_type & (SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPXIP4 | + SKB_GSO_IPXIP6 | + SKB_GSO_ESP))) + if (skb->encapsulation) + skb->encapsulation = 0; + } + /* Header must be checked, and gso_segs recomputed. 
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; + } else { + /* For non-GSO packets, clear encapsulation if decap flags are set */ + if ((flags & (BPF_F_ADJ_ROOM_DECAP_L4_MASK | + BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK)) && + skb->encapsulation) + skb->encapsulation = 0; } return 0; @@ -3644,8 +3697,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | - BPF_F_ADJ_ROOM_NO_CSUM_RESET))) + if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3664,20 +3716,53 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOTSUPP; } - if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (flags & BPF_F_ADJ_ROOM_DECAP_MASK) { + u32 len_decap_min = 0; + if (!shrink) return -EINVAL; - switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { - case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: + /* Reject mutually exclusive decap flag pairs. */ + if ((flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) == + BPF_F_ADJ_ROOM_DECAP_L3_MASK) + return -EINVAL; + + if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_MASK) == + BPF_F_ADJ_ROOM_DECAP_L4_MASK) + return -EINVAL; + + if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK) == + BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK) + return -EINVAL; + + /* Reject mutually exclusive decap tunnel type flags. 
*/ + if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_MASK) && + (flags & BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK)) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_DECAP_L4_MASK) + len_decap_min += bpf_skb_net_base_len(skb); + + if (flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) + len_decap_min += sizeof(struct udphdr); + + if (flags & BPF_F_ADJ_ROOM_DECAP_L4_GRE) + len_decap_min += sizeof(struct gre_base_hdr); + + if (flags & BPF_F_ADJ_ROOM_DECAP_IPXIP4) + len_decap_min += sizeof(struct iphdr); + + if (flags & BPF_F_ADJ_ROOM_DECAP_IPXIP6) + len_decap_min += sizeof(struct ipv6hdr); + + if (len_diff_abs < len_decap_min) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) len_min = sizeof(struct iphdr); - break; - case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: + + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) len_min = sizeof(struct ipv6hdr); - break; - default: - return -EINVAL; - } } len_cur = skb->len - skb_network_offset(skb); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 021c0676ff204..fcc22bbcd6e2b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2897,8 +2897,34 @@ union bpf_attr { * * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: - * Indicate the new IP header version after decapsulating the outer - * IP header. Used when the inner and outer IP versions are different. + * Indicate the new IP header version after decapsulating the + * outer IP header. Used when the inner and outer IP versions + * are different. These flags only trigger a protocol change + * without clearing any tunnel-specific GSO flags. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_GRE**: + * Clear GRE tunnel GSO flags (SKB_GSO_GRE and SKB_GSO_GRE_CSUM) + * when decapsulating a GRE tunnel. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_UDP**: + * Clear UDP tunnel GSO flags (SKB_GSO_UDP_TUNNEL and + * SKB_GSO_UDP_TUNNEL_CSUM) when decapsulating a UDP tunnel. 
+ * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP4**: + * Clear IPIP/SIT tunnel GSO flag (SKB_GSO_IPXIP4) when decapsulating + * a tunnel with an outer IPv4 header (IPv4-in-IPv4 or IPv6-in-IPv4). + * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP6**: + * Clear IPv6 encapsulation tunnel GSO flag (SKB_GSO_IPXIP6) when + * decapsulating a tunnel with an outer IPv6 header (IPv6-in-IPv6 + * or IPv4-in-IPv6). + * + * When using the decapsulation flags above, the skb->encapsulation + * flag is automatically cleared if all tunnel-specific GSO flags + * (SKB_GSO_UDP_TUNNEL, SKB_GSO_UDP_TUNNEL_CSUM, SKB_GSO_GRE, + * SKB_GSO_GRE_CSUM, SKB_GSO_IPXIP4, SKB_GSO_IPXIP6, SKB_GSO_ESP) + * have been removed from the packet. This handles cases where + * all tunnel layers have been decapsulated. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers @@ -6087,7 +6113,7 @@ enum { }; /* BPF_FUNC_skb_adjust_room flags. */ -enum { +enum bpf_adj_room_flags { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), @@ -6097,6 +6123,10 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), + BPF_F_ADJ_ROOM_DECAP_L4_GRE = (1ULL << 9), + BPF_F_ADJ_ROOM_DECAP_L4_UDP = (1ULL << 10), + BPF_F_ADJ_ROOM_DECAP_IPXIP4 = (1ULL << 11), + BPF_F_ADJ_ROOM_DECAP_IPXIP6 = (1ULL << 12), }; enum { diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 404124a938927..a8adb1311091a 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -597,7 +597,8 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb) return TC_ACT_OK; } -static int decap_internal(struct __sk_buff *skb, int off, int len, char proto, + __u64 
ipxip_flag) { __u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO; struct ipv6_opt_hdr ip6_opt_hdr; @@ -607,10 +608,12 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) switch (proto) { case IPPROTO_IPIP: - flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4; + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | + ipxip_flag; break; case IPPROTO_IPV6: - flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6; + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6 | + ipxip_flag; break; case NEXTHDR_DEST: if (bpf_skb_load_bytes(skb, off + len, &ip6_opt_hdr, @@ -618,10 +621,12 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) return TC_ACT_OK; switch (ip6_opt_hdr.nexthdr) { case IPPROTO_IPIP: - flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4; + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | + ipxip_flag; break; case IPPROTO_IPV6: - flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6; + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6 | + ipxip_flag; break; default: return TC_ACT_OK; @@ -629,6 +634,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) break; case IPPROTO_GRE: olen += sizeof(struct gre_hdr); + flags |= BPF_F_ADJ_ROOM_DECAP_L4_GRE; if (bpf_skb_load_bytes(skb, off + len, &greh, sizeof(greh)) < 0) return TC_ACT_OK; switch (bpf_ntohs(greh.protocol)) { @@ -642,6 +648,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) break; case IPPROTO_UDP: olen += sizeof(struct udphdr); + flags |= BPF_F_ADJ_ROOM_DECAP_L4_UDP; if (bpf_skb_load_bytes(skb, off + len, &udph, sizeof(udph)) < 0) return TC_ACT_OK; switch (bpf_ntohs(udph.dest)) { @@ -678,7 +685,8 @@ static int decap_ipv4(struct __sk_buff *skb) return TC_ACT_OK; return decap_internal(skb, ETH_HLEN, sizeof(iph_outer), - iph_outer.protocol); + iph_outer.protocol, + BPF_F_ADJ_ROOM_DECAP_IPXIP4); } static int decap_ipv6(struct __sk_buff *skb) @@ -690,7 +698,8 @@ static int decap_ipv6(struct __sk_buff *skb) return TC_ACT_OK; return decap_internal(skb, ETH_HLEN, sizeof(iph_outer), - iph_outer.nexthdr); + 
iph_outer.nexthdr, + BPF_F_ADJ_ROOM_DECAP_IPXIP6); } SEC("decap")