From 769d2a48d67d828c0befc854e9e0b282500bae5b Mon Sep 17 00:00:00 2001 From: Gray Liang Date: Sat, 2 Mar 2024 12:41:57 +0800 Subject: [PATCH 1/3] bpf: Parse skb->data only once Previously we parsed skb->data twice: once at wan_egress/lan_ingress and once at dae0peer_ingress. This is because of a limitation of bpf_sk_assign: we have to call it within the netns where the socket is. This patch manages to parse skb->data only once at wan_egress/lan_ingress, where we leave a value in skb->cb[1] to tell dae0peer_ingress: 1. if skb->cb[1] == TCP, then it's a new TCP conn, assign skb to TCP listener; 2. if skb->cb[1] == UDP, then it's a UDP packet, assign skb to UDP listener; 3. else it's an established TCP conn, stack can take care of socket lookup; --- control/kern/tproxy.c | 94 +++++++++--------------------------------- control/netns_utils.go | 8 ++++ 2 files changed, 28 insertions(+), 74 deletions(-) diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c index 488bc27644..2b7861784e 100644 --- a/control/kern/tproxy.c +++ b/control/kern/tproxy.c @@ -835,24 +835,6 @@ static __always_inline __u32 get_link_h_len(__u32 ifindex, return 0; } -static __always_inline int -lookup_and_assign_tcp_established(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, __u32 len) -{ - int ret = -1; - struct bpf_sock *sk = bpf_skc_lookup_tcp(skb, tuple, len, BPF_F_CURRENT_NETNS, 0); - if (!sk) - return -1; - - if (sk->state == BPF_TCP_LISTEN || sk->state == BPF_TCP_TIME_WAIT) { - goto release; - } - - ret = bpf_sk_assign(skb, sk, 0); -release: - bpf_sk_release(sk); - return ret; -} - static __always_inline int assign_listener(struct __sk_buff *skb, __u8 l4proto) { @@ -873,7 +855,8 @@ assign_listener(struct __sk_buff *skb, __u8 l4proto) static __always_inline int redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len, struct tuples *tuples, __u8 l4proto, - struct ethhdr *ethh, __u8 from_wan) { + struct ethhdr *ethh, __u8 from_wan, + struct tcphdr *tcph) { /* Redirect from L3 dev to L2 dev, e.g.
wg0 -> veth */ if (!link_h_len) { @@ -903,6 +886,10 @@ redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len, bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry, BPF_ANY); skb->cb[0] = TPROXY_MARK; + skb->cb[1] = 0; + if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP) { + skb->cb[1] = l4proto; + } return bpf_redirect(PARAM.dae0_ifindex, 0); } @@ -1071,7 +1058,7 @@ int tproxy_lan_ingress(struct __sk_buff *skb) { // Assign to control plane. control_plane: - return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 0); + return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 0, &tcph); direct: return TC_ACT_OK; @@ -1369,72 +1356,31 @@ int tproxy_wan_egress(struct __sk_buff *skb) { } - return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 1); + return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 1, &tcph); } SEC("tc/dae0peer_ingress") int tproxy_dae0peer_ingress(struct __sk_buff *skb) { - struct ethhdr ethh; - struct iphdr iph; - struct ipv6hdr ipv6h; - struct icmp6hdr icmp6h; - struct tcphdr tcph; - struct udphdr udph; - __u8 ihl; - __u8 l4proto; - __u32 link_h_len = 14; - + /* Only packets redirected from wan_egress or lan_ingress have this cb mark. */ if (skb->cb[0] != TPROXY_MARK) { return TC_ACT_SHOT; } - int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h, - &tcph, &udph, &ihl, &l4proto); - if (ret) { - return TC_ACT_OK; - } - if (l4proto == IPPROTO_ICMPV6) { - return TC_ACT_OK; - } - - struct tuples tuples; - get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto); - + /* ip rule add fwmark 0x8000000/0x8000000 table 2023 + * ip route add local default dev lo table 2023 + */ skb->mark = TPROXY_MARK; bpf_skb_change_type(skb, PACKET_HOST); - /* First look for established socket. - * This is done for TCP only, otherwise bpf_sk_lookup_udp would find - * previously created transparent socket for UDP, which is not what we want.
- * */ - if (l4proto == IPPROTO_TCP) { - __u32 tuple_size; - struct bpf_sock_tuple tuple = {}; - - if (skb->protocol == bpf_htons(ETH_P_IP)) { - tuple.ipv4.saddr = tuples.five.sip.u6_addr32[3]; - tuple.ipv4.daddr = tuples.five.dip.u6_addr32[3]; - tuple.ipv4.sport = tuples.five.sport; - tuple.ipv4.dport = tuples.five.dport; - tuple_size = sizeof(tuple.ipv4); - } else { - __builtin_memcpy(tuple.ipv6.saddr, &tuples.five.sip, IPV6_BYTE_LENGTH); - __builtin_memcpy(tuple.ipv6.daddr, &tuples.five.dip, IPV6_BYTE_LENGTH); - tuple.ipv6.sport = tuples.five.sport; - tuple.ipv6.dport = tuples.five.dport; - tuple_size = sizeof(tuple.ipv6); - } - if (lookup_and_assign_tcp_established(skb, &tuple, tuple_size) == 0) { - return TC_ACT_OK; - } + /* l4proto is stored in skb->cb[1] only for UDP and new TCP. As for + * established TCP, kernel can take care of socket lookup, so just + * return them to stack without calling bpf_sk_assign. + */ + __u8 l4proto = skb->cb[1]; + if (l4proto != 0) { + assign_listener(skb, l4proto); } - - /* Then look for tproxy listening socket */ - if (assign_listener(skb, l4proto) == 0) { - return TC_ACT_OK; - } - - return TC_ACT_SHOT; + return TC_ACT_OK; } SEC("tc/dae0_ingress") diff --git a/control/netns_utils.go b/control/netns_utils.go index a32c983af2..5fbaf64d96 100644 --- a/control/netns_utils.go +++ b/control/netns_utils.go @@ -290,6 +290,14 @@ func (ns *DaeNetns) setupSysctl() (err error) { } // sysctl net.ipv6.conf.all.forwarding=1 SetForwarding("all", "1") + + // *_early_demux is not mandatory, but it's recommended to enable it for better performance + if err = netns.Set(ns.daeNs); err != nil { + return fmt.Errorf("failed to switch to daens: %v", err) + } + defer netns.Set(ns.hostNs) + sysctl.Set("net.ipv4.tcp_early_demux", "1", false) + sysctl.Set("net.ipv4.ip_early_demux", "1", false) return } From 5badabfc8a21d5f2accc49329e8e8da58d415049 Mon Sep 17 00:00:00 2001 From: Gray Liang Date: Thu, 7 Mar 2024 01:28:24 +0800 Subject: [PATCH 2/3] bpf: use 
bpf_redirect_peer for lan_ingress!!! --- control/kern/tproxy.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c index 2b7861784e..8af2e01ea5 100644 --- a/control/kern/tproxy.c +++ b/control/kern/tproxy.c @@ -852,8 +852,8 @@ assign_listener(struct __sk_buff *skb, __u8 l4proto) return ret; } -static __always_inline int -redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len, +static __always_inline void +prep_redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len, struct tuples *tuples, __u8 l4proto, struct ethhdr *ethh, __u8 from_wan, struct tcphdr *tcph) { @@ -890,7 +890,7 @@ redirect_to_control_plane(struct __sk_buff *skb, __u32 link_h_len, if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP) { skb->cb[1] = l4proto; } - return bpf_redirect(PARAM.dae0_ifindex, 0); + return; } SEC("tc/ingress") @@ -1058,7 +1058,8 @@ int tproxy_lan_ingress(struct __sk_buff *skb) { // Assign to control plane. 
control_plane: - return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 0, &tcph); + prep_redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 0, &tcph); + return bpf_redirect_peer(PARAM.dae0_ifindex, 0); direct: return TC_ACT_OK; @@ -1356,7 +1357,8 @@ int tproxy_wan_egress(struct __sk_buff *skb) { } - return redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 1, &tcph); + prep_redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh, 1, &tcph); + return bpf_redirect(PARAM.dae0_ifindex, 0); } SEC("tc/dae0peer_ingress") From 8cc3e8a86549d9bd1b53913bf01a15266bcc8006 Mon Sep 17 00:00:00 2001 From: Gray Liang Date: Thu, 7 Mar 2024 01:44:50 +0800 Subject: [PATCH 3/3] ci: update lvh-images Because apt.k8s.io no longer exists: https://kubernetes.io/blog/2023/08/31/legacy-package-repository-deprecation/ --- .github/workflows/kernel-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/kernel-test.yml b/.github/workflows/kernel-test.yml index 8cb171416c..d5194ad420 100644 --- a/.github/workflows/kernel-test.yml +++ b/.github/workflows/kernel-test.yml @@ -43,7 +43,7 @@ jobs: strategy: fail-fast: false matrix: - kernel: [ '5.10-20240201.165956', '5.15-20240201.165956', '6.1-20240201.165956', 'bpf-next-20240204.012837' ] + kernel: [ '5.10-20240305.092417', '5.15-20240305.092417', '6.1-20240305.092417', '6.6-20240305.092417' ] timeout-minutes: 10 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11