diff --git a/Documentation/networking/tproxy.rst b/Documentation/networking/tproxy.rst index 00dc3a1a66b48..7f7c1ff6f159e 100644 --- a/Documentation/networking/tproxy.rst +++ b/Documentation/networking/tproxy.rst @@ -17,7 +17,7 @@ The idea is that you identify packets with destination address matching a local socket on your box, set the packet mark to a certain value:: # iptables -t mangle -N DIVERT - # iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT + # iptables -t mangle -A PREROUTING -p tcp -m socket --transparent -j DIVERT # iptables -t mangle -A DIVERT -j MARK --set-mark 1 # iptables -t mangle -A DIVERT -j ACCEPT diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2683b2b77612a..2b8aac2c70ada 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -376,15 +376,11 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, struct nf_conn; enum nf_nat_manip_type; struct nlattr; -enum ip_conntrack_dir; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); - unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, - enum nf_nat_manip_type mtype, - enum ip_conntrack_dir dir); void (*remove_nat_bysrc)(struct nf_conn *ct); }; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 04504b2b51df5..87fd945a0d27a 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -239,9 +239,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { - struct sk_buff *nskb; - struct iphdr *niph; const struct tcphdr *oth; + struct sk_buff *nskb; struct tcphdr _oth; oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); @@ -266,14 +265,12 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, nskb->mark = IP4_REPLY_MARK(net, oldskb->mark); skb_reserve(nskb, LL_MAX_HEADER); - niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - ip4_dst_hoplimit(skb_dst(nskb))); + nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, + ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC)) goto free_nskb; - niph = ip_hdr(nskb); - /* "Never happens" */ if (nskb->len > dst_mtu(skb_dst(nskb))) goto free_nskb; @@ -290,6 +287,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, */ if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct iphdr *niph = ip_hdr(nskb); struct net_device *br_indev; br_indev = nf_bridge_get_physindev(oldskb, net); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index b9457473c176d..7db0437140bf2 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -273,7 +273,6 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, const struct tcphdr *otcph; unsigned int otcplen, hh_len; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); - struct ipv6hdr *ip6h; struct dst_entry *dst = NULL; struct flowi6 fl6; @@ -329,8 +328,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, nskb->mark = fl6.flowi6_mark; skb_reserve(nskb, hh_len + dst->header_len); - ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, - ip6_dst_hoplimit(dst)); + nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); @@ -345,6 +343,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, */ if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct ipv6hdr *ip6h = ipv6_hdr(nskb); struct net_device *br_indev; br_indev = nf_bridge_get_physindev(oldskb, net); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d3cb53b008f5a..9db3e2b0b1c34 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -988,6 +988,56 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) tstamp->start = ktime_get_real_ns(); } +/** + * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow + * @ct1: conntrack in hash table to check against + * @ct2: merge candidate + * + * returns true if ct1 and ct2 happen to refer to the same flow, but + * in opposing directions, i.e. + * ct1: a:b -> c:d + * ct2: c:d -> a:b + * for both directions. If so, @ct2 should not have been created + * as the skb should have been picked up as ESTABLISHED flow. + * But ct1 was not yet committed to hash table before skb that created + * ct2 had arrived. + * + * Note we don't compare netns because ct entries in different net + * namespace cannot clash to begin with. + * + * @return: true if ct1 and ct2 are identical when swapping origin/reply. + */ +static bool +nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2) +{ + u16 id1, id2; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct2->tuplehash[IP_CT_DIR_REPLY].tuple)) + return false; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, + &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY); + if (id1 != id2) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL); + + return id1 == id2; +} + +static int nf_ct_can_merge(const struct nf_conn *ct, + const struct nf_conn *loser_ct) +{ + return nf_ct_match(ct, loser_ct) || + nf_ct_match_reverse(ct, loser_ct); +} + /* caller must hold locks to prevent concurrent changes */ static int __nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) @@ -999,11 +1049,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, loser_ct = nf_ct_get(skb, &ctinfo); - if (nf_ct_is_dying(ct)) - return NF_DROP; - - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { + if (nf_ct_can_merge(ct, loser_ct)) { struct net *net = nf_ct_net(ct); nf_conntrack_get(&ct->ct_general); @@ -2151,80 +2197,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) nf_conntrack_get(skb_nfct(nskb)); } -static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - const struct nf_nat_hook *nat_hook; - struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - unsigned int status; - int dataoff; - u16 l3num; - u8 l4num; - - l3num = nf_ct_l3num(ct); - - dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); - if (dataoff <= 0) - return NF_DROP; - - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple)) - return NF_DROP; - - if (ct->status & IPS_SRC_NAT) { - memcpy(tuple.src.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, - sizeof(tuple.src.u3.all)); - tuple.src.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; - } - - if (ct->status & IPS_DST_NAT) { - memcpy(tuple.dst.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, - sizeof(tuple.dst.u3.all)); - tuple.dst.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; - } - - h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); - if (!h) - return NF_ACCEPT; - - /* Store status bits of the conntrack that is clashing to re-do NAT - * mangling according to what it has been done already to this packet. - */ - status = ct->status; - - nf_ct_put(ct); - ct = nf_ct_tuplehash_to_ctrack(h); - nf_ct_set(skb, ct, ctinfo); - - nat_hook = rcu_dereference(nf_nat_hook); - if (!nat_hook) - return NF_ACCEPT; - - if (status & IPS_SRC_NAT) { - unsigned int verdict = nat_hook->manip_pkt(skb, ct, - NF_NAT_MANIP_SRC, - IP_CT_DIR_ORIGINAL); - if (verdict != NF_ACCEPT) - return verdict; - } - - if (status & IPS_DST_NAT) { - unsigned int verdict = nat_hook->manip_pkt(skb, ct, - NF_NAT_MANIP_DST, - IP_CT_DIR_ORIGINAL); - if (verdict != NF_ACCEPT) - return verdict; - } - - return NF_ACCEPT; -} - /* This packet is coming from userspace via nf_queue, complete the packet * processing after the helper invocation in nf_confirm(). */ @@ -2288,17 +2260,6 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) if (!ct) return NF_ACCEPT; - if (!nf_ct_is_confirmed(ct)) { - int ret = __nf_conntrack_update(net, skb, ct, ctinfo); - - if (ret != NF_ACCEPT) - return ret; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return NF_ACCEPT; - } - return nf_confirm_cthelper(skb, ct, ctinfo); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 123e2e933e9b2..6a1239433830f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -382,7 +382,7 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) #define ctnetlink_dump_secctx(a, b) (0) #endif -#ifdef CONFIG_NF_CONNTRACK_LABELS +#ifdef CONFIG_NF_CONNTRACK_EVENTS static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); @@ -391,6 +391,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct) return 0; return nla_total_size(sizeof(labels->bits)); } +#endif static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) @@ -411,10 +412,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) return 0; } -#else -#define ctnetlink_dump_labels(a, b) (0) -#define ctnetlink_label_size(a) (0) -#endif #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) @@ -652,7 +649,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) return len + len4; } -#endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { @@ -690,6 +686,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) return 0; #endif } +#endif #ifdef CONFIG_NF_CONNTRACK_EVENTS static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6d8da6dddf998..4085c436e3062 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -183,7 +183,35 @@ hash_by_src(const struct net *net, return reciprocal_scale(hash, nf_nat_htable_size); } -/* Is this tuple already taken? (not by us) */ +/** + * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry + * @tuple: proposed NAT binding + * @ignored_conntrack: our (unconfirmed) conntrack entry + * + * A conntrack entry can be inserted to the connection tracking table + * if there is no existing entry with an identical tuple in either direction. + * + * Example: + * INITIATOR -> NAT/PAT -> RESPONDER + * + * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). + * Then, later, NAT/PAT itself also connects to RESPONDER. + * + * This will not work if the SNAT done earlier has same IP:PORT source pair. + * + * Conntrack table has: + * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * and new locally originating connection wants: + * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * ... which would mean incoming packets cannot be distinguished between + * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). + * + * @return: true if the proposed NAT mapping collides with an existing entry. + */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) @@ -200,6 +228,94 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } +static bool nf_nat_allow_clash(const struct nf_conn *ct) +{ + return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; +} + +/** + * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry + * @tuple: proposed NAT binding + * @ignored_ct: our (unconfirmed) conntrack entry + * + * Same as nf_nat_used_tuple, but also check for rare clash in reverse + * direction. Should be called only when @tuple has not been altered, i.e. + * @ignored_conntrack will not be subject to NAT. + * + * @return: true if the proposed NAT mapping collides with existing entry. + */ +static noinline bool +nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_ct) +{ + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; + const struct nf_conntrack_tuple_hash *thash; + const struct nf_conntrack_zone *zone; + struct nf_conn *ct; + bool taken = true; + struct net *net; + + if (!nf_nat_used_tuple(tuple, ignored_ct)) + return false; + + if (!nf_nat_allow_clash(ignored_ct)) + return true; + + /* Initial choice clashes with existing conntrack. + * Check for (rare) reverse collision. + * + * This can happen when new packets are received in both directions + * at the exact same time on different CPUs. + * + * Without SMP, first packet creates new conntrack entry and second + * packet is resolved as established reply packet. + * + * With parallel processing, both packets could be picked up as + * new and both get their own ct entry allocated. + * + * If ignored_conntrack and colliding ct are not subject to NAT then + * pretend the tuple is available and let later clash resolution + * handle this at insertion time. + * + * Without it, the 'reply' packet has its source port rewritten + * by nat engine. + */ + if (READ_ONCE(ignored_ct->status) & uses_nat) + return true; + + net = nf_ct_net(ignored_ct); + zone = nf_ct_zone(ignored_ct); + + thash = nf_conntrack_find_get(net, zone, tuple); + if (unlikely(!thash)) /* clashing entry went away */ + return false; + + ct = nf_ct_tuplehash_to_ctrack(thash); + + /* NB: IP_CT_DIR_ORIGINAL should be impossible because + * nf_nat_used_tuple() handles origin collisions. + * + * Handle remote chance other CPU confirmed its ct right after. + */ + if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) + goto out; + + /* clashing connection subject to NAT? Retry with new tuple. */ + if (READ_ONCE(ct->status) & uses_nat) + goto out; + + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) && + nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { + taken = false; + goto out; + } +out: + nf_ct_put(ct); + return taken; +} + static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | @@ -611,7 +727,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { - if (!nf_nat_used_tuple(orig_tuple, ct)) { + if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } @@ -1208,7 +1324,6 @@ static const struct nf_nat_hook nat_hook = { #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif - .manip_pkt = nf_nat_manip_pkt, .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 57259b5f3ef58..a24fe62650a75 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1849,7 +1849,7 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family, if (!hook_list) hook_list = &basechain->hook_list; - list_for_each_entry(hook, hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list) { if (!first) first = hook; @@ -6684,7 +6684,7 @@ static int nft_setelem_catchall_insert(const struct net *net, } } - catchall = kmalloc(sizeof(*catchall), GFP_KERNEL); + catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT); if (!catchall) return -ENOMEM; @@ -9207,7 +9207,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); - kfree(hook); + kfree_rcu(hook, rcu); } kfree(flowtable->name); module_put(flowtable->data.type->owner); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 52cdfee17f73f..7ca4f0d21fe2a 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -535,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_match *m = expr->ops->data; int ret; - priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL); + priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT); if (!priv->info) return -ENOMEM; @@ -808,7 +808,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; @@ -898,7 +898,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 5defe6e4fd982..e355881379957 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -163,7 +163,7 @@ static int nft_log_init(const struct nft_ctx *ctx, nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { - priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); + priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT); if (priv->prefix == NULL) return -ENOMEM; nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8c8eb14d647b0..05cd1e6e6a2f6 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -952,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_SECMARK_CTX] == NULL) return -EINVAL; - priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT); if (!priv->ctx) return -ENOMEM; diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 7d29db7c2ac0f..bd058babfc820 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -66,7 +66,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL); + priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT); if (!priv->counter) return -ENOMEM; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index eb4c4a4ac7ace..7be342b495f5f 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -663,7 +663,7 @@ static int pipapo_realloc_mt(struct nft_pipapo_field *f, check_add_overflow(rules, extra, &rules_alloc)) return -EOVERFLOW; - new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL); + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); if (!new_mt) return -ENOMEM; @@ -936,7 +936,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) return; } - new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); + new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT); if (!new_lt) return; @@ -1212,7 +1212,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, scratch = kzalloc_node(struct_size(scratch, map, bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL, cpu_to_node(i)); + GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous * allocations: this means that some scratch maps have @@ -1427,7 +1427,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) struct nft_pipapo_match *new; int i; - new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); + new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); if (!new) return NULL; @@ -1457,7 +1457,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * src->bsize * sizeof(*dst->lt) + NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; @@ -1470,7 +1470,8 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (src->rules > 0) { dst->mt = kvmalloc_array(src->rules_alloc, - sizeof(*src->mt), GFP_KERNEL); + sizeof(*src->mt), + GFP_KERNEL_ACCOUNT); if (!dst->mt) goto out_mt; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 60a76e6e348e7..5c6ed68cc6e05 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -509,13 +509,14 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return err; } - md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL); + md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, + GFP_KERNEL_ACCOUNT); if (!md) return -ENOMEM; memcpy(&md->u.tun_info, &info, sizeof(info)); #ifdef CONFIG_DST_CACHE - err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL); + err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT); if (err < 0) { metadata_dst_free(md); return err; diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index d13fb5ea3e894..e6c9e777feadc 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -13,6 +13,7 @@ TEST_PROGS += conntrack_ipip_mtu.sh TEST_PROGS += conntrack_tcp_unreplied.sh TEST_PROGS += conntrack_sctp_collision.sh TEST_PROGS += conntrack_vrf.sh +TEST_PROGS += conntrack_reverse_clash.sh TEST_PROGS += ipvs.sh TEST_PROGS += nf_conntrack_packetdrill.sh TEST_PROGS += nf_nat_edemux.sh @@ -26,6 +27,8 @@ TEST_PROGS += nft_nat.sh TEST_PROGS += nft_nat_zones.sh TEST_PROGS += nft_queue.sh TEST_PROGS += nft_synproxy.sh +TEST_PROGS += nft_tproxy_tcp.sh +TEST_PROGS += nft_tproxy_udp.sh TEST_PROGS += nft_zones_many.sh TEST_PROGS += rpath.sh TEST_PROGS += xt_string.sh @@ -36,6 +39,7 @@ TEST_GEN_PROGS = conntrack_dump_flush TEST_GEN_FILES = audit_logread TEST_GEN_FILES += connect_close nf_queue +TEST_GEN_FILES += conntrack_reverse_clash TEST_GEN_FILES += sctp_collision include ../../lib.mk diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index b2dd4db452150..c5fe7b34eaf19 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -81,6 +81,7 @@ CONFIG_NFT_QUEUE=m CONFIG_NFT_QUOTA=m CONFIG_NFT_REDIR=m CONFIG_NFT_SYNPROXY=m +CONFIG_NFT_TPROXY=m CONFIG_VETH=m CONFIG_VLAN_8021Q=m CONFIG_XFRM_USER=m diff --git a/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c new file mode 100644 index 0000000000000..507930cee8cb6 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Needs something like: + * + * iptables -t nat -A POSTROUTING -o nomatch -j MASQUERADE + * + * so NAT engine attaches a NAT null-binding to each connection. + * + * With unmodified kernels, child or parent will exit with + * "Port number changed" error, even though no port translation + * was requested. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LEN 512 +#define PORT 56789 +#define TEST_TIME 5 + +static void die(const char *e) +{ + perror(e); + exit(111); +} + +static void die_port(uint16_t got, uint16_t want) +{ + fprintf(stderr, "Port number changed, wanted %d got %d\n", want, ntohs(got)); + exit(1); +} + +static int udp_socket(void) +{ + static const struct timeval tv = { + .tv_sec = 1, + }; + int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + + if (fd < 0) + die("socket"); + + setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + return fd; +} + +int main(int argc, char *argv[]) +{ + struct sockaddr_in sa1 = { + .sin_family = AF_INET, + }; + struct sockaddr_in sa2 = { + .sin_family = AF_INET, + }; + int s1, s2, status; + time_t end, now; + socklen_t plen; + char buf[LEN]; + bool child; + + sa1.sin_port = htons(PORT); + sa2.sin_port = htons(PORT + 1); + + s1 = udp_socket(); + s2 = udp_socket(); + + inet_pton(AF_INET, "127.0.0.11", &sa1.sin_addr); + inet_pton(AF_INET, "127.0.0.12", &sa2.sin_addr); + + if (bind(s1, (struct sockaddr *)&sa1, sizeof(sa1)) < 0) + die("bind 1"); + if (bind(s2, (struct sockaddr *)&sa2, sizeof(sa2)) < 0) + die("bind 2"); + + child = fork() == 0; + + now = time(NULL); + end = now + TEST_TIME; + + while (now < end) { + struct sockaddr_in peer; + socklen_t plen = sizeof(peer); + + now = time(NULL); + + if (child) { + if (sendto(s1, buf, LEN, 0, (struct sockaddr *)&sa2, sizeof(sa2)) != LEN) + continue; + + if (recvfrom(s2, buf, LEN, 0, (struct sockaddr *)&peer, &plen) < 0) + die("child recvfrom"); + + if (peer.sin_port != htons(PORT)) + die_port(peer.sin_port, PORT); + } else { + if (sendto(s2, buf, LEN, 0, (struct sockaddr *)&sa1, sizeof(sa1)) != LEN) + continue; + + if (recvfrom(s1, buf, LEN, 0, (struct sockaddr *)&peer, &plen) < 0) + die("parent recvfrom"); + + if (peer.sin_port != htons((PORT + 1))) + die_port(peer.sin_port, PORT + 1); + } + } + + if (child) + return 0; + + wait(&status); + + if (WIFEXITED(status)) + return WEXITSTATUS(status); + + return 1; +} diff --git a/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh new file mode 100755 index 0000000000000..a24c896347a88 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +cleanup() +{ + cleanup_all_ns +} + +checktool "nft --version" "run test without nft" +checktool "conntrack --version" "run test without conntrack" + +trap cleanup EXIT + +setup_ns ns0 + +# make loopback connections get nat null bindings assigned +ip netns exec "$ns0" nft -f - </dev/null + now=$(date +%s) + done +} + +do_flush & + +if ip netns exec "$ns0" ./conntrack_reverse_clash; then + echo "PASS: No SNAT performed for null bindings" +else + echo "ERROR: SNAT performed without any matching snat rule" + exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh index 4ceee9fb39495..d3edb16cd4b3f 100755 --- a/tools/testing/selftests/net/netfilter/ipvs.sh +++ b/tools/testing/selftests/net/netfilter/ipvs.sh @@ -97,7 +97,7 @@ cleanup() { } server_listen() { - ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" & + ip netns exec "$ns2" timeout 5 socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" & server_pid=$! sleep 0.2 } diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index d66e3c4dfec62..a9d109fcc15c2 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -31,7 +31,7 @@ modprobe -q sctp trap cleanup EXIT -setup_ns ns1 ns2 nsrouter +setup_ns ns1 ns2 ns3 nsrouter TMPFILE0=$(mktemp) TMPFILE1=$(mktemp) @@ -48,6 +48,7 @@ if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > exit $ksft_skip fi ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" +ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3" ip -net "$nsrouter" link set veth0 up ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 @@ -57,8 +58,13 @@ ip -net "$nsrouter" link set veth1 up ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad +ip -net "$nsrouter" link set veth2 up +ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2 +ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad + ip -net "$ns1" link set eth0 up ip -net "$ns2" link set eth0 up +ip -net "$ns3" link set eth0 up ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad @@ -70,6 +76,11 @@ ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad ip -net "$ns2" route add default via 10.0.2.1 ip -net "$ns2" route add default via dead:2::1 +ip -net "$ns3" addr add 10.0.3.99/24 dev eth0 +ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad +ip -net "$ns3" route add default via 10.0.3.1 +ip -net "$ns3" route add default via dead:3::1 + load_ruleset() { local name=$1 local prio=$2 @@ -473,6 +484,83 @@ EOF check_output_files "$TMPINPUT" "$TMPFILE1" "sctp output" } +udp_listener_ready() +{ + ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345 +} + +output_files_written() +{ + test -s "$1" && test -s "$2" +} + +test_udp_ct_race() +{ + ip netns exec "$nsrouter" nft -f /dev/stdin < "$TMPFILE1" + :> "$TMPFILE2" + + timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE1",trunc & + local rpid1=$! + + timeout 10 ip netns exec "$ns3" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE2",trunc & + local rpid2=$! + + ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 & + local nfqpid=$! + + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12 + + # Send two packets, one should end up in ns1, other in ns2. + # This is because nfqueue will delay packet for long enough so that + # second packet will not find existing conntrack entry. + echo "Packet 1" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221 + echo "Packet 2" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221 + + busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2" + + kill "$nfqpid" + + if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then + echo "FAIL: Expected One udp conntrack entry" + ip netns exec "$nsrouter" conntrack -L -p udp --dport 12345 + ret=1 + fi + + if ! ip netns exec "$nsrouter" nft delete table inet udpq; then + echo "FAIL: Could not delete udpq table" + ret=1 + return + fi + + NUMLINES1=$(wc -l < "$TMPFILE1") + NUMLINES2=$(wc -l < "$TMPFILE2") + + if [ "$NUMLINES1" -ne 1 ] || [ "$NUMLINES2" -ne 1 ]; then + ret=1 + echo "FAIL: uneven udp packet distribution: $NUMLINES1 $NUMLINES2" + echo -n "$TMPFILE1: ";cat "$TMPFILE1" + echo -n "$TMPFILE2: ";cat "$TMPFILE2" + return + fi + + echo "PASS: both udp receivers got one packet each" +} + test_queue_removal() { read tainted_then < /proc/sys/kernel/tainted @@ -512,6 +600,7 @@ EOF ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null load_ruleset "filter" 0 @@ -549,6 +638,7 @@ test_tcp_localhost_connectclose test_tcp_localhost_requeue test_sctp_forward test_sctp_output +test_udp_ct_race # should be last, adds vrf device in ns1 and changes routes test_icmp_vrf diff --git a/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh b/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh new file mode 100755 index 0000000000000..e208fb03eeb76 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh @@ -0,0 +1,358 @@ +#!/bin/bash +# +# This tests tproxy on the following scenario: +# +# +------------+ +# +-------+ | nsrouter | +-------+ +# |ns1 |.99 .1| |.1 .99| ns2| +# | eth0|---------------|veth0 veth1|------------------|eth0 | +# | | 10.0.1.0/24 | | 10.0.2.0/24 | | +# +-------+ dead:1::/64 | veth2 | dead:2::/64 +-------+ +# +------------+ +# |.1 +# | +# | +# | +-------+ +# | .99| ns3| +# +------------------------|eth0 | +# 10.0.3.0/24 | | +# dead:3::/64 +-------+ +# +# The tproxy implementation acts as an echo server so the client +# must receive the same message it sent if it has been proxied. +# If is not proxied the servers return PONG_NS# with the number +# of the namespace the server is running. +# +# shellcheck disable=SC2162,SC2317 + +source lib.sh +ret=0 +timeout=5 + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + ip netns pids "$ns3" | xargs kill 2>/dev/null + ip netns pids "$nsrouter" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +checktool "nft --version" "test without nft tool" +checktool "socat -h" "run test without socat" + +trap cleanup EXIT +setup_ns ns1 ns2 ns3 nsrouter + +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" +ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$nsrouter" link set veth2 up +ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2 +ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up +ip -net "$ns3" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +ip -net "$ns3" addr add 10.0.3.99/24 dev eth0 +ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad +ip -net "$ns3" route add default via 10.0.3.1 +ip -net "$ns3" route add default via dead:3::1 + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null + +test_ping() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then + return 2 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then + return 2 + fi + + return 0 +} + +test_ping_router() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then + return 3 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then + return 4 + fi + + return 0 +} + + +listener_ready() +{ + local ns="$1" + local port="$2" + local proto="$3" + ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port" +} + +test_tproxy() +{ + local traffic_origin="$1" + local ip_proto="$2" + local expect_ns1_ns2="$3" + local expect_ns1_ns3="$4" + local expect_nsrouter_ns2="$5" + local expect_nsrouter_ns3="$6" + + # derived variables + local testname="test_${ip_proto}_tcp_${traffic_origin}" + local socat_ipproto + local ns1_ip + local ns2_ip + local ns3_ip + local ns2_target + local ns3_target + local nftables_subject + local ip_command + + # socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1) + case $ip_proto in + "ip") + socat_ipproto="-4" + ns1_ip=10.0.1.99 + ns2_ip=10.0.2.99 + ns3_ip=10.0.3.99 + ns2_target="tcp:$ns2_ip:8080" + ns3_target="tcp:$ns3_ip:8080" + nftables_subject="ip daddr $ns2_ip tcp dport 8080" + ip_command="ip" + ;; + "ip6") + socat_ipproto="-6" + ns1_ip=dead:1::99 + ns2_ip=dead:2::99 + ns3_ip=dead:3::99 + ns2_target="tcp:[$ns2_ip]:8080" + ns3_target="tcp:[$ns3_ip]:8080" + nftables_subject="ip6 daddr $ns2_ip tcp dport 8080" + ip_command="ip -6" + ;; + *) + echo "FAIL: unsupported protocol" + exit 255 + ;; + esac + + case $traffic_origin in + # to capture the local originated traffic we need to mark the outgoing + # traffic so the policy based routing rule redirects it and can be processed + # in the prerouting chain. + "local") + nftables_rules=" +flush ruleset +table inet filter { + chain divert { + type filter hook prerouting priority 0; policy accept; + $nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept + } + chain output { + type route hook output priority 0; policy accept; + $nftables_subject meta mark set 1 accept + } +}" + ;; + "forward") + nftables_rules=" +flush ruleset +table inet filter { + chain divert { + type filter hook prerouting priority 0; policy accept; + $nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept + } +}" + ;; + *) + echo "FAIL: unsupported parameter for traffic origin" + exit 255 + ;; + esac + + # shellcheck disable=SC2046 # Intended splitting of ip_command + ip netns exec "$nsrouter" $ip_command rule add fwmark 1 table 100 + ip netns exec "$nsrouter" $ip_command route add local "${ns2_ip}" dev lo table 100 + echo "$nftables_rules" | ip netns exec "$nsrouter" nft -f /dev/stdin + + timeout "$timeout" ip netns exec "$nsrouter" socat "$socat_ipproto" tcp-listen:12345,fork,ip-transparent SYSTEM:"cat" 2>/dev/null & + local tproxy_pid=$! + + timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null & + local server2_pid=$! + + timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null & + local server3_pid=$! + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-t" + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-t" + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-t" + + local result + # request from ns1 to ns2 (forwarded traffic) + result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns2_target") + if [ "$result" == "$expect_ns1_ns2" ] ;then + echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2" + else + echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended" + ret=1 + fi + + # request from ns1 to ns3(forwarded traffic) + result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns3_target") + if [ "$result" = "$expect_ns1_ns3" ] ;then + echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3" + else + echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended" + ret=1 + fi + + # request from nsrouter to ns2 (localy originated traffic) + result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns2_target") + if [ "$result" == "$expect_nsrouter_ns2" ] ;then + echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2" + else + echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended" + ret=1 + fi + + # request from nsrouter to ns3 (localy originated traffic) + result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns3_target") + if [ "$result" = "$expect_nsrouter_ns3" ] ;then + echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3" + else + echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\" as intended" + ret=1 + fi + + # cleanup + kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null + # shellcheck disable=SC2046 # Intended splitting of ip_command + ip netns exec "$nsrouter" $ip_command rule del fwmark 1 table 100 + ip netns exec "$nsrouter" $ip_command route flush table 100 +} + + +test_ipv4_tcp_forward() +{ + local traffic_origin="forward" + local ip_proto="ip" + local expect_ns1_ns2="I_M_PROXIED" + local expect_ns1_ns3="PONG_NS3" + local expect_nsrouter_ns2="PONG_NS2" + local expect_nsrouter_ns3="PONG_NS3" + + test_tproxy "$traffic_origin" \ + "$ip_proto" \ + "$expect_ns1_ns2" \ + "$expect_ns1_ns3" \ + "$expect_nsrouter_ns2" \ + "$expect_nsrouter_ns3" +} + +test_ipv4_tcp_local() +{ + local traffic_origin="local" + local ip_proto="ip" + local expect_ns1_ns2="I_M_PROXIED" + local expect_ns1_ns3="PONG_NS3" + local expect_nsrouter_ns2="I_M_PROXIED" + local expect_nsrouter_ns3="PONG_NS3" + + test_tproxy "$traffic_origin" \ + "$ip_proto" \ + "$expect_ns1_ns2" \ + "$expect_ns1_ns3" \ + "$expect_nsrouter_ns2" \ + "$expect_nsrouter_ns3" +} + +test_ipv6_tcp_forward() +{ + local traffic_origin="forward" + local ip_proto="ip6" + local expect_ns1_ns2="I_M_PROXIED" + local expect_ns1_ns3="PONG_NS3" + local expect_nsrouter_ns2="PONG_NS2" + local expect_nsrouter_ns3="PONG_NS3" + + test_tproxy "$traffic_origin" \ + "$ip_proto" \ + "$expect_ns1_ns2" \ + "$expect_ns1_ns3" \ + "$expect_nsrouter_ns2" \ + "$expect_nsrouter_ns3" +} + +test_ipv6_tcp_local() +{ + local traffic_origin="local" + local ip_proto="ip6" + local expect_ns1_ns2="I_M_PROXIED" + local expect_ns1_ns3="PONG_NS3" + local expect_nsrouter_ns2="I_M_PROXIED" + local expect_nsrouter_ns3="PONG_NS3" + + test_tproxy "$traffic_origin" \ + "$ip_proto" \ + "$expect_ns1_ns2" \ + "$expect_ns1_ns3" \ + "$expect_nsrouter_ns2" \ + "$expect_nsrouter_ns3" +} + +if test_ping; then + # queue bypass works (rules were skipped, no listener) + echo "PASS: ${ns1} can reach ${ns2}" +else + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 + exit $ret +fi + +test_ipv4_tcp_forward +test_ipv4_tcp_local +test_ipv6_tcp_forward +test_ipv6_tcp_local + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh b/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh new file mode 100755 index 0000000000000..d16de13fe5a75 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh @@ -0,0 +1,262 @@ +#!/bin/bash +# +# This tests tproxy on the following scenario: +# +# +------------+ +# +-------+ | nsrouter | +-------+ +# |ns1 |.99 .1| |.1 .99| ns2| +# | eth0|---------------|veth0 veth1|------------------|eth0 | +# | | 10.0.1.0/24 | | 10.0.2.0/24 | | +# +-------+ dead:1::/64 | veth2 | dead:2::/64 +-------+ +# +------------+ +# |.1 +# | +# | +# | +-------+ +# | .99| ns3| +# +------------------------|eth0 | +# 10.0.3.0/24 | | +# dead:3::/64 +-------+ +# +# The tproxy implementation acts as an echo server so the client +# must receive the same message it sent if it has been proxied. +# If is not proxied the servers return PONG_NS# with the number +# of the namespace the server is running. +# shellcheck disable=SC2162,SC2317 + +source lib.sh +ret=0 +# UDP is slow +timeout=15 + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + ip netns pids "$ns3" | xargs kill 2>/dev/null + ip netns pids "$nsrouter" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +checktool "nft --version" "test without nft tool" +checktool "socat -h" "run test without socat" + +trap cleanup EXIT +setup_ns ns1 ns2 ns3 nsrouter + +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" +ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$nsrouter" link set veth2 up +ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2 +ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up +ip -net "$ns3" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +ip -net "$ns3" addr add 10.0.3.99/24 dev eth0 +ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad +ip -net "$ns3" route add default via 10.0.3.1 +ip -net "$ns3" route add default via dead:3::1 + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null + +test_ping() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then + return 2 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then + return 2 + fi + + return 0 +} + +test_ping_router() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then + return 3 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then + return 4 + fi + + return 0 +} + + +listener_ready() +{ + local ns="$1" + local port="$2" + local proto="$3" + ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port" +} + +test_tproxy_udp_forward() +{ + local ip_proto="$1" + + local expect_ns1_ns2="I_M_PROXIED" + local expect_ns1_ns3="PONG_NS3" + local expect_nsrouter_ns2="PONG_NS2" + local expect_nsrouter_ns3="PONG_NS3" + + # derived variables + local testname="test_${ip_proto}_udp_forward" + local socat_ipproto + local ns1_ip + local ns2_ip + local ns3_ip + local ns1_ip_port + local ns2_ip_port + local ns3_ip_port + local ip_command + + # socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1) + case $ip_proto in + "ip") + socat_ipproto="-4" + ns1_ip=10.0.1.99 + ns2_ip=10.0.2.99 + ns3_ip=10.0.3.99 + ns1_ip_port="$ns1_ip:18888" + ns2_ip_port="$ns2_ip:8080" + ns3_ip_port="$ns3_ip:8080" + ip_command="ip" + ;; + "ip6") + socat_ipproto="-6" + ns1_ip=dead:1::99 + ns2_ip=dead:2::99 + ns3_ip=dead:3::99 + ns1_ip_port="[$ns1_ip]:18888" + ns2_ip_port="[$ns2_ip]:8080" + ns3_ip_port="[$ns3_ip]:8080" + ip_command="ip -6" + ;; + *) + echo "FAIL: unsupported protocol" + exit 255 + ;; + esac + + # shellcheck disable=SC2046 # Intended splitting of ip_command + ip netns exec "$nsrouter" $ip_command rule add fwmark 1 table 100 + ip netns exec "$nsrouter" $ip_command route add local "$ns2_ip" dev lo table 100 + ip netns exec "$nsrouter" nft -f /dev/stdin </dev/null & + local tproxy_pid=$! + + timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null & + local server2_pid=$! + + timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null & + local server3_pid=$! + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-u" + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-u" + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-u" + + local result + # request from ns1 to ns2 (forwarded traffic) + result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port",sourceport=18888) + if [ "$result" == "$expect_ns1_ns2" ] ;then + echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2" + else + echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended" + ret=1 + fi + + # request from ns1 to ns3 (forwarded traffic) + result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port") + if [ "$result" = "$expect_ns1_ns3" ] ;then + echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3" + else + echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended" + ret=1 + fi + + # request from nsrouter to ns2 (localy originated traffic) + result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port") + if [ "$result" == "$expect_nsrouter_ns2" ] ;then + echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2" + else + echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended" + ret=1 + fi + + # request from nsrouter to ns3 (localy originated traffic) + result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port") + if [ "$result" = "$expect_nsrouter_ns3" ] ;then + echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3" + else + echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\" as intended" + ret=1 + fi + + # cleanup + kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null + # shellcheck disable=SC2046 # Intended splitting of ip_command + ip netns exec "$nsrouter" $ip_command rule del fwmark 1 table 100 + ip netns exec "$nsrouter" $ip_command route flush table 100 +} + + +if test_ping; then + # queue bypass works (rules were skipped, no listener) + echo "PASS: ${ns1} can reach ${ns2}" +else + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 + exit $ret +fi + +test_tproxy_udp_forward "ip" +test_tproxy_udp_forward "ip6" + +exit $ret