Skip to content

Commit

Permalink
fix: use masks and different firewall mark for KubeSpan
Browse files Browse the repository at this point in the history
Fixes #4836

Firewall mark is `uint32` attached to the packet in the Linux kernel
(it's not transmitted on the wire). This is a shared value for all
networking software, so multiple components might attempt to set and
match on the firewall mark.

Cilium and Calico CNIs are using firewall marks internally, but they
touch only some bits of the firewall mark.

Before this PR, KubeSpan did a direct match on the firewall mark and
set the whole `uint32` when marking packets, so it came into conflict
with any other networking component using firewall marks.

The other problem was that the firewall mark 0x51820 (0x51821) was too
"wide", touching random bits of the 32-bit value for no good reason.

So this change contains two fixes:

* make each firewall mark exactly a single bit (we use bits `0x20` and `0x40`
  now)
* match and mark packets with the mask (don't touch bits outside of the
  mask when setting the mark and ignore bits outside of the mask when
  matching on the mark).

This was tested successfully with both Cilium CNI (default config +
`ipam.mode=kubernetes`) and Calico CNI (default config).

One thing to note is that for KubeSpan and Talos it's important to make
sure that `podSubnets` in the machine config matches the CNI setting for
`podCIDRs`.

Signed-off-by: Andrey Smirnov <[email protected]>
  • Loading branch information
smira committed Jul 20, 2022
1 parent 80444a4 commit 644e803
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 55 deletions.
8 changes: 4 additions & 4 deletions internal/app/machined/pkg/controllers/kubespan/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ type WireguardClient interface {
}

// RulesManagerFactory allows mocking RulesManager.
type RulesManagerFactory func(targetTable, internalMark int) RulesManager
type RulesManagerFactory func(targetTable, internalMark, markMask int) RulesManager

// NfTablesManagerFactory allows mocking NfTablesManager.
type NfTablesManagerFactory func(externalMark, internalMark uint32) NfTablesManager
type NfTablesManagerFactory func(externalMark, internalMark, markMask uint32) NfTablesManager

// Inputs implements controller.Controller interface.
func (ctrl *ManagerController) Inputs() []controller.Input {
Expand Down Expand Up @@ -221,15 +221,15 @@ func (ctrl *ManagerController) Run(ctx context.Context, r controller.Runtime, lo
}

if rulesMgr == nil {
rulesMgr = ctrl.RulesManagerFactory(constants.KubeSpanDefaultRoutingTable, constants.KubeSpanDefaultForceFirewallMark)
rulesMgr = ctrl.RulesManagerFactory(constants.KubeSpanDefaultRoutingTable, constants.KubeSpanDefaultForceFirewallMark, constants.KubeSpanDefaultFirewallMask)

if err = rulesMgr.Install(); err != nil {
return fmt.Errorf("failed setting up routing rules: %w", err)
}
}

if nfTablesMgr == nil {
nfTablesMgr = ctrl.NfTablesManagerFactory(constants.KubeSpanDefaultFirewallMark, constants.KubeSpanDefaultForceFirewallMark)
nfTablesMgr = ctrl.NfTablesManagerFactory(constants.KubeSpanDefaultFirewallMark, constants.KubeSpanDefaultForceFirewallMark, constants.KubeSpanDefaultFirewallMask)
}

cfgSpec := cfg.(*kubespan.Config).TypedSpec()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,10 @@ func (suite *ManagerSuite) TestReconcile() {
WireguardClientFactory: func() (kubespanctrl.WireguardClient, error) {
return mockWireguard, nil
},
RulesManagerFactory: func(_, _ int) kubespanctrl.RulesManager {
RulesManagerFactory: func(_, _, _ int) kubespanctrl.RulesManager {
return mockRulesManager{}
},
NfTablesManagerFactory: func(_, _ uint32) kubespanctrl.NfTablesManager {
NfTablesManagerFactory: func(_, _, _ uint32) kubespanctrl.NfTablesManager {
return mockNfTables
},
PeerReconcileInterval: time.Second,
Expand Down
101 changes: 59 additions & 42 deletions internal/app/machined/pkg/controllers/kubespan/nftables.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ type NfTablesManager interface {
}

// NewNfTablesManager initializes NfTablesManager.
func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager {
func NewNfTablesManager(externalMark, internalMark, markMask uint32) NfTablesManager {
nfTable := &nftables.Table{
Family: nftables.TableFamilyINet,
Name: "talos_kubespan",
Expand All @@ -29,6 +29,7 @@ func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager {
return &nfTablesManager{
ExternalMark: externalMark,
InternalMark: internalMark,
MarkMask: markMask,

nfTable: nfTable,
targetSet4: &nftables.Set{
Expand All @@ -50,6 +51,7 @@ func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager {
type nfTablesManager struct {
InternalMark uint32
ExternalMark uint32
MarkMask uint32

currentSet *netaddr.IPSet

Expand Down Expand Up @@ -173,70 +175,72 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error {
return fmt.Errorf("failed to add IPv6 set: %w", err)
}

// meta mark & 0x00000060 == 0x00000020 accept
ruleExpr := []expr.Any{
// Load the firewall mark into register 1
&expr.Meta{
Key: expr.MetaKeyMARK,
Register: 1,
},
// Mask the mark with the configured mask:
// R1 = R1 & mask
&expr.Bitwise{
SourceRegister: 1,
DestRegister: 1,
Len: 4,
Xor: binaryutil.NativeEndian.PutUint32(0),
Mask: binaryutil.NativeEndian.PutUint32(m.MarkMask),
},
// Compare the masked firewall mark with expected value
&expr.Cmp{
Op: expr.CmpOpEq,
Register: 1,
Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark),
},
// Accept the packet to stop the ruleset processing
&expr.Verdict{
Kind: expr.VerdictAccept,
},
}

// match fwmark of Wireguard interface (not kubespan mark)
// accept and return without modifying the table or mark
c.AddRule(&nftables.Rule{
Table: m.nfTable,
Chain: preChain,
Exprs: []expr.Any{
&expr.Meta{
Key: expr.MetaKeyMARK,
Register: 1,
},
&expr.Cmp{
Op: expr.CmpOpEq,
Register: 1,
Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark),
},
&expr.Verdict{
Kind: expr.VerdictAccept,
},
},
Exprs: ruleExpr,
})

// match fwmark of Wireguard interface (not kubespan mark)
// accept and return without modifying the table or mark
c.AddRule(&nftables.Rule{
Table: m.nfTable,
Chain: outChain,
Exprs: []expr.Any{
&expr.Meta{
Key: expr.MetaKeyMARK,
Register: 1,
},
&expr.Cmp{
Op: expr.CmpOpEq,
Register: 1,
Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark),
},
&expr.Verdict{
Kind: expr.VerdictAccept,
},
},
Exprs: ruleExpr,
})

c.AddRule(&nftables.Rule{
Table: m.nfTable,
Chain: preChain,
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark),
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark, m.MarkMask),
})

c.AddRule(&nftables.Rule{
Table: m.nfTable,
Chain: preChain,
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark),
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark, m.MarkMask),
})

c.AddRule(&nftables.Rule{
Table: m.nfTable,
Chain: outChain,
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark),
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark, m.MarkMask),
})

c.AddRule(&nftables.Rule{
Table: m.nfTable,
Chain: outChain,
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark),
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark, m.MarkMask),
})

if err := c.Flush(); err != nil {
Expand All @@ -246,15 +250,15 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error {
return nil
}

func matchIPv4Set(set *nftables.Set, mark uint32) []expr.Any {
return matchIPSet(set, mark, nftables.TableFamilyIPv4)
func matchIPv4Set(set *nftables.Set, mark, mask uint32) []expr.Any {
return matchIPSet(set, mark, mask, nftables.TableFamilyIPv4)
}

func matchIPv6Set(set *nftables.Set, mark uint32) []expr.Any {
return matchIPSet(set, mark, nftables.TableFamilyIPv6)
func matchIPv6Set(set *nftables.Set, mark, mask uint32) []expr.Any {
return matchIPSet(set, mark, mask, nftables.TableFamilyIPv6)
}

func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []expr.Any {
func matchIPSet(set *nftables.Set, mark, mask uint32, family nftables.TableFamily) []expr.Any {
var (
offset uint32 = 16
length uint32 = 4
Expand All @@ -265,6 +269,7 @@ func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []e
length = 16
}

// ip daddr @kubespan_targets_ipv4 meta mark set meta mark & 0xffffffdf | 0x00000040 accept
return []expr.Any{
// Store protocol type to register 1
&expr.Meta{
Expand All @@ -290,17 +295,29 @@ func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []e
SetName: set.Name,
SetID: set.ID,
},
// Store Firewall Force mark to register 1
&expr.Immediate{
// Load the current packet mark into register 1
&expr.Meta{
Key: expr.MetaKeyMARK,
Register: 1,
Data: binaryutil.NativeEndian.PutUint32(mark),
},
// Set firewall mark
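// This bitwise is equivalent to: R1 = (R1 & ^mask) | mark
//
// The NFTables backend bitwise operation is R3 = (R2 & MASK) ^ XOR,
// so we need to do a bit of a trick to do what we need: R1 = (R1 & ^mask) ^ mark
// (the XOR acts as an OR here as long as the mark bits lie within the mask,
// because those bits have just been cleared by the AND).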
&expr.Bitwise{
SourceRegister: 1,
DestRegister: 1,
Len: 4,
Xor: binaryutil.NativeEndian.PutUint32(mark),
Mask: binaryutil.NativeEndian.PutUint32(^mask),
},
// Set firewall mark to the value computed in register 1
&expr.Meta{
Key: expr.MetaKeyMARK,
SourceRegister: true,
Register: 1,
},
// Accept the packet to stop the ruleset processing
&expr.Verdict{
Kind: expr.VerdictAccept,
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (

func TestNfTables(t *testing.T) {
// use a different mark to avoid conflicts with running kubespan
mgr := kubespan.NewNfTablesManager(constants.KubeSpanDefaultFirewallMark+10, constants.KubeSpanDefaultForceFirewallMark+10)
mgr := kubespan.NewNfTablesManager(constants.KubeSpanDefaultFirewallMark+10, constants.KubeSpanDefaultForceFirewallMark<<1, constants.KubeSpanDefaultFirewallMask<<1)

// cleanup should be fine if nothing is installed
assert.NoError(t, mgr.Cleanup())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,18 @@ type RulesManager interface {
}

// NewRulesManager initializes new RulesManager.
func NewRulesManager(targetTable, internalMark int) RulesManager {
func NewRulesManager(targetTable, internalMark, markMask int) RulesManager {
return &rulesManager{
TargetTable: targetTable,
InternalMark: internalMark,
MarkMask: markMask,
}
}

// rulesManager holds the parameters used to build the routing rules
// that Install sets up for both AF_INET and AF_INET6.
type rulesManager struct {
// TargetTable is passed as the Table of each routing rule built in Install.
TargetTable int
// InternalMark is passed as the Mark (firewall mark to match) of each rule.
InternalMark int
// MarkMask is passed as the Mask of each rule; per the change description,
// bits of the packet mark outside this mask are ignored when matching.
MarkMask int
}

// Install routing rules.
Expand All @@ -49,7 +51,7 @@ func (m *rulesManager) Install() error {
Family: unix.AF_INET,
Table: m.TargetTable,
Mark: m.InternalMark,
Mask: -1,
Mask: m.MarkMask,
Goto: -1,
Flow: -1,
SuppressIfgroup: -1,
Expand All @@ -65,7 +67,7 @@ func (m *rulesManager) Install() error {
Family: unix.AF_INET6,
Table: m.TargetTable,
Mark: m.InternalMark,
Mask: -1,
Mask: m.MarkMask,
Goto: -1,
Flow: -1,
SuppressIfgroup: -1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (

func TestRoutingRules(t *testing.T) {
// use a different table/mark to avoid conflicts with running kubespan
mgr := kubespan.NewRulesManager(constants.KubeSpanDefaultRoutingTable+10, constants.KubeSpanDefaultForceFirewallMark+10)
mgr := kubespan.NewRulesManager(constants.KubeSpanDefaultRoutingTable+10, constants.KubeSpanDefaultForceFirewallMark<<1, constants.KubeSpanDefaultFirewallMask<<1)

// cleanup should be fine if nothing is installed
assert.NoError(t, mgr.Cleanup())
Expand Down
9 changes: 7 additions & 2 deletions pkg/machinery/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -619,12 +619,17 @@ const (
// KubeSpanDefaultFirewallMark is the default firewall mark to use for Wireguard encrypted egress packets.
//
// Normal Wireguard configurations will NOT use this firewall mark.
KubeSpanDefaultFirewallMark = 0x51820
KubeSpanDefaultFirewallMark = 0x20

// KubeSpanDefaultForceFirewallMark is the default firewall mark to use for packets destined to IPs serviced by KubeSpan.
//
// It is used to signal that matching packets should be forced into the Wireguard interface.
KubeSpanDefaultForceFirewallMark = 0x51821
KubeSpanDefaultForceFirewallMark = 0x40

// KubeSpanDefaultFirewallMask is the mask applied to the packet mark when matching and setting the mark.
//
// This mask signals the bits of the firewall mark used by KubeSpan.
KubeSpanDefaultFirewallMask = KubeSpanDefaultFirewallMark | KubeSpanDefaultForceFirewallMark

// KubeSpanDefaultPeerKeepalive is the interval at which Wireguard Peer Keepalives should be sent.
KubeSpanDefaultPeerKeepalive = 25 * time.Second
Expand Down
Loading

0 comments on commit 644e803

Please sign in to comment.