Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Windows] Fix Pod cannot access endpoints with external IP through ClusterIP Service #1824

Merged
merged 3 commits into from
Feb 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 92 additions & 13 deletions pkg/agent/openflow/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,19 @@ const (

CtZone = 0xfff0
CtZoneV6 = 0xffe6
// CtZoneSNAT is only used on Windows and only when AntreaProxy is enabled.
// When a Pod access a ClusterIP Service, and the IP of the selected endpoint
// is not in "cluster-cidr". The request packets need to be SNAT'd(set src IP to local Node IP)
// after have been DNAT'd(set dst IP to endpoint IP).
// For example, the endpoint Pod may run in hostNetwork mode and the IP of the endpoint
// will is the current Node IP.
// We need to use a different ct_zone to track the SNAT'd connection because OVS
// does not support doing both DNAT and SNAT in the same ct_zone.
//
// An example of the connection is a Pod accesses kubernetes API service:
// Pod --> DNAT(CtZone) --> SNAT(CtZoneSNAT) --> Endpoint(API server NodeIP)
// Pod <-- unDNAT(CtZone) <-- unSNAT(CtZoneSNAT) <-- Endpoint(API server NodeIP)
CtZoneSNAT = 0xffdc

portFoundMark = 0b1
snatRequiredMark = 0b1
Expand Down Expand Up @@ -692,7 +705,7 @@ func (c *client) ctRewriteDstMACFlows(gatewayMAC net.HardwareAddr, category cook
// service LB tables and enter egressRuleTable directly.
func (c *client) serviceLBBypassFlows(ipProtocol binding.Protocol) []binding.Flow {
connectionTrackStateTable := c.pipeline[conntrackStateTable]
return []binding.Flow{
flows := []binding.Flow{
// Tracked connections with the ServiceCTMark (load-balanced by AntreaProxy) receive
// the macRewriteMark and are sent to egressRuleTable.
connectionTrackStateTable.BuildFlow(priorityNormal).MatchProtocol(ipProtocol).
Expand All @@ -711,6 +724,26 @@ func (c *client) serviceLBBypassFlows(ipProtocol binding.Protocol) []binding.Flo
Cookie(c.cookieAllocator.Request(cookie.Service).Raw()).
Done(),
}

if runtime.IsWindowsPlatform() && ipProtocol == binding.ProtocolIP {
// Handle the reply packets of the connection which are applied both DNAT and SNAT.
// The packets have following characteristics:
// - Received from uplink
// - ct_state is "-new+trk"
// - ct_mark is set to 0x21(ServiceCTMark)
// This flow resubmits the packets to the following table to avoid being forwarded
// to the bridge port by default.
flows = append(flows, c.pipeline[conntrackStateTable].BuildFlow(priorityHigh).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is it different from the first flow created in L711? seems duplicate

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we add a new match field markTrafficFromUplink . Or the traffic recieved from uplink will hit L1563 and be forwarded to br-int directly.

		// Output the non-SNAT packet to the bridge interface directly if it is received from the uplink interface.
		c.pipeline[conntrackStateTable].BuildFlow(priorityNormal).
			MatchProtocol(binding.ProtocolIP).
			MatchRegRange(int(marksReg), markTrafficFromUplink, binding.Range{0, 15}).
			Action().Output(int(bridgeOFPort)).
			Cookie(c.cookieAllocator.Request(category).Raw()).
			Done(),

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For simplicity, could we make L1563 low priority, L1552 flow can be normal priority too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's reasonable by analyzing the flows. Will have a try.

MatchProtocol(ipProtocol).
MatchCTStateNew(false).MatchCTStateTrk(true).
MatchCTMark(ServiceCTMark, nil).
MatchRegRange(int(marksReg), markTrafficFromUplink, binding.Range{0, 15}).
Action().LoadRegRange(int(marksReg), macRewriteMark, macRewriteMarkRange).
Action().GotoTable(EgressRuleTable).
Cookie(c.cookieAllocator.Request(cookie.Service).Raw()).
Done())
}
return flows
}

// l2ForwardCalcFlow generates the flow that matches dst MAC and loads ofPort to reg.
Expand Down Expand Up @@ -1516,16 +1549,6 @@ func (c *client) uplinkSNATFlows(category cookie.Category) []binding.Flow {
}
bridgeOFPort := uint32(config.BridgeOFPort)
flows := []binding.Flow{
// Forward the IP packets from the uplink interface to
// conntrackTable. This is for unSNAT the traffic from the local
// Pod subnet to the external network. Non-SNAT packets will be
// output to the bridge port in conntrackStateTable.
c.pipeline[uplinkTable].BuildFlow(priorityNormal).
MatchProtocol(binding.ProtocolIP).
MatchRegRange(int(marksReg), markTrafficFromUplink, binding.Range{0, 15}).
Action().GotoTable(conntrackTable).
Cookie(c.cookieAllocator.Request(category).Raw()).
Done(),
// Mark the packet to indicate its destination MAC should be rewritten to the real MAC in the L3Forwarding
// table, if the packet is a reply to a Pod from an external address.
c.pipeline[conntrackStateTable].BuildFlow(priorityHigh).
Expand All @@ -1545,6 +1568,29 @@ func (c *client) uplinkSNATFlows(category cookie.Category) []binding.Flow {
Cookie(c.cookieAllocator.Request(category).Raw()).
Done(),
}
// Forward the IP packets from the uplink interface to
// conntrackTable. This is for unSNAT the traffic from the local
// Pod subnet to the external network. Non-SNAT packets will be
// output to the bridge port in conntrackStateTable.
if c.enableProxy {
// For the connection which is both applied DNAT and SNAT, the reply packtets
// are received from uplink and need to enter CTZoneSNAT first to do unSNAT.
// Pod --> DNAT(CtZone) --> SNAT(CtZoneSNAT) --> ExternalServer
// Pod <-- unDNAT(CtZone) <-- unSNAT(CtZoneSNAT) <-- ExternalServer
flows = append(flows, c.pipeline[uplinkTable].BuildFlow(priorityNormal).
MatchProtocol(binding.ProtocolIP).
MatchRegRange(int(marksReg), markTrafficFromUplink, binding.Range{0, 15}).
Action().CT(false, conntrackTable, CtZoneSNAT).NAT().CTDone().
Cookie(c.cookieAllocator.Request(category).Raw()).
Done())
} else {
flows = append(flows, c.pipeline[uplinkTable].BuildFlow(priorityNormal).
MatchProtocol(binding.ProtocolIP).
MatchRegRange(int(marksReg), markTrafficFromUplink, binding.Range{0, 15}).
Action().GotoTable(conntrackTable).
Cookie(c.cookieAllocator.Request(category).Raw()).
Done())
}
return flows
}

Expand Down Expand Up @@ -1600,7 +1646,6 @@ func (c *client) snatFlows(nodeIP net.IP, localSubnet net.IPNet, category cookie
Action().GotoTable(nextTable).
Cookie(c.cookieAllocator.Request(category).Raw()).
Done(),

// Force IP packet into the conntrack zone with SNAT. If the connection is SNATed, the reply packet should use
// Pod IP as the destination, and then is forwarded to conntrackStateTable.
c.pipeline[conntrackTable].BuildFlow(priorityNormal).MatchProtocol(binding.ProtocolIP).
Expand All @@ -1612,14 +1657,48 @@ func (c *client) snatFlows(nodeIP net.IP, localSubnet net.IPNet, category cookie
// source IP in NAT action, 4) ct_mark is set to 0x40 in the conn_track context.
c.pipeline[conntrackCommitTable].BuildFlow(priorityNormal).
MatchProtocol(binding.ProtocolIP).
MatchCTStateNew(true).MatchCTStateTrk(true).
MatchCTStateNew(true).MatchCTStateTrk(true).MatchCTStateDNAT(false).
MatchRegRange(int(marksReg), snatRequiredMark, snatMarkRange).
Action().CT(true, L2ForwardingOutTable, CtZone).
SNAT(snatIPRange, nil).
LoadToMark(snatCTMark).CTDone().
Cookie(c.cookieAllocator.Request(category).Raw()).
Done(),
}
// The following flows are for both apply DNAT + SNAT for packets.
// If AntreaProxy is disabled, no DNAT happens in OVS pipeline.
if c.enableProxy {
// If the SNAT is needed after DNAT, mark the snatRequiredMark even the connection is not new.
// Because this kind of packets need to enter CtZoneSNAT to make sure the SNAT can be applied
// before leaving the pipeline.
flows = append(flows, l3FwdTable.BuildFlow(priorityLow).
MatchProtocol(binding.ProtocolIP).
MatchCTStateNew(false).MatchCTStateTrk(true).MatchCTStateDNAT(true).
ruicao93 marked this conversation as resolved.
Show resolved Hide resolved
MatchRegRange(int(marksReg), markTrafficFromLocal, binding.Range{0, 15}).
Action().LoadRegRange(int(marksReg), snatRequiredMark, snatMarkRange).
Action().GotoTable(nextTable).
Cookie(c.cookieAllocator.Request(category).Raw()).
Done())
// If SNAT is needed after DNAT:
// - For new connection: commit to CtZoneSNAT
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we consider doing all SNAT in this zone later? It seems currently SNAT is performed in CtZone when it's not DNATed and in CtZoneSNAT otherwise, which seems a little difficult complex. Or you plan to unify them when moving to two bridges?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think unify them when moving to two bridges. This PR only handle the DNAT + SNAT case to aovid introducing big change.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would agree we should handle all SNAT in a single way. Need to understand the two bridge proposal better.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once you have something, could you share? I hope to understand how we are going to organize flows with two bridges, as I am designing flows for SNAT policy, which might be impacted by the two-bridge change.

Copy link
Contributor Author

@ruicao93 ruicao93 Feb 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure @jianjuns. Actually the two bridges is just a draft idea for NodePort Service support on Windows and need to be investigated.

Agree with you and Quan, handle all SNAT in a single way would be better.

But consider the v0.13.0 is near to release, do you think if we could merge current change first and make further step(all SNAT in single ct_zone or use other ways) after v0.13.0?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can unify SNAT flows in the next release.

// - For existing connection: enter CtZoneSNAT to apply SNAT
flows = append(flows, c.pipeline[conntrackCommitTable].BuildFlow(priorityNormal).
MatchProtocol(binding.ProtocolIP).
MatchCTStateNew(true).MatchCTStateTrk(true).MatchCTStateDNAT(true).
MatchRegRange(int(marksReg), snatRequiredMark, snatMarkRange).
Action().CT(true, L2ForwardingOutTable, CtZoneSNAT).
SNAT(snatIPRange, nil).
LoadToMark(snatCTMark).CTDone().
Cookie(c.cookieAllocator.Request(category).Raw()).
Done())
flows = append(flows, c.pipeline[conntrackCommitTable].BuildFlow(priorityNormal).
MatchProtocol(binding.ProtocolIP).
MatchCTStateNew(false).MatchCTStateTrk(true).MatchCTStateDNAT(true).
MatchRegRange(int(marksReg), snatRequiredMark, snatMarkRange).
Action().CT(false, L2ForwardingOutTable, CtZoneSNAT).NAT().CTDone().
Cookie(c.cookieAllocator.Request(category).Raw()).
Done())
}
return flows
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/ovs/openflow/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ type FlowBuilder interface {
MatchCTStateEst(isSet bool) FlowBuilder
MatchCTStateTrk(isSet bool) FlowBuilder
MatchCTStateInv(isSet bool) FlowBuilder
MatchCTStateDNAT(isSet bool) FlowBuilder
MatchCTStateSNAT(isSet bool) FlowBuilder
MatchCTMark(value uint32, mask *uint32) FlowBuilder
MatchCTLabelRange(high, low uint64, bitRange Range) FlowBuilder
MatchConjID(value uint32) FlowBuilder
Expand Down
28 changes: 28 additions & 0 deletions pkg/ovs/openflow/ofctrl_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,34 @@ func (b *ofFlowBuilder) MatchCTStateInv(set bool) FlowBuilder {
return b
}

func (b *ofFlowBuilder) MatchCTStateDNAT(set bool) FlowBuilder {
if b.ctStates == nil {
b.ctStates = openflow13.NewCTStates()
}
if set {
b.ctStates.SetDNAT()
b.addCTStateString("+dnat")
} else {
b.ctStates.UnsetDNAT()
b.addCTStateString("-dnat")
}
return b
}

func (b *ofFlowBuilder) MatchCTStateSNAT(set bool) FlowBuilder {
if b.ctStates == nil {
b.ctStates = openflow13.NewCTStates()
}
if set {
b.ctStates.SetSNAT()
b.addCTStateString("+snat")
} else {
b.ctStates.UnsetSNAT()
b.addCTStateString("-snat")
}
return b
}

// MatchCTMark adds match condition for matching ct_mark. If mask is nil, the mask should be not set in the OpenFlow
// message which is sent to OVS, and OVS should match the value exactly.
func (b *ofFlowBuilder) MatchCTMark(value uint32, mask *uint32) FlowBuilder {
Expand Down
30 changes: 29 additions & 1 deletion pkg/ovs/openflow/testing/mock_openflow.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions test/integration/agent/openflow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1272,7 +1272,7 @@ func prepareExternalFlows(nodeIP net.IP, localSubnet *net.IPNet, vMAC net.Hardwa
[]*ofTestUtils.ExpectFlow{
{
MatchStr: fmt.Sprintf("priority=200,ip,reg0=0x4/0xffff"),
ActStr: "goto_table:30",
ActStr: "ct(table=30,zone=65500,nat)",
},
},
},
Expand Down Expand Up @@ -1321,9 +1321,17 @@ func prepareExternalFlows(nodeIP net.IP, localSubnet *net.IPNet, vMAC net.Hardwa
uint8(105),
[]*ofTestUtils.ExpectFlow{
{
MatchStr: "priority=200,ct_state=+new+trk,ip,reg0=0x20000/0x20000",
MatchStr: "priority=200,ct_state=+new+trk-dnat,ip,reg0=0x20000/0x20000",
ActStr: fmt.Sprintf("ct(commit,table=110,zone=65520,nat(src=%s),exec(load:0x40->NXM_NX_CT_MARK[]))", nodeIP.String()),
},
{
MatchStr: "priority=200,ct_state=+new+trk+dnat,ip,reg0=0x20000/0x20000",
ActStr: fmt.Sprintf("ct(commit,table=110,zone=65500,nat(src=%s),exec(load:0x40->NXM_NX_CT_MARK[]))", nodeIP.String()),
},
{
MatchStr: "priority=200,ct_state=-new+trk+dnat,ip,reg0=0x20000/0x20000",
ActStr: "ct(table=110,zone=65500,nat)",
},
},
},
}
Expand Down