Skip to content

Commit

Permalink
fix: workaround a race in CNI setup (talosctl cluster create)
Browse files Browse the repository at this point in the history
When provisioning VMs, each launch process sets up CNI network, and from
time to time CNI setup fails with something like:

```
error provisioning CNI network: plugin type="firewall" failed (add): running [/sbin/iptables -t filter -N CNI-ADMIN --wait]: exit status 4: iptables v1.8.10 (nf_tables)
```

This a race condition in the CNI plugins, and it looks like there is no
fix for it (see e.g. hashicorp/nomad#8838).

As a workaround, take a mutex around CNI operation to serialize them.
CNI setup happens in different processes, so use a file-based mutex.

Signed-off-by: Andrey Smirnov <[email protected]>
(cherry picked from commit b2ad5dc)
  • Loading branch information
smira committed Mar 6, 2024
1 parent 38b5aed commit e4f7126
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 3 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azcertificates v1.0.0
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.0.1
github.com/BurntSushi/toml v1.3.2
github.com/alexflint/go-filemutex v1.2.0
github.com/aws/aws-sdk-go-v2/config v1.25.6
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.5
github.com/aws/aws-sdk-go-v2/service/kms v1.26.5
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ github.com/ProtonMail/gopenpgp/v2 v2.7.4 h1:Vz/8+HViFFnf2A6XX8JOvZMrA6F5puwNvvF2
github.com/ProtonMail/gopenpgp/v2 v2.7.4/go.mod h1:IhkNEDaxec6NyzSI0PlxapinnwPVIESk8/76da3Ct3g=
github.com/adrg/xdg v0.4.0 h1:RzRqFcjH4nE5C6oTAxhBtoE2IRyjBSa62SCbyPidvls=
github.com/adrg/xdg v0.4.0/go.mod h1:N6ag73EX4wyxeaoeHctc1mas01KZgsj5tYiAIwqJE/E=
github.com/alexflint/go-filemutex v1.2.0 h1:1v0TJPDtlhgpW4nJ+GvxCLSlUDC3+gW0CQQvlmfDR/s=
github.com/alexflint/go-filemutex v1.2.0/go.mod h1:mYyQSWvw9Tx2/H2n9qXPb52tTYfE0pZAWcBq5mK025c=
github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2 h1:7Ip0wMmLHLRJdrloDxZfhMm0xrLXZS8+COSu2bXmEQs=
github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o=
github.com/armon/go-proxyproto v0.0.0-20210323213023-7e956b284f0a/go.mod h1:QmP9hvJ91BbJmGVGSbutW19IC0Q9phDCLGaomwTJbgU=
Expand Down
56 changes: 53 additions & 3 deletions pkg/provision/providers/qemu/launch.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ import (
"strconv"
"strings"

"github.com/alexflint/go-filemutex"
"github.com/containernetworking/cni/libcni"
"github.com/containernetworking/cni/pkg/types"
types100 "github.com/containernetworking/cni/pkg/types/100"
"github.com/containernetworking/plugins/pkg/ns"
"github.com/containernetworking/plugins/pkg/testutils"
Expand Down Expand Up @@ -94,6 +96,39 @@ type tpm2Config struct {
StateDir string
}

// withCNIOperationLocked ensures that CNI operations don't run concurrently.
//
// There are race conditions in the CNI plugins that can cause a failure if called concurrently.
func withCNIOperationLocked[T any](config *LaunchConfig, f func() (T, error)) (T, error) {
var zeroT T

lock, err := filemutex.New(filepath.Join(config.StatePath, "cni.lock"))
if err != nil {
return zeroT, fmt.Errorf("failed to create CNI lock: %w", err)
}

if err = lock.Lock(); err != nil {
return zeroT, fmt.Errorf("failed to acquire CNI lock: %w", err)
}

defer func() {
if err := lock.Close(); err != nil {
log.Printf("failed to release CNI lock: %s", err)
}
}()

return f()
}

// withCNIOperationLockedNoResult ensures that CNI operations don't run concurrently.
func withCNIOperationLockedNoResult(config *LaunchConfig, f func() error) error {
_, err := withCNIOperationLocked(config, func() (struct{}, error) {
return struct{}{}, f()
})

return err
}

// withCNI creates network namespace, launches CNI and passes control to the next function
// filling config with netNS and interface details.
//
Expand Down Expand Up @@ -134,18 +169,33 @@ func withCNI(ctx context.Context, config *LaunchConfig, f func(config *LaunchCon
}

// attempt to clean up network in case it was deployed previously
err = cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
err = withCNIOperationLockedNoResult(
config,
func() error {
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
)
if err != nil {
return fmt.Errorf("error deleting CNI network: %w", err)
}

res, err := cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
res, err := withCNIOperationLocked(
config,
func() (types.Result, error) {
return cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
)
if err != nil {
return fmt.Errorf("error provisioning CNI network: %w", err)
}

defer func() {
if e := cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf); e != nil {
if e := withCNIOperationLockedNoResult(
config,
func() error {
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
); e != nil {
log.Printf("error cleaning up CNI: %s", e)
}
}()
Expand Down

0 comments on commit e4f7126

Please sign in to comment.