Skip to content

Commit

Permalink
feat: support rotation of Talos API CA
Browse files Browse the repository at this point in the history
This allows rolling all nodes over to a new CA, either to refresh it or, e.g.,
when the `talosconfig` was exposed accidentally.

Signed-off-by: Andrey Smirnov <[email protected]>
  • Loading branch information
smira committed Mar 22, 2024
1 parent 92808e3 commit 8eacc4b
Show file tree
Hide file tree
Showing 59 changed files with 1,908 additions and 518 deletions.
Binary file modified api/api.descriptors
Binary file not shown.
4 changes: 4 additions & 0 deletions api/common/common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ message PEMEncodedKey {
bytes key = 1;
}

// PEMEncodedCertificate carries a single certificate in the crt field.
// Unlike PEMEncodedKey it has no key field.
// NOTE(review): judging by the message name, crt presumably holds PEM-encoded bytes - confirm.
message PEMEncodedCertificate {
bytes crt = 1;
}

message NetIP {
bytes ip = 1;
}
Expand Down
7 changes: 4 additions & 3 deletions api/resource/definitions/secrets/secrets.proto
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ import "common/common.proto";

// APICertsSpec describes etcd certs secrets.
message APICertsSpec {
common.PEMEncodedCertificateAndKey ca = 1;
common.PEMEncodedCertificateAndKey client = 2;
common.PEMEncodedCertificateAndKey server = 3;
repeated common.PEMEncodedCertificate accepted_c_as = 4;
}

// CertSANSpec describes fields of the cert SANs.
Expand Down Expand Up @@ -86,15 +86,16 @@ message MaintenanceServiceCertsSpec {

// OSRootSpec describes operating system CA.
message OSRootSpec {
common.PEMEncodedCertificateAndKey ca = 1;
common.PEMEncodedCertificateAndKey issuing_ca = 1;
repeated common.NetIP cert_sani_ps = 2;
repeated string cert_sandns_names = 3;
string token = 4;
repeated common.PEMEncodedCertificate accepted_c_as = 5;
}

// TrustdCertsSpec describes etcd certs secrets.
message TrustdCertsSpec {
common.PEMEncodedCertificateAndKey ca = 1;
common.PEMEncodedCertificateAndKey server = 2;
repeated common.PEMEncodedCertificate accepted_c_as = 3;
}

6 changes: 2 additions & 4 deletions cmd/talosctl/cmd/talos/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func healthOnClient(ctx context.Context, c *client.Client) error {
}
defer clientProvider.Close() //nolint:errcheck

clusterInfo, err := buildClusterInfo()
clusterInfo, err := buildClusterInfo(healthCmdFlags.clusterState)
if err != nil {
return err
}
Expand Down Expand Up @@ -225,9 +225,7 @@ func init() {
healthCmd.Flags().BoolVar(&healthCmdFlags.runE2E, "run-e2e", false, "run Kubernetes e2e test")
}

func buildClusterInfo() (cluster.Info, error) {
clusterState := healthCmdFlags.clusterState

func buildClusterInfo(clusterState clusterNodes) (cluster.Info, error) {
// if nodes are set explicitly via command line args, use them
if len(clusterState.ControlPlaneNodes) > 0 || len(clusterState.WorkerNodes) > 0 {
return &clusterState, nil
Expand Down
122 changes: 122 additions & 0 deletions cmd/talosctl/cmd/talos/rotate-ca.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package talos

import (
"context"
"fmt"
"time"

"github.com/spf13/cobra"

"github.com/siderolabs/talos/pkg/machinery/client"
clientconfig "github.com/siderolabs/talos/pkg/machinery/client/config"
"github.com/siderolabs/talos/pkg/machinery/config"
"github.com/siderolabs/talos/pkg/machinery/config/encoder"
"github.com/siderolabs/talos/pkg/machinery/config/generate/secrets"
"github.com/siderolabs/talos/pkg/rotate/pki/talos"
)

// rotateCACmdFlags holds the values of the command line flags of the rotate-ca command.
var rotateCACmdFlags struct {
// clusterState receives the node lists from --init-node, --control-plane-nodes and --worker-nodes.
clusterState clusterNodes
// forceEndpoint is bound to --k8s-endpoint.
forceEndpoint string
// output is the path the new talosconfig is saved to (--output / -o, default "talosconfig").
output string
// withExamples is bound to --with-examples (default true).
withExamples bool
// withDocs is bound to --with-docs (default true).
withDocs bool
// dryRun is bound to --dry-run (default true): no changes are made to the cluster unless it is set to false.
dryRun bool
}

// rotateCACmd is the cobra definition of the `rotate-ca` command.
//
// The command takes no positional arguments. It first initializes the node
// information from the command flags and then runs rotateCA with a client
// obtained via WithClient.
var rotateCACmd = &cobra.Command{
	Use:   "rotate-ca",
	Short: "Rotate cluster CAs (Talos and Kubernetes APIs).",
	Long:  `The command starts by generating new CAs, and gracefully applying it to the cluster.`,
	Args:  cobra.NoArgs,
	RunE: func(_ *cobra.Command, _ []string) error {
		if initErr := rotateCACmdFlags.clusterState.InitNodeInfos(); initErr != nil {
			return initErr
		}

		return WithClient(rotateCA)
	},
}

// rotateCA generates a new Talos API CA and hands it to talos.Rotate to be
// applied to the cluster.
//
// Steps:
//   - build the machine config encoder options from --with-docs / --with-examples;
//   - build the cluster node information from the node flags;
//   - open the current talosconfig to determine the context name (the global
//     context override wins when set);
//   - generate a new secrets bundle and use its OS CA as the new Talos CA;
//   - call talos.Rotate and, unless running in dry-run mode, save the returned
//     talosconfig to the path given by --output.
//
// oldClient is the client built from the current talosconfig; its endpoints are
// reused for the rotation options.
func rotateCA(ctx context.Context, oldClient *client.Client) error {
	// Use the flags of this command: the --with-docs / --with-examples flags of
	// rotate-ca are bound to rotateCACmdFlags, not to the upgrade-k8s flags.
	commentsFlags := encoder.CommentsDisabled
	if rotateCACmdFlags.withDocs {
		commentsFlags |= encoder.CommentsDocs
	}

	if rotateCACmdFlags.withExamples {
		commentsFlags |= encoder.CommentsExamples
	}

	encoderOpt := encoder.WithComments(commentsFlags)

	clusterInfo, err := buildClusterInfo(rotateCACmdFlags.clusterState)
	if err != nil {
		return err
	}

	oldTalosconfig, err := clientconfig.Open(GlobalArgs.Talosconfig)
	if err != nil {
		return fmt.Errorf("failed to open config file %q: %w", GlobalArgs.Talosconfig, err)
	}

	// Default to the context selected in the talosconfig, unless one was
	// requested explicitly on the command line.
	configContext := oldTalosconfig.Context

	if GlobalArgs.CmdContext != "" {
		configContext = GlobalArgs.CmdContext
	}

	// Only the OS (Talos API) CA of the freshly generated bundle is used below.
	newBundle, err := secrets.NewBundle(secrets.NewFixedClock(time.Now()), config.TalosVersionCurrent)
	if err != nil {
		return fmt.Errorf("error generating new Talos CA: %w", err)
	}

	options := talos.Options{
		DryRun: rotateCACmdFlags.dryRun,

		CurrentClient: oldClient,
		ClusterInfo:   clusterInfo,

		ContextName: configContext,
		Endpoints:   oldClient.GetEndpoints(),

		NewTalosCA: newBundle.Certs.OS,

		EncoderOption: encoderOpt,

		Printf: func(format string, args ...any) { fmt.Printf(format, args...) },
	}

	newTalosconfig, err := talos.Rotate(ctx, options)
	if err != nil {
		return err
	}

	if rotateCACmdFlags.dryRun {
		fmt.Println("> Dry-run mode enabled, no changes were made to the cluster, re-run with `--dry-run=false` to apply the changes.")

		return nil
	}

	fmt.Printf("> Writing new talosconfig to %q\n", rotateCACmdFlags.output)

	return newTalosconfig.Save(rotateCACmdFlags.output)
}

// init registers the rotate-ca command and binds its command line flags to
// the fields of rotateCACmdFlags.
func init() {
	addCommand(rotateCACmd)

	flagSet := rotateCACmd.Flags()
	nodes := &rotateCACmdFlags.clusterState

	// cluster node selection
	flagSet.StringVar(&nodes.InitNode, "init-node", "", "specify IPs of init node")
	flagSet.StringSliceVar(&nodes.ControlPlaneNodes, "control-plane-nodes", nil, "specify IPs of control plane nodes")
	flagSet.StringSliceVar(&nodes.WorkerNodes, "worker-nodes", nil, "specify IPs of worker nodes")

	// rotation behavior and output
	flagSet.StringVar(&rotateCACmdFlags.forceEndpoint, "k8s-endpoint", "", "use endpoint instead of kubeconfig default")
	flagSet.BoolVarP(&rotateCACmdFlags.withExamples, "with-examples", "", true, "patch all machine configs with the commented examples")
	flagSet.BoolVarP(&rotateCACmdFlags.withDocs, "with-docs", "", true, "patch all machine configs adding the documentation for each field")
	flagSet.StringVarP(&rotateCACmdFlags.output, "output", "o", "talosconfig", "path to the output new `talosconfig`")
	flagSet.BoolVarP(&rotateCACmdFlags.dryRun, "dry-run", "", true, "dry-run mode (no changes to the cluster)")
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ require (
github.com/ryanuber/go-glob v1.0.0
github.com/safchain/ethtool v0.3.0
github.com/scaleway/scaleway-sdk-go v1.0.0-beta.25
github.com/siderolabs/crypto v0.4.2
github.com/siderolabs/crypto v0.4.4
github.com/siderolabs/discovery-api v0.1.4
github.com/siderolabs/discovery-client v0.1.8
github.com/siderolabs/gen v0.4.8
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -652,8 +652,8 @@ github.com/sethgrid/pester v1.2.0 h1:adC9RS29rRUef3rIKWPOuP1Jm3/MmB6ke+OhE5giENI
github.com/sethgrid/pester v1.2.0/go.mod h1:hEUINb4RqvDxtoCaU0BNT/HV4ig5kfgOasrf1xcvr0A=
github.com/siderolabs/coredns v1.11.52 h1:L0jzzyRvmhiA3mf21yjXh8OjTEz95sVknOHfgq1I9Dk=
github.com/siderolabs/coredns v1.11.52/go.mod h1:dePXyKhQsTe3Ks228EAaiBWxV37jyquDDVHc8zwiWSY=
github.com/siderolabs/crypto v0.4.2 h1:ahAwmm1+0xd3QfGiZ0jYpWxtCVngsy+PK0cgR9frOA8=
github.com/siderolabs/crypto v0.4.2/go.mod h1:rnjC/Z6m/mI2vMv98glgU6oU8lXNi8YceiTxALohfzY=
github.com/siderolabs/crypto v0.4.4 h1:Q6EDBMR2Ub2oAZW5Xl8lrKB27bM3Sn8Gkfw3rngco5U=
github.com/siderolabs/crypto v0.4.4/go.mod h1:hsR3tJ3aaeuhCChsLF4dBd9vlJVPvmhg4vvx2ez4aD4=
github.com/siderolabs/discovery-api v0.1.4 h1:2fMEFSMiWaD1zDiBDY5md8VxItvL1rDQRSOfeXNjYKc=
github.com/siderolabs/discovery-api v0.1.4/go.mod h1:kaBy+G42v2xd/uAF/NIe383sjNTBE2AhxPTyi9SZI0s=
github.com/siderolabs/discovery-client v0.1.8 h1:8WhJiNyVmjZ0F+tSfeaDyQ04n02lRK2dh/CvWp+zlnY=
Expand Down
6 changes: 6 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,12 @@ machine:
servers:
- /dev/ptp0
```
"""

[notes.ca-rotation]
title = "CA Rotation"
description = """\
Talos Linux now supports rotating the root CA certificate and key for Talos API and Kubernetes API.
"""

[make_deps]
Expand Down
2 changes: 2 additions & 0 deletions hack/structprotogen/proto/proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,8 @@ func formatTypeName(fieldTypePkg string, fieldType string, declPkg string) (stri
return commoProto, "common.PEMEncodedCertificateAndKey"
case typeData{"github.com/siderolabs/crypto/x509", "PEMEncodedKey"}:
return commoProto, "common.PEMEncodedKey"
case typeData{"github.com/siderolabs/crypto/x509", "PEMEncodedCertificate"}:
return commoProto, "common.PEMEncodedCertificate"
default:
return "", ""
}
Expand Down
10 changes: 7 additions & 3 deletions internal/app/apid/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,15 @@ func apidMain() error {
return fmt.Errorf("failed to create client TLS config: %w", err)
}

var remoteFactory director.RemoteBackendFactory
var (
remoteFactory director.RemoteBackendFactory
onPKIUpdate func()
)

if clientTLSConfig != nil {
backendFactory := apidbackend.NewAPIDFactory(clientTLSConfig)
backendFactory := apidbackend.NewAPIDFactory(tlsConfig)
remoteFactory = backendFactory.Get
onPKIUpdate = backendFactory.Flush
}

localAddressProvider, err := director.NewLocalAddressProvider(resources)
Expand Down Expand Up @@ -226,7 +230,7 @@ func apidMain() error {
})

errGroup.Go(func() error {
return tlsConfig.Watch(ctx)
return tlsConfig.Watch(ctx, onPKIUpdate)
})

errGroup.Go(func() error {
Expand Down
18 changes: 12 additions & 6 deletions internal/app/apid/pkg/backend/apid.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package backend

import (
"context"
"crypto/tls"
"fmt"
"sync"
"time"
Expand All @@ -32,22 +33,23 @@ var _ proxy.Backend = (*APID)(nil)
// Backend authenticates itself using given grpc credentials.
type APID struct {
target string
creds credentials.TransportCredentials

tlsConfigProvider func() (*tls.Config, error)

mu sync.Mutex
conn *grpc.ClientConn
}

// NewAPID creates new instance of APID backend.
func NewAPID(target string, creds credentials.TransportCredentials) (*APID, error) {
func NewAPID(target string, tlsConfigProvider func() (*tls.Config, error)) (*APID, error) {
// perform very basic validation on target, trying to weed out empty addresses or addresses with the port appended
if target == "" || net.AddressContainsPort(target) {
return nil, fmt.Errorf("invalid target %q", target)
}

return &APID{
target: target,
creds: creds,
target: target,
tlsConfigProvider: tlsConfigProvider,
}, nil
}

Expand Down Expand Up @@ -81,20 +83,24 @@ func (a *APID) GetConnection(ctx context.Context, fullMethodName string) (contex
return outCtx, a.conn, nil
}

tlsConfig, err := a.tlsConfigProvider()
if err != nil {
return outCtx, nil, err
}

// override max delay to avoid excessive backoff when another node is unavailable (e.g. rebooted),
// and apid used as an endpoint considers another node to be down for longer than expected.
//
// default max delay is 2 minutes, which is too long for our use case.
backoffConfig := backoff.DefaultConfig
backoffConfig.MaxDelay = 15 * time.Second

var err error
a.conn, err = grpc.DialContext(
ctx,
fmt.Sprintf("%s:%d", net.FormatAddress(a.target), constants.ApidPort),
grpc.WithInitialWindowSize(65535*32),
grpc.WithInitialConnWindowSize(65535*16),
grpc.WithTransportCredentials(a.creds),
grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)),
grpc.WithConnectParams(grpc.ConnectParams{
Backoff: backoffConfig,
// not published as a constant in gRPC library
Expand Down
33 changes: 24 additions & 9 deletions internal/app/apid/pkg/backend/apid_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,30 @@ package backend

import (
"crypto/tls"
"sync"

"github.com/siderolabs/gen/containers"
"github.com/siderolabs/grpc-proxy/proxy"
"google.golang.org/grpc/credentials"
)

// APIDFactory caches connection to apid instances by target.
//
// TODO: need to clean up idle connections from time to time.
type APIDFactory struct {
cache sync.Map
creds credentials.TransportCredentials
cache containers.SyncMap[string, *APID]
provider TLSConfigProvider
}

// TLSConfigProvider provides tls.Config for client connections.
//
// It is the dependency an APIDFactory is constructed with.
type TLSConfigProvider interface {
// ClientConfig returns the TLS configuration to use for client connections,
// or an error if it cannot be provided.
ClientConfig() (*tls.Config, error)
}

// NewAPIDFactory creates new APIDFactory with given tls.Config.
//
// Client TLS config is used to connect to other apid instances.
func NewAPIDFactory(config *tls.Config) *APIDFactory {
func NewAPIDFactory(provider TLSConfigProvider) *APIDFactory {
return &APIDFactory{
creds: credentials.NewTLS(config),
provider: provider,
}
}

Expand All @@ -35,10 +39,10 @@ func NewAPIDFactory(config *tls.Config) *APIDFactory {
func (factory *APIDFactory) Get(target string) (proxy.Backend, error) {
b, ok := factory.cache.Load(target)
if ok {
return b.(proxy.Backend), nil
return b, nil
}

backend, err := NewAPID(target, factory.creds)
backend, err := NewAPID(target, factory.provider.ClientConfig)
if err != nil {
return nil, err
}
Expand All @@ -48,8 +52,19 @@ func (factory *APIDFactory) Get(target string) (proxy.Backend, error) {
// race: another Get() call built different backend
backend.Close()

return existing.(proxy.Backend), nil
return existing, nil
}

return backend, nil
}

// Flush closes every backend currently held in the cache.
//
// Close is invoked on each cached backend in turn; the entries themselves are
// not removed from the cache by this method.
func (factory *APIDFactory) Flush() {
	closeCached := func(_ string, cached *APID) bool {
		cached.Close()

		// returning true asks Range to continue with the next entry
		return true
	}

	factory.cache.Range(closeCached)
}
Loading

0 comments on commit 8eacc4b

Please sign in to comment.