Skip to content

Commit

Permalink
feat: support hardware watchdog timers
Browse files Browse the repository at this point in the history
Only enabled when activated by config, disabled on shutdown/reboot

Fixes siderolabs#8284

Signed-off-by: Dmitry Sharshakov <[email protected]>
Signed-off-by: Dmitry Sharshakov <[email protected]>
Signed-off-by: Andrey Smirnov <[email protected]>
  • Loading branch information
dsseng authored and smira committed Mar 21, 2024
1 parent 84ec8c1 commit 352b871
Show file tree
Hide file tree
Showing 29 changed files with 1,615 additions and 26 deletions.
14 changes: 14 additions & 0 deletions api/resource/definitions/runtime/runtime.proto
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package talos.resource.definitions.runtime;
option go_package = "github.com/siderolabs/talos/pkg/machinery/api/resource/definitions/runtime";

import "common/common.proto";
import "google/protobuf/duration.proto";
import "resource/definitions/enums/enums.proto";

// DevicesStatusSpec is the spec for devices status.
Expand Down Expand Up @@ -126,3 +127,16 @@ message UnmetCondition {
string reason = 2;
}

// WatchdogTimerConfigSpec describes configuration of watchdog timer.
message WatchdogTimerConfigSpec {
// Path of the watchdog device to open, e.g. /dev/watchdog0.
string device = 1;
// Timeout programmed into the watchdog device.
google.protobuf.Duration timeout = 2;
}

// WatchdogTimerStatusSpec describes the status of the watchdog timer as reported by the controller.
message WatchdogTimerStatusSpec {
// Path of the watchdog device in use.
string device = 1;
// Timeout programmed into the watchdog device.
google.protobuf.Duration timeout = 2;
// Feed interval reported by the controller (it is computed as timeout / 3).
google.protobuf.Duration feed_interval = 3;
}

10 changes: 10 additions & 0 deletions hack/modules-amd64.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ kernel/drivers/ata/pata_marvell.ko
kernel/drivers/ata/pata_oldpiix.ko
kernel/drivers/ata/pata_sch.ko
kernel/drivers/block/nbd.ko
kernel/drivers/char/ipmi/ipmi_watchdog.ko
kernel/drivers/gpu/drm/amd/amdgpu/amdgpu.ko
kernel/drivers/gpu/drm/amd/amdxcp/amdxcp.ko
kernel/drivers/gpu/drm/display/drm_display_helper.ko
Expand Down Expand Up @@ -62,6 +63,8 @@ kernel/drivers/message/fusion/mptbase.ko
kernel/drivers/message/fusion/mptsas.ko
kernel/drivers/message/fusion/mptscsih.ko
kernel/drivers/message/fusion/mptspi.ko
kernel/drivers/mfd/lpc_ich.ko
kernel/drivers/mfd/mfd-core.ko
kernel/drivers/misc/hpilo.ko
kernel/drivers/mmc/host/sdhci_f_sdh30.ko
kernel/drivers/mmc/host/sdhci-acpi.ko
Expand Down Expand Up @@ -134,6 +137,13 @@ kernel/drivers/virtio/virtio_mmio.ko
kernel/drivers/virtio/virtio_pci_legacy_dev.ko
kernel/drivers/virtio/virtio_pci_modern_dev.ko
kernel/drivers/virtio/virtio_pci.ko
kernel/drivers/watchdog/i6300esb.ko
kernel/drivers/watchdog/iTCO_vendor_support.ko
kernel/drivers/watchdog/iTCO_wdt.ko
kernel/drivers/watchdog/sp5100_tco.ko
kernel/drivers/watchdog/watchdog.ko
kernel/drivers/watchdog/wdat_wdt.ko
kernel/drivers/watchdog/xen_wdt.ko
kernel/lib/objagg.ko
kernel/lib/parman.ko
kernel/lib/raid6/raid6_pq.ko
Expand Down
16 changes: 16 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,22 @@ machine:
servers:
- /dev/ptp0
```
"""

[notes.watchdog]
title = "Hardware Watchdog Timers"
description = """\
Talos Linux now supports configuring hardware watchdog timers.
If enabled, and the machine becomes unresponsive, the hardware watchdog will reset the machine.
The watchdog can be enabled with the following configuration document:
```yaml
apiVersion: v1alpha1
kind: WatchdogTimerConfig
device: /dev/watchdog0
timeout: 3m0s
```
"""

[make_deps]
Expand Down
174 changes: 174 additions & 0 deletions internal/app/machined/pkg/controllers/runtime/watchdog_timer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime

import (
"context"
"fmt"
"os"
"syscall"
"time"
"unsafe"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/gen/optional"
"go.uber.org/zap"
"golang.org/x/sys/unix"

"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// WatchdogTimerController manages the hardware watchdog timer: it opens the device
// named in WatchdogTimerConfig, programs the timeout and publishes WatchdogTimerStatus.
type WatchdogTimerController struct{}

// Name implements controller.Controller interface.
//
// It returns the fixed identifier of this controller.
func (ctrl *WatchdogTimerController) Name() string {
	const controllerName = "runtime.WatchdogTimerController"

	return controllerName
}

// Inputs implements controller.Controller interface.
//
// The controller depends on a single resource: the WatchdogTimerConfig
// with the well-known ID in the runtime namespace.
func (ctrl *WatchdogTimerController) Inputs() []controller.Input {
	watchdogConfig := controller.Input{
		Namespace: runtime.NamespaceName,
		Type:      runtime.WatchdogTimerConfigType,
		ID:        optional.Some(runtime.WatchdogTimerConfigID),
	}

	return []controller.Input{watchdogConfig}
}

// Outputs implements controller.Controller interface.
//
// The controller exclusively owns resources of type WatchdogTimerStatus.
func (ctrl *WatchdogTimerController) Outputs() []controller.Output {
	status := controller.Output{
		Type: runtime.WatchdogTimerStatusType,
		Kind: controller.OutputExclusive,
	}

	return []controller.Output{status}
}

// Run implements controller.Controller interface.
//
// On every WatchdogTimerConfig change it (re)opens the configured watchdog
// device, programs the timeout and starts a ticker which feeds the watchdog
// three times per timeout period. When the config disappears, or the
// controller exits, the ticker is stopped and the device is closed with the
// "magic close" sequence so that the watchdog is disarmed.
//
// It returns nil when ctx is cancelled, and an error if the config cannot be
// read, the device cannot be opened or programmed, feeding fails, or the
// status resource cannot be updated.
//
//nolint:gocyclo,cyclop
func (ctrl *WatchdogTimerController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	var (
		ticker  *time.Ticker
		tickerC <-chan time.Time
	)

	// tickerStop stops the feed ticker (if any) and resets tickerC to nil, so
	// that the select below blocks on that case instead of firing.
	tickerStop := func() {
		if ticker == nil {
			return
		}

		ticker.Stop()

		ticker = nil
		tickerC = nil
	}

	defer tickerStop()

	var wd *os.File

	// wdClose disarms and closes the currently opened watchdog device (if any).
	// Failures are only logged: there is nothing else to do on the close path.
	wdClose := func() {
		if wd == nil {
			return
		}

		logger.Info("closing hardware watchdog", zap.String("path", wd.Name()))

		// Magic close: make sure old watchdog won't trip after we close it
		if _, err := wd.WriteString("V"); err != nil {
			logger.Error("failed to send magic close to watchdog", zap.String("path", wd.Name()), zap.Error(err))
		}

		if err := wd.Close(); err != nil {
			logger.Error("failed to close watchdog", zap.String("path", wd.Name()), zap.Error(err))
		}

		wd = nil
	}

	defer wdClose()

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-tickerC:
			// tickerC is only non-nil while a watchdog device is open, so wd is valid here.
			if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_KEEPALIVE), 0); errno != 0 {
				return fmt.Errorf("failed to feed watchdog: %w", errno)
			}

			continue
		case <-r.EventCh():
		}

		cfg, err := safe.ReaderGetByID[*runtime.WatchdogTimerConfig](ctx, r, runtime.WatchdogTimerConfigID)
		if err != nil && !state.IsNotFoundError(err) {
			return fmt.Errorf("error getting watchdog config: %w", err)
		}

		r.StartTrackingOutputs()

		if cfg == nil {
			tickerStop()
			wdClose()
		} else {
			timeout := cfg.TypedSpec().Timeout

			// 3 pings per timeout should suffice in any case
			feedInterval := timeout / 3

			// time.NewTicker panics on a non-positive interval, so reject such a
			// timeout with a regular error before touching the device.
			if feedInterval <= 0 {
				return fmt.Errorf("watchdog timeout %s is too short", timeout)
			}

			// close the watchdog if requested to use new one
			if wd != nil && wd.Name() != cfg.TypedSpec().Device {
				wdClose()
			}

			if wd == nil {
				wd, err = os.OpenFile(cfg.TypedSpec().Device, syscall.O_RDWR, 0o600)
				if err != nil {
					return fmt.Errorf("failed to open watchdog device: %w", err)
				}

				logger.Info("opened hardware watchdog", zap.String("path", cfg.TypedSpec().Device))
			}

			// WDIOC_SETTIMEOUT takes a pointer to a C int (32 bits), in whole seconds.
			timeoutSec := int32(timeout.Seconds())

			if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_SETTIMEOUT), uintptr(unsafe.Pointer(&timeoutSec))); errno != 0 {
				return fmt.Errorf("failed to set watchdog timeout: %w", errno)
			}

			tickerStop()

			// Arm the periodic feed: without assigning tickerC the select above
			// never wakes up and the watchdog would reset the machine.
			ticker = time.NewTicker(feedInterval)
			tickerC = ticker.C

			// Feed once right away so the full timeout is available from now on.
			if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, wd.Fd(), uintptr(unix.WDIOC_KEEPALIVE), 0); errno != 0 {
				return fmt.Errorf("failed to feed watchdog: %w", errno)
			}

			logger.Info("set hardware watchdog timeout", zap.Duration("timeout", timeout), zap.Duration("feed_interval", feedInterval))

			if err = safe.WriterModify(ctx, r, runtime.NewWatchdogTimerStatus(cfg.Metadata().ID()), func(status *runtime.WatchdogTimerStatus) error {
				status.TypedSpec().Device = cfg.TypedSpec().Device
				status.TypedSpec().Timeout = timeout
				status.TypedSpec().FeedInterval = feedInterval

				return nil
			}); err != nil {
				return fmt.Errorf("error updating watchdog status: %w", err)
			}
		}

		if err = safe.CleanupOutputs[*runtime.WatchdogTimerStatus](ctx, r); err != nil {
			return err
		}
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime

import (
"context"
"fmt"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/gen/optional"
"go.uber.org/zap"

"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// WatchdogTimerConfigController generates configuration for watchdog timers.
//
// It reads the v1alpha1 machine config and writes the WatchdogTimerConfig resource.
type WatchdogTimerConfigController struct{}

// Name implements controller.Controller interface.
//
// It returns the fixed identifier of this controller.
func (ctrl *WatchdogTimerConfigController) Name() string {
	const controllerName = "runtime.WatchdogTimerConfigController"

	return controllerName
}

// Inputs implements controller.Controller interface.
//
// The only input is the v1alpha1 machine config, consumed as a weak input.
func (ctrl *WatchdogTimerConfigController) Inputs() []controller.Input {
	machineConfig := controller.Input{
		Namespace: config.NamespaceName,
		Type:      config.MachineConfigType,
		ID:        optional.Some(config.V1Alpha1ID),
		Kind:      controller.InputWeak,
	}

	return []controller.Input{machineConfig}
}

// Outputs implements controller.Controller interface.
//
// The controller exclusively owns resources of type WatchdogTimerConfig.
func (ctrl *WatchdogTimerConfigController) Outputs() []controller.Output {
	watchdogConfig := controller.Output{
		Type: runtime.WatchdogTimerConfigType,
		Kind: controller.OutputExclusive,
	}

	return []controller.Output{watchdogConfig}
}

// Run implements controller.Controller interface.
//
// On every input event it copies the watchdog timer section of the machine
// config (device and timeout) into the WatchdogTimerConfig resource. When the
// machine config or its watchdog section is absent nothing is written during
// the pass, and the output is left to safe.CleanupOutputs.
//
// It returns nil when ctx is cancelled, and an error if the machine config
// cannot be read or the output resource cannot be updated or cleaned up.
func (ctrl *WatchdogTimerConfigController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-r.EventCh():
		}

		// A missing machine config is not an error: cfg simply stays nil.
		cfg, err := safe.ReaderGetByID[*config.MachineConfig](ctx, r, config.V1Alpha1ID)
		if err != nil && !state.IsNotFoundError(err) {
			return fmt.Errorf("error getting machine config: %w", err)
		}

		r.StartTrackingOutputs()

		if cfg != nil {
			if watchdogConfig := cfg.Config().Runtime().WatchdogTimer(); watchdogConfig != nil {
				if err = safe.WriterModify(ctx, r, runtime.NewWatchdogTimerConfig(), func(res *runtime.WatchdogTimerConfig) error {
					res.TypedSpec().Device = watchdogConfig.Device()
					res.TypedSpec().Timeout = watchdogConfig.Timeout()

					return nil
				}); err != nil {
					return fmt.Errorf("error updating watchdog timer config: %w", err)
				}
			}
		}

		if err = safe.CleanupOutputs[*runtime.WatchdogTimerConfig](ctx, r); err != nil {
			return err
		}
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package runtime_test

import (
"testing"

"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/resource/rtestutils"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"

"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/ctest"
runtimectrls "github.com/siderolabs/talos/internal/app/machined/pkg/controllers/runtime"
"github.com/siderolabs/talos/pkg/machinery/config/container"
runtimecfg "github.com/siderolabs/talos/pkg/machinery/config/types/runtime"
"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)

// WatchdogTimerConfigSuite is the test suite for WatchdogTimerConfigController, built on ctest.DefaultSuite.
type WatchdogTimerConfigSuite struct {
ctest.DefaultSuite
}

func TestWatchdogTimerConfigSuite(t *testing.T) {
suite.Run(t, new(WatchdogTimerConfigSuite))
}

// TestWatchdogTimerConfigNone checks that no WatchdogTimerConfig resource exists
// when the controller runs without any machine config.
func (suite *WatchdogTimerConfigSuite) TestWatchdogTimerConfigNone() {
	ctrl := &runtimectrls.WatchdogTimerConfigController{}

	err := suite.Runtime().RegisterController(ctrl)
	suite.Require().NoError(err)

	rtestutils.AssertNoResource[*runtime.WatchdogTimerConfig](suite.Ctx(), suite.T(), suite.State(), runtime.WatchdogTimerConfigID)
}

// TestWatchdogTimerConfigMachineConfig checks that a machine config carrying a
// watchdog timer document produces a WatchdogTimerConfig resource with the
// configured device and the default timeout.
func (suite *WatchdogTimerConfigSuite) TestWatchdogTimerConfigMachineConfig() {
	ctrl := &runtimectrls.WatchdogTimerConfigController{}
	suite.Require().NoError(suite.Runtime().RegisterController(ctrl))

	const device = "/dev/watchdog0"

	cfg, err := container.New(&runtimecfg.WatchdogTimerV1Alpha1{WatchdogDevice: device})
	suite.Require().NoError(err)

	machineConfig := config.NewMachineConfig(cfg)
	suite.Require().NoError(suite.State().Create(suite.Ctx(), machineConfig))

	expectedIDs := []resource.ID{runtime.WatchdogTimerConfigID}

	rtestutils.AssertResources[*runtime.WatchdogTimerConfig](suite.Ctx(), suite.T(), suite.State(), expectedIDs,
		func(res *runtime.WatchdogTimerConfig, asrt *assert.Assertions) {
			spec := res.TypedSpec()

			asrt.Equal(device, spec.Device)
			asrt.Equal(runtimecfg.DefaultWatchdogTimeout, spec.Timeout)
		})
}
Loading

0 comments on commit 352b871

Please sign in to comment.