-
Notifications
You must be signed in to change notification settings - Fork 2.4k
/
commander.go
179 lines (142 loc) · 4.38 KB
/
commander.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package commander
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
"sync/atomic"
"syscall"
"time"
"go.uber.org/zap"
"github.com/open-telemetry/opentelemetry-collector-contrib/cmd/opampsupervisor/supervisor/config"
)
// Commander can start/stop/restart the Agent executable and also watch for a signal
// for the Agent process to finish.
type Commander struct {
logger *zap.Logger
cfg *config.Agent
args []string
cmd *exec.Cmd
doneCh chan struct{}
running *atomic.Int64
}
func NewCommander(logger *zap.Logger, cfg *config.Agent, args ...string) (*Commander, error) {
if cfg.Executable == "" {
return nil, errors.New("agent.executable config option must be specified")
}
return &Commander{
logger: logger,
cfg: cfg,
args: args,
running: &atomic.Int64{},
}, nil
}
// Start the Agent and begin watching the process.
// Agent's stdout and stderr are written to a file.
// Calling this method when a command is already running
// is a no-op.
func (c *Commander) Start(ctx context.Context) error {
if c.running.Load() == 1 {
// Already started, nothing to do
return nil
}
c.logger.Debug("Starting agent", zap.String("agent", c.cfg.Executable))
logFilePath := "agent.log"
logFile, err := os.Create(logFilePath)
if err != nil {
return fmt.Errorf("cannot create %s: %w", logFilePath, err)
}
c.cmd = exec.CommandContext(ctx, c.cfg.Executable, c.args...) // #nosec G204
// Capture standard output and standard error.
// https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/21072
c.cmd.Stdout = logFile
c.cmd.Stderr = logFile
c.doneCh = make(chan struct{})
if err := c.cmd.Start(); err != nil {
return err
}
c.logger.Debug("Agent process started", zap.Int("pid", c.cmd.Process.Pid))
c.running.Store(1)
go c.watch()
return nil
}
func (c *Commander) Restart(ctx context.Context) error {
if err := c.Stop(ctx); err != nil {
return err
}
return c.Start(ctx)
}
func (c *Commander) watch() {
err := c.cmd.Wait()
// cmd.Wait returns an exec.ExitError when the Collector exits unsuccessfully or stops
// after receiving a signal. The Commander caller will handle these cases, so we filter
// them out here.
var exitError *exec.ExitError
if ok := errors.As(err, &exitError); err != nil && !ok {
c.logger.Error("An error occurred while watching the agent process", zap.Error(err))
}
c.running.Store(0)
close(c.doneCh)
}
// Done returns a channel that will send a signal when the Agent process is finished.
func (c *Commander) Done() <-chan struct{} {
return c.doneCh
}
// Pid returns Agent process PID if it is started or 0 if it is not.
func (c *Commander) Pid() int {
if c.cmd == nil || c.cmd.Process == nil {
return 0
}
return c.cmd.Process.Pid
}
// ExitCode returns Agent process exit code if it exited or 0 if it is not.
func (c *Commander) ExitCode() int {
if c.cmd == nil || c.cmd.ProcessState == nil {
return 0
}
return c.cmd.ProcessState.ExitCode()
}
func (c *Commander) IsRunning() bool {
return c.running.Load() != 0
}
// Stop the Agent process. Sends SIGTERM to the process and wait for up 10 seconds
// and if the process does not finish kills it forcedly by sending SIGKILL.
// Returns after the process is terminated.
func (c *Commander) Stop(ctx context.Context) error {
if c.running.Load() == 0 {
// Not started, nothing to do.
return nil
}
c.logger.Debug("Stopping agent process", zap.Int("pid", c.cmd.Process.Pid))
// Gracefully signal process to stop.
if err := c.cmd.Process.Signal(syscall.SIGTERM); err != nil {
return err
}
waitCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
// Setup a goroutine to wait a while for process to finish and send kill signal
// to the process if it doesn't finish.
var innerErr error
go func() {
<-waitCtx.Done()
if !errors.Is(waitCtx.Err(), context.DeadlineExceeded) {
c.logger.Debug("Agent process successfully stopped.", zap.Int("pid", c.cmd.Process.Pid))
return
}
// Time is out. Kill the process.
c.logger.Debug(
"Agent process is not responding to SIGTERM. Sending SIGKILL to kill forcedly.",
zap.Int("pid", c.cmd.Process.Pid))
if innerErr = c.cmd.Process.Signal(syscall.SIGKILL); innerErr != nil {
return
}
}()
// Wait for process to terminate
<-c.doneCh
c.running.Store(0)
// Let goroutine know process is finished.
cancel()
return innerErr
}