From 019948acbe40797785b4d7c1bf482fc3dfa18932 Mon Sep 17 00:00:00 2001 From: Gabriel Adrian Samfira Date: Sat, 26 Aug 2023 19:40:01 +0000 Subject: [PATCH] Add JIT config as part of instance create We must create the DB entry for a runner with a JIT config included. Adding it later via an update runs the risk of having the consolidate loop pick up the incomplete instance. Signed-off-by: Gabriel Adrian Samfira --- database/sql/instances.go | 1 + params/requests.go | 3 +- runner/pool/enterprise.go | 12 +----- runner/pool/interfaces.go | 2 +- runner/pool/organization.go | 12 +----- runner/pool/pool.go | 77 ++++++++++++++++--------------------- runner/pool/repository.go | 12 +----- 7 files changed, 43 insertions(+), 76 deletions(-) diff --git a/database/sql/instances.go b/database/sql/instances.go index 0c8e4a48..608d4fa6 100644 --- a/database/sql/instances.go +++ b/database/sql/instances.go @@ -82,6 +82,7 @@ func (s *sqlDatabase) CreateInstance(ctx context.Context, poolID string, param p GitHubRunnerGroup: param.GitHubRunnerGroup, JitConfiguration: secret, AditionalLabels: labels, + AgentID: param.AgentID, } q := s.conn.Create(&newInstance) if q.Error != nil { diff --git a/params/requests.go b/params/requests.go index 0293223e..74bfeb50 100644 --- a/params/requests.go +++ b/params/requests.go @@ -136,7 +136,8 @@ type CreateInstanceParams struct { // GithubRunnerGroup is the github runner group to which the runner belongs. // The runner group must be created by someone with access to the enterprise. GitHubRunnerGroup string - CreateAttempt int `json:"-"` + CreateAttempt int `json:"-"` + AgentID int64 `json:"-"` AditionalLabels []string JitConfiguration map[string]string } diff --git a/runner/pool/enterprise.go b/runner/pool/enterprise.go index 904fa841..b7adad76 100644 --- a/runner/pool/enterprise.go +++ b/runner/pool/enterprise.go @@ -104,15 +104,7 @@ func (r *enterprise) findRunnerGroupByName(ctx context.Context, name string) (*g return nil, errors.Wrap(runnerErrors.ErrNotFound, "runner group not found") } -func (r *enterprise) GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) { - if instance.AgentID != 0 { - return nil, nil, fmt.Errorf("instance already has an agent ID: %w", runnerErrors.ErrBadRequest) - } - - if instance.JitConfiguration != nil { - return nil, nil, fmt.Errorf("instance already has a JIT configuration: %w", runnerErrors.ErrBadRequest) - } - +func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) { var rg int64 = 1 if pool.GitHubRunnerGroup != "" { runnerGroup, err := r.findRunnerGroupByName(ctx, pool.GitHubRunnerGroup) @@ -123,7 +115,7 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance params.Instance, } req := github.GenerateJITConfigRequest{ - Name: instance.Name, + Name: instance, RunnerGroupID: rg, Labels: labels, // TODO(gabriel-samfira): Should we make this configurable? diff --git a/runner/pool/interfaces.go b/runner/pool/interfaces.go index d40fc39e..f981992b 100644 --- a/runner/pool/interfaces.go +++ b/runner/pool/interfaces.go @@ -37,7 +37,7 @@ type poolHelper interface { GithubCLI() common.GithubClient - GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (map[string]string, *github.Runner, error) + GetJITConfig(ctx context.Context, instanceName string, pool params.Pool, labels []string) (map[string]string, *github.Runner, error) FetchDbInstances() ([]params.Instance, error) ListPools() ([]params.Pool, error) diff --git a/runner/pool/organization.go b/runner/pool/organization.go index 5e88f73e..539af39a 100644 --- a/runner/pool/organization.go +++ b/runner/pool/organization.go @@ -116,15 +116,7 @@ func (r *organization) findRunnerGroupByName(ctx context.Context, name string) ( return nil, errors.Wrap(runnerErrors.ErrNotFound, "runner group not found") } -func (r *organization) GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) { - if instance.AgentID != 0 { - return nil, nil, fmt.Errorf("instance already has an agent ID: %w", runnerErrors.ErrBadRequest) - } - - if instance.JitConfiguration != nil { - return nil, nil, fmt.Errorf("instance already has a JIT configuration: %w", runnerErrors.ErrBadRequest) - } - +func (r *organization) GetJITConfig(ctx context.Context, instance string, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) { var rg int64 = 1 if pool.GitHubRunnerGroup != "" { runnerGroup, err := r.findRunnerGroupByName(ctx, pool.GitHubRunnerGroup) @@ -135,7 +127,7 @@ func (r *organization) GetJITConfig(ctx context.Context, instance params.Instanc } req := github.GenerateJITConfigRequest{ - Name: instance.Name, + Name: instance, RunnerGroupID: rg, Labels: labels, // TODO(gabriel-samfira): Should we make this configurable? diff --git a/runner/pool/pool.go b/runner/pool/pool.go index 842747d4..4ca49c4e 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -388,11 +388,18 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne continue } + pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID) + if err != nil { + return errors.Wrap(err, "fetching instance pool info") + } + switch instance.RunnerStatus { case params.RunnerPending, params.RunnerInstalling: - // runner is still installing. We give it a chance to finish. - r.log("runner %s is still installing, give it a chance to finish", instance.Name) - continue + if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) { + // runner is still installing. We give it a chance to finish. + r.log("runner %s is still installing, give it a chance to finish", instance.Name) + continue + } } if time.Since(instance.UpdatedAt).Minutes() < 5 { @@ -451,20 +458,7 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error { // * The runner never joined github within the pool timeout // * The runner managed to join github, but the setup process failed later and the runner // never started on the instance. - // - // There are several steps in the user data that sets up the runner: - // * Download and unarchive the runner from github (or used the cached version) - // * Configure runner (connects to github). At this point the runner is seen as offline. - // * Install the service - // * Set SELinux context (if SELinux is enabled) - // * Start the service (if successful, the runner will transition to "online") - // * Get the runner ID - // - // If we fail getting the runner ID after it's started, garm will set the runner status to "failed", - // even though, technically the runner is online and fully functional. This is why we check here for - // both the runner status as reported by GitHub and the runner status as reported by the provider. - // If the runner is "offline" and marked as "failed", it should be safe to reap it. - if runner, ok := runnersByName[instance.Name]; !ok || (runner.GetStatus() == "offline" && instance.RunnerStatus == params.RunnerFailed) { + if runner, ok := runnersByName[instance.Name]; !ok || runner.GetStatus() == "offline" { r.log("reaping timed-out/failed runner %s", instance.Name) if err := r.ForceDeleteRunner(instance); err != nil { r.log("failed to update runner %s status: %s", instance.Name, err) @@ -699,6 +693,12 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona } name := fmt.Sprintf("%s-%s", pool.GetRunnerPrefix(), util.NewID()) + labels := r.getLabelsForInstance(pool) + // Attempt to create JIT config + jitConfig, runner, err := r.helper.GetJITConfig(ctx, name, pool, labels) + if err != nil { + r.log("failed to get JIT config, falling back to registration token: %s", err) + } createParams := params.CreateInstanceParams{ Name: name, @@ -711,6 +711,11 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona CreateAttempt: 1, GitHubRunnerGroup: pool.GitHubRunnerGroup, AditionalLabels: aditionalLabels, + JitConfiguration: jitConfig, + } + + if runner != nil { + createParams.AgentID = runner.GetID() } instance, err := r.store.CreateInstance(r.ctx, poolID, createParams) @@ -719,37 +724,21 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona } defer func() { - if err != nil && instance.ID != "" { - if err := r.ForceDeleteRunner(instance); err != nil { - r.log("failed to cleanup instance: %s", instance.Name) + if err != nil { + if instance.ID != "" { + if err := r.ForceDeleteRunner(instance); err != nil { + r.log("failed to cleanup instance: %s", instance.Name) + } } - } - }() - labels := r.getLabelsForInstance(pool) - // Attempt to create JIT config - jitConfig, runner, err := r.helper.GetJITConfig(ctx, instance, pool, labels) - if err == nil { - updateParams := params.UpdateInstanceParams{ - AgentID: runner.GetID(), - // We're using a JIT config. Setting the TokenFetched will disable the registration token - // metadata endpoint. - TokenFetched: github.Bool(true), - JitConfiguration: jitConfig, - } - instance, err = r.updateInstance(instance.Name, updateParams) - if err != nil { - // The agent ID is not recorded in the instance, so the deferred ForceDeleteRunner() will not - // attempt to clean it up. We need to do it here. - _, runnerCleanupErr := r.helper.RemoveGithubRunner(runner.GetID()) - if err != nil { - log.Printf("failed to remove runner %d: %s", runner.GetID(), runnerCleanupErr) + if runner != nil { + _, runnerCleanupErr := r.helper.RemoveGithubRunner(runner.GetID()) + if err != nil { + r.log("failed to remove runner %d: %s", runner.GetID(), runnerCleanupErr) + } } - return errors.Wrap(err, "updating instance") } - } else { - r.log("failed to get JIT config, falling back to registration token: %s", err) - } + }() return nil } diff --git a/runner/pool/repository.go b/runner/pool/repository.go index e09081d3..c74452de 100644 --- a/runner/pool/repository.go +++ b/runner/pool/repository.go @@ -88,17 +88,9 @@ type repository struct { mux sync.Mutex } -func (r *repository) GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) { - if instance.AgentID != 0 { - return nil, nil, fmt.Errorf("instance already has an agent ID: %w", runnerErrors.ErrBadRequest) - } - - if instance.JitConfiguration != nil { - return nil, nil, fmt.Errorf("instance already has a JIT configuration: %w", runnerErrors.ErrBadRequest) - } - +func (r *repository) GetJITConfig(ctx context.Context, instance string, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) { req := github.GenerateJITConfigRequest{ - Name: instance.Name, + Name: instance, // At the repository level we only have the default runner group. RunnerGroupID: 1, Labels: labels,