diff --git a/apiserver/controllers/instances.go b/apiserver/controllers/instances.go index c9ef6533..dd1ad284 100644 --- a/apiserver/controllers/instances.go +++ b/apiserver/controllers/instances.go @@ -18,6 +18,7 @@ import ( "encoding/json" "log" "net/http" + "strconv" gErrors "github.com/cloudbase/garm-provider-common/errors" "github.com/cloudbase/garm/apiserver/params" @@ -121,6 +122,12 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques // in: path // required: true // +// + name: forceRemove +// description: If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database. +// type: boolean +// in: query +// required: false +// // Responses: // default: APIErrorResponse func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Request) { @@ -138,7 +145,8 @@ func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Req return } - if err := a.r.ForceDeleteRunner(ctx, instanceName); err != nil { + forceRemove, _ := strconv.ParseBool(r.URL.Query().Get("forceRemove")) + if err := a.r.DeleteRunner(ctx, instanceName, forceRemove); err != nil { log.Printf("removing runner: %s", err) handleError(w, err) return diff --git a/apiserver/swagger.yaml b/apiserver/swagger.yaml index 1d5f33b1..2d269391 100644 --- a/apiserver/swagger.yaml +++ b/apiserver/swagger.yaml @@ -597,6 +597,10 @@ paths: name: instanceName required: true type: string + - description: If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database. + in: query + name: forceRemove + type: boolean responses: default: description: APIErrorResponse diff --git a/client/instances/delete_instance_parameters.go b/client/instances/delete_instance_parameters.go index 34cb6eee..d4f9695c 100644 --- a/client/instances/delete_instance_parameters.go +++ b/client/instances/delete_instance_parameters.go @@ -14,6 +14,7 @@ import ( "github.com/go-openapi/runtime" cr "github.com/go-openapi/runtime/client" "github.com/go-openapi/strfmt" + "github.com/go-openapi/swag" ) // NewDeleteInstanceParams creates a new DeleteInstanceParams object, @@ -61,6 +62,12 @@ DeleteInstanceParams contains all the parameters to send to the API endpoint */ type DeleteInstanceParams struct { + /* ForceRemove. + + If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database. + */ + ForceRemove *bool + /* InstanceName. Runner instance name. @@ -120,6 +127,17 @@ func (o *DeleteInstanceParams) SetHTTPClient(client *http.Client) { o.HTTPClient = client } +// WithForceRemove adds the forceRemove to the delete instance params +func (o *DeleteInstanceParams) WithForceRemove(forceRemove *bool) *DeleteInstanceParams { + o.SetForceRemove(forceRemove) + return o +} + +// SetForceRemove adds the forceRemove to the delete instance params +func (o *DeleteInstanceParams) SetForceRemove(forceRemove *bool) { + o.ForceRemove = forceRemove +} + // WithInstanceName adds the instanceName to the delete instance params func (o *DeleteInstanceParams) WithInstanceName(instanceName string) *DeleteInstanceParams { o.SetInstanceName(instanceName) @@ -139,6 +157,23 @@ func (o *DeleteInstanceParams) WriteToRequest(r runtime.ClientRequest, reg strfm } var res []error + if o.ForceRemove != nil { + + // query param forceRemove + var qrForceRemove bool + + if o.ForceRemove != nil { + qrForceRemove = *o.ForceRemove + } + qForceRemove := swag.FormatBool(qrForceRemove) + if qForceRemove != "" { + + if err := r.SetQueryParam("forceRemove", qForceRemove); err != nil { + return err + } + } + } + // path param instanceName if err := r.SetPathParam("instanceName", o.InstanceName); err != nil { return err diff --git a/cmd/garm-cli/cmd/runner.go b/cmd/garm-cli/cmd/runner.go index 0d7971bd..5b7d07c5 100644 --- a/cmd/garm-cli/cmd/runner.go +++ b/cmd/garm-cli/cmd/runner.go @@ -187,12 +187,9 @@ to either cancel the workflow or wait for it to finish. return fmt.Errorf("requires a runner name") } - if !forceRemove { - return fmt.Errorf("use --force-remove-runner=true to remove a runner") - } - deleteInstanceReq := apiClientInstances.NewDeleteInstanceParams() deleteInstanceReq.InstanceName = args[0] + deleteInstanceReq.ForceRemove = &forceRemove if err := apiCli.Instances.DeleteInstance(deleteInstanceReq, authToken); err != nil { return err } @@ -207,7 +204,7 @@ func init() { runnerListCmd.Flags().BoolVarP(&runnerAll, "all", "a", false, "List all runners, regardless of org or repo.") runnerListCmd.MarkFlagsMutuallyExclusive("repo", "org", "enterprise", "all") - runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Confirm you want to delete a runner") + runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Forcefully remove a runner. If set to true, GARM will ignore provider errors when removing the runner.") runnerDeleteCmd.MarkFlagsMutuallyExclusive("force-remove-runner") runnerCmd.AddCommand( diff --git a/database/common/mocks/Store.go b/database/common/mocks/Store.go index da21f1a7..7d9f58fa 100644 --- a/database/common/mocks/Store.go +++ b/database/common/mocks/Store.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -1578,12 +1578,13 @@ func (_m *Store) UpdateUser(ctx context.Context, user string, param params.Updat return r0, r1 } -// NewStore creates a new instance of Store. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewStore(t interface { +type mockConstructorTestingTNewStore interface { mock.TestingT Cleanup(func()) -}) *Store { +} + +// NewStore creates a new instance of Store. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewStore(t mockConstructorTestingTNewStore) *Store { mock := &Store{} mock.Mock.Test(t) diff --git a/go.mod b/go.mod index 367821c6..2b8f49c0 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.20 require ( github.com/BurntSushi/toml v1.2.1 - github.com/cloudbase/garm-provider-common v0.1.0 + github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700 github.com/go-openapi/errors v0.20.4 github.com/go-openapi/runtime v0.26.0 github.com/go-openapi/strfmt v0.21.7 diff --git a/go.sum b/go.sum index 596df8af..dbd89cb8 100644 --- a/go.sum +++ b/go.sum @@ -22,8 +22,8 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04= github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/cloudbase/garm-provider-common v0.1.0 h1:gc2n8nsLjt7G3InAfqZ+75iZjSIUkIx86d6/DFA2+jc= -github.com/cloudbase/garm-provider-common v0.1.0/go.mod h1:igxJRT3OlykERYc6ssdRQXcb+BCaeSfnucg6I0OSoDc= +github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700 h1:ZCJ1zZ2WI/37ffzpRsu7t5zzShAMThhYsXw7bBNKBR0= +github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700/go.mod h1:igxJRT3OlykERYc6ssdRQXcb+BCaeSfnucg6I0OSoDc= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= diff --git a/runner/common/mocks/GithubClient.go b/runner/common/mocks/GithubClient.go index b8f450e7..6bfec1d8 100644 --- a/runner/common/mocks/GithubClient.go +++ b/runner/common/mocks/GithubClient.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -730,12 +730,13 @@ func (_m *GithubClient) RemoveRunner(ctx context.Context, owner string, repo str return r0, r1 } -// NewGithubClient creates a new instance of GithubClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewGithubClient(t interface { +type mockConstructorTestingTNewGithubClient interface { mock.TestingT Cleanup(func()) -}) *GithubClient { +} + +// NewGithubClient creates a new instance of GithubClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewGithubClient(t mockConstructorTestingTNewGithubClient) *GithubClient { mock := &GithubClient{} mock.Mock.Test(t) diff --git a/runner/common/mocks/GithubEnterpriseClient.go b/runner/common/mocks/GithubEnterpriseClient.go index 2fc1442a..c53e775b 100644 --- a/runner/common/mocks/GithubEnterpriseClient.go +++ b/runner/common/mocks/GithubEnterpriseClient.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -215,12 +215,13 @@ func (_m *GithubEnterpriseClient) RemoveRunner(ctx context.Context, enterprise s return r0, r1 } -// NewGithubEnterpriseClient creates a new instance of GithubEnterpriseClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewGithubEnterpriseClient(t interface { +type mockConstructorTestingTNewGithubEnterpriseClient interface { mock.TestingT Cleanup(func()) -}) *GithubEnterpriseClient { +} + +// NewGithubEnterpriseClient creates a new instance of GithubEnterpriseClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewGithubEnterpriseClient(t mockConstructorTestingTNewGithubEnterpriseClient) *GithubEnterpriseClient { mock := &GithubEnterpriseClient{} mock.Mock.Test(t) diff --git a/runner/common/mocks/OrganizationHooks.go b/runner/common/mocks/OrganizationHooks.go index 7d0428d0..dffe6a66 100644 --- a/runner/common/mocks/OrganizationHooks.go +++ b/runner/common/mocks/OrganizationHooks.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -171,12 +171,13 @@ func (_m *OrganizationHooks) PingOrgHook(ctx context.Context, org string, id int return r0, r1 } -// NewOrganizationHooks creates a new instance of OrganizationHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewOrganizationHooks(t interface { +type mockConstructorTestingTNewOrganizationHooks interface { mock.TestingT Cleanup(func()) -}) *OrganizationHooks { +} + +// NewOrganizationHooks creates a new instance of OrganizationHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewOrganizationHooks(t mockConstructorTestingTNewOrganizationHooks) *OrganizationHooks { mock := &OrganizationHooks{} mock.Mock.Test(t) diff --git a/runner/common/mocks/PoolManager.go b/runner/common/mocks/PoolManager.go index 949dfc55..9ff34758 100644 --- a/runner/common/mocks/PoolManager.go +++ b/runner/common/mocks/PoolManager.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -14,6 +14,20 @@ type PoolManager struct { mock.Mock } +// DeleteRunner provides a mock function with given fields: runner, forceRemove +func (_m *PoolManager) DeleteRunner(runner params.Instance, forceRemove bool) error { + ret := _m.Called(runner, forceRemove) + + var r0 error + if rf, ok := ret.Get(0).(func(params.Instance, bool) error); ok { + r0 = rf(runner, forceRemove) + } else { + r0 = ret.Error(0) + } + + return r0 +} + // ForceDeleteRunner provides a mock function with given fields: runner func (_m *PoolManager) ForceDeleteRunner(runner params.Instance) error { ret := _m.Called(runner) @@ -250,12 +264,13 @@ func (_m *PoolManager) WebhookSecret() string { return r0 } -// NewPoolManager creates a new instance of PoolManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewPoolManager(t interface { +type mockConstructorTestingTNewPoolManager interface { mock.TestingT Cleanup(func()) -}) *PoolManager { +} + +// NewPoolManager creates a new instance of PoolManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewPoolManager(t mockConstructorTestingTNewPoolManager) *PoolManager { mock := &PoolManager{} mock.Mock.Test(t) diff --git a/runner/common/mocks/Provider.go b/runner/common/mocks/Provider.go index e5157e0f..53ee2c95 100644 --- a/runner/common/mocks/Provider.go +++ b/runner/common/mocks/Provider.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -160,12 +160,13 @@ func (_m *Provider) Stop(ctx context.Context, instance string, force bool) error return r0 } -// NewProvider creates a new instance of Provider. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewProvider(t interface { +type mockConstructorTestingTNewProvider interface { mock.TestingT Cleanup(func()) -}) *Provider { +} + +// NewProvider creates a new instance of Provider. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewProvider(t mockConstructorTestingTNewProvider) *Provider { mock := &Provider{} mock.Mock.Test(t) diff --git a/runner/common/mocks/RepositoryHooks.go b/runner/common/mocks/RepositoryHooks.go index 67ffe3e0..5706d4fc 100644 --- a/runner/common/mocks/RepositoryHooks.go +++ b/runner/common/mocks/RepositoryHooks.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -171,12 +171,13 @@ func (_m *RepositoryHooks) PingRepoHook(ctx context.Context, owner string, repo return r0, r1 } -// NewRepositoryHooks creates a new instance of RepositoryHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewRepositoryHooks(t interface { +type mockConstructorTestingTNewRepositoryHooks interface { mock.TestingT Cleanup(func()) -}) *RepositoryHooks { +} + +// NewRepositoryHooks creates a new instance of RepositoryHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewRepositoryHooks(t mockConstructorTestingTNewRepositoryHooks) *RepositoryHooks { mock := &RepositoryHooks{} mock.Mock.Test(t) diff --git a/runner/common/pool.go b/runner/common/pool.go index b44488ee..caf1da38 100644 --- a/runner/common/pool.go +++ b/runner/common/pool.go @@ -38,22 +38,54 @@ const ( //go:generate mockery --all type PoolManager interface { + // ID returns the ID of the entity (repo, org, enterprise) ID() string + // WebhookSecret returns the unencrypted webhook secret associated with the webhook installed + // in GitHub for GARM. For GARM to receive webhook events for an entity, either the operator or + // GARM will have to create a webhook in GitHub which points to the GARM API server. To authenticate + // the webhook, a webhook secret is used. This function returns that secret. WebhookSecret() string + // GithubRunnerRegistrationToken returns a new registration token for a github runner. This is used + // for GHES installations that have not yet upgraded to a version >= 3.10. Starting with 3.10, we use + // just-in-time runners, which no longer require exposing a runner registration token. GithubRunnerRegistrationToken() (string, error) + // HandleWorkflowJob handles a workflow job meant for a particular entity. When a webhook is fired for + // a repo, org or enterprise, we determine the destination of that webhook, retrieve the pool manager + // for it and call this function with the WorkflowJob as a parameter. HandleWorkflowJob(job params.WorkflowJob) error + // RefreshState allows us to update webhook secrets and configuration for a pool manager. RefreshState(param params.UpdatePoolStateParams) error + // ForceDeleteRunner will attempt to remove a runner from the pool. + // + // Deprecated: FunctionName is deprecated. Use DeleteRunner instead. ForceDeleteRunner(runner params.Instance) error + // DeleteRunner will attempt to remove a runner from the pool. If forceRemove is true, any error + // received from the provider will be ignored and we will procede to remove the runner from the database. + // An error received while attempting to remove from GitHub (other than 404) will still stop the deletion + // process. This can happen if the runner is already processing a job. At which point, you can simply cancel + // the job in github. Doing so will prompt GARM to reap the runner automatically. + DeleteRunner(runner params.Instance, forceRemove bool) error + + // InstallWebhook will create a webhook in github for the entity associated with this pool manager. InstallWebhook(ctx context.Context, param params.InstallWebhookParams) (params.HookInfo, error) + // GetWebhookInfo will return information about the webhook installed in github for the entity associated GetWebhookInfo(ctx context.Context) (params.HookInfo, error) + // UninstallWebhook will remove the webhook installed in github for the entity associated with this pool manager. UninstallWebhook(ctx context.Context) error + // RootCABundle will return a CA bundle that must be installed on all runners in order to properly validate + // x509 certificates used by various systems involved. This CA bundle is defined in the GARM config file and + // can include multiple CA certificates for the GARM api server, GHES server and any provider API endpoint that + // may use internal or self signed certificates. RootCABundle() (params.CertificateBundle, error) - // PoolManager lifecycle functions. Start/stop pool. + // Start will start the pool manager and all associated workers. Start() error + // Stop will stop the pool manager and all associated workers. Stop() error + // Status will return the current status of the pool manager. Status() params.PoolManagerStatus + // Wait will block until the pool manager has stopped. Wait() error } diff --git a/runner/mocks/PoolManagerController.go b/runner/mocks/PoolManagerController.go index d422508a..6806623a 100644 --- a/runner/mocks/PoolManagerController.go +++ b/runner/mocks/PoolManagerController.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.28.1. DO NOT EDIT. package mocks @@ -373,12 +373,13 @@ func (_m *PoolManagerController) UpdateRepoPoolManager(ctx context.Context, repo return r0, r1 } -// NewPoolManagerController creates a new instance of PoolManagerController. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -// The first argument is typically a *testing.T value. -func NewPoolManagerController(t interface { +type mockConstructorTestingTNewPoolManagerController interface { mock.TestingT Cleanup(func()) -}) *PoolManagerController { +} + +// NewPoolManagerController creates a new instance of PoolManagerController. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewPoolManagerController(t mockConstructorTestingTNewPoolManagerController) *PoolManagerController { mock := &PoolManagerController{} mock.Mock.Test(t) diff --git a/runner/pool/pool.go b/runner/pool/pool.go index 7f6b07ca..adc46af7 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -381,7 +381,7 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne switch commonParams.InstanceStatus(instance.Status) { case commonParams.InstancePendingCreate, - commonParams.InstancePendingDelete: + commonParams.InstancePendingDelete, commonParams.InstancePendingForceDelete: // this instance is in the process of being created or is awaiting deletion. // Instances in pending_create did not get a chance to register themselves in, // github so we let them be for now. @@ -454,13 +454,14 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error { continue } - // There are 2 cases (currently) where we consider a runner as timed out: + // There are 3 cases (currently) where we consider a runner as timed out: // * The runner never joined github within the pool timeout // * The runner managed to join github, but the setup process failed later and the runner // never started on the instance. + // * A JIT config was created, but the runner never joined github. if runner, ok := runnersByName[instance.Name]; !ok || runner.GetStatus() == "offline" { r.log("reaping timed-out/failed runner %s", instance.Name) - if err := r.ForceDeleteRunner(instance); err != nil { + if err := r.DeleteRunner(instance, false); err != nil { r.log("failed to update runner %s status: %s", instance.Name, err) return errors.Wrap(err, "updating runner") } @@ -726,7 +727,7 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona defer func() { if err != nil { if instance.ID != "" { - if err := r.ForceDeleteRunner(instance); err != nil { + if err := r.DeleteRunner(instance, false); err != nil { r.log("failed to cleanup instance: %s", instance.Name) } } @@ -1030,7 +1031,7 @@ func (r *basePoolManager) scaleDownOnePool(ctx context.Context, pool params.Pool g.Go(func() error { r.log("scaling down idle worker %s from pool %s\n", instanceToDelete.Name, pool.ID) - if err := r.ForceDeleteRunner(instanceToDelete); err != nil { + if err := r.DeleteRunner(instanceToDelete, false); err != nil { return fmt.Errorf("failed to delete instance %s: %w", instanceToDelete.ID, err) } return nil @@ -1272,7 +1273,7 @@ func (r *basePoolManager) deletePendingInstances() error { r.log("removing instances in pending_delete") for _, instance := range instances { - if instance.Status != commonParams.InstancePendingDelete { + if instance.Status != commonParams.InstancePendingDelete && instance.Status != commonParams.InstancePendingForceDelete { // not in pending_delete status. Skip. continue } @@ -1284,6 +1285,7 @@ func (r *basePoolManager) deletePendingInstances() error { continue } + currentStatus := instance.Status // Set the status to deleting before launching the goroutine that removes // the runner from the provider (which can take a long time). if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceDeleting, nil); err != nil { @@ -1300,9 +1302,9 @@ func (r *basePoolManager) deletePendingInstances() error { defer func(instance params.Instance) { if err != nil { r.log("failed to remove instance %s: %s", instance.Name, err) - // failed to remove from provider. Set the status back to pending_delete, which - // will retry the operation. - if _, err := r.setInstanceStatus(instance.Name, commonParams.InstancePendingDelete, nil); err != nil { + // failed to remove from provider. Set status to previous value, which will retry + // the operation. + if _, err := r.setInstanceStatus(instance.Name, currentStatus, nil); err != nil { r.log("failed to update runner %s status: %s", instance.Name, err) } } @@ -1311,7 +1313,10 @@ func (r *basePoolManager) deletePendingInstances() error { r.log("removing instance %s from provider", instance.Name) err = r.deleteInstanceFromProvider(r.ctx, instance) if err != nil { - return fmt.Errorf("failed to remove instance from provider: %w", err) + if currentStatus != commonParams.InstancePendingForceDelete { + return fmt.Errorf("failed to remove instance from provider: %w", err) + } + log.Printf("failed to remove instance %s from provider (continuing anyway): %s", instance.Name, err) } r.log("removing instance %s from database", instance.Name) if deleteErr := r.store.DeleteInstance(r.ctx, instance.PoolID, instance.Name); deleteErr != nil { @@ -1397,18 +1402,14 @@ func (r *basePoolManager) runnerCleanup() (err error) { return fmt.Errorf("failed to reap timed out runners: %w", err) } - if err := r.cleanupOrphanedRunners(); err != nil { + if err := r.cleanupOrphanedRunners(runners); err != nil { return fmt.Errorf("failed to cleanup orphaned runners: %w", err) } return nil } -func (r *basePoolManager) cleanupOrphanedRunners() error { - runners, err := r.helper.GetGithubRunners() - if err != nil { - return errors.Wrap(err, "fetching github runners") - } +func (r *basePoolManager) cleanupOrphanedRunners(runners []*github.Runner) error { if err := r.cleanupOrphanedProviderRunners(runners); err != nil { return errors.Wrap(err, "cleaning orphaned instances") } @@ -1421,16 +1422,24 @@ func (r *basePoolManager) cleanupOrphanedRunners() error { } func (r *basePoolManager) Start() error { - go r.updateTools() //nolint - - go r.startLoopForFunction(r.runnerCleanup, common.PoolReapTimeoutInterval, "timeout_reaper", false) - go r.startLoopForFunction(r.scaleDown, common.PoolScaleDownInterval, "scale_down", false) - go r.startLoopForFunction(r.deletePendingInstances, common.PoolConsilitationInterval, "consolidate[delete_pending]", false) - go r.startLoopForFunction(r.addPendingInstances, common.PoolConsilitationInterval, "consolidate[add_pending]", false) - go r.startLoopForFunction(r.ensureMinIdleRunners, common.PoolConsilitationInterval, "consolidate[ensure_min_idle]", false) - go r.startLoopForFunction(r.retryFailedInstances, common.PoolConsilitationInterval, "consolidate[retry_failed]", false) - go r.startLoopForFunction(r.updateTools, common.PoolToolUpdateInterval, "update_tools", true) - go r.startLoopForFunction(r.consumeQueuedJobs, common.PoolConsilitationInterval, "job_queue_consumer", false) + initialToolUpdate := make(chan struct{}, 1) + go func() { + r.updateTools() //nolint + initialToolUpdate <- struct{}{} + }() + + go func() { + <-initialToolUpdate + defer close(initialToolUpdate) + go r.startLoopForFunction(r.runnerCleanup, common.PoolReapTimeoutInterval, "timeout_reaper", false) + go r.startLoopForFunction(r.scaleDown, common.PoolScaleDownInterval, "scale_down", false) + go r.startLoopForFunction(r.deletePendingInstances, common.PoolConsilitationInterval, "consolidate[delete_pending]", false) + go r.startLoopForFunction(r.addPendingInstances, common.PoolConsilitationInterval, "consolidate[add_pending]", false) + go r.startLoopForFunction(r.ensureMinIdleRunners, common.PoolConsilitationInterval, "consolidate[ensure_min_idle]", false) + go r.startLoopForFunction(r.retryFailedInstances, common.PoolConsilitationInterval, "consolidate[retry_failed]", false) + go r.startLoopForFunction(r.updateTools, common.PoolToolUpdateInterval, "update_tools", true) + go r.startLoopForFunction(r.consumeQueuedJobs, common.PoolConsilitationInterval, "job_queue_consumer", false) + }() return nil } @@ -1455,7 +1464,16 @@ func (r *basePoolManager) ID() string { return r.helper.ID() } +// ForceDeleteRunner will delete a runner from a pool. +// +// Deprecated: Use DeleteRunner instead. func (r *basePoolManager) ForceDeleteRunner(runner params.Instance) error { + return r.DeleteRunner(runner, true) +} + +// Delete runner will delete a runner from a pool. If forceRemove is set to true, any error received from +// the IaaS provider will be ignored and deletion will continue. +func (r *basePoolManager) DeleteRunner(runner params.Instance, forceRemove bool) error { if !r.managerIsRunning { return runnerErrors.NewConflictError("pool manager is not running for %s", r.helper.String()) } @@ -1485,9 +1503,14 @@ func (r *basePoolManager) ForceDeleteRunner(runner params.Instance) error { } } } - r.log("setting instance status for: %v", runner.Name) - if _, err := r.setInstanceStatus(runner.Name, commonParams.InstancePendingDelete, nil); err != nil { + instanceStatus := commonParams.InstancePendingDelete + if forceRemove { + instanceStatus = commonParams.InstancePendingForceDelete + } + + r.log("setting instance status for %v to %v", runner.Name, instanceStatus) + if _, err := r.setInstanceStatus(runner.Name, instanceStatus, nil); err != nil { r.log("failed to update runner %s status: %s", runner.Name, err) return errors.Wrap(err, "updating runner") } diff --git a/runner/runner.go b/runner/runner.go index aa4aedbe..aae2baf2 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -898,7 +898,17 @@ func (r *Runner) getPoolManagerFromInstance(ctx context.Context, instance params return poolMgr, nil } +// ForceDeleteRunner will attempt to remove a runner from a pool. +// +// Deprecated: FunctionName is deprecated. Use DeleteRunner instead. func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) error { + return r.DeleteRunner(ctx, instanceName, true) +} + +// DeleteRunner removes a runner from a pool. If forceDelete is true, GARM will ignore any provider errors +// that may occur, and attempt to remove the runner from GitHub and then the database, regardless of provider +// errors. +func (r *Runner) DeleteRunner(ctx context.Context, instanceName string, forceDelete bool) error { if !auth.IsAdmin(ctx) { return runnerErrors.ErrUnauthorized } @@ -909,7 +919,8 @@ func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) err } switch instance.Status { - case commonParams.InstanceRunning, commonParams.InstanceError: + case commonParams.InstanceRunning, commonParams.InstanceError, + commonParams.InstancePendingForceDelete, commonParams.InstancePendingDelete: default: return runnerErrors.NewBadRequestError("runner must be in %q or %q state", commonParams.InstanceRunning, commonParams.InstanceError) } @@ -919,7 +930,7 @@ func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) err return errors.Wrap(err, "fetching pool manager for instance") } - if err := poolMgr.ForceDeleteRunner(instance); err != nil { + if err := poolMgr.DeleteRunner(instance, forceDelete); err != nil { return errors.Wrap(err, "removing runner") } return nil diff --git a/test/integration/config/config.toml b/test/integration/config/config.toml index 8fbd139c..1a67a409 100644 --- a/test/integration/config/config.toml +++ b/test/integration/config/config.toml @@ -50,6 +50,14 @@ description = "Local LXD installation" protocol = "simplestreams" skip_verify = false +[[provider]] +name = "test_external" +description = "external test provider" +provider_type = "external" + [provider.external] + config_file = "/etc/garm/test-provider/config" + provider_executable = "/etc/garm/test-provider/garm-external-provider" + [[github]] name = "${CREDENTIALS_NAME}" description = "GARM GitHub OAuth token" diff --git a/test/integration/e2e/client_utils.go b/test/integration/e2e/client_utils.go index 1ae2ecff..955f7b52 100644 --- a/test/integration/e2e/client_utils.go +++ b/test/integration/e2e/client_utils.go @@ -387,9 +387,9 @@ func getInstance(apiCli *client.GarmAPI, apiAuthToken runtime.ClientAuthInfoWrit return &getInstancesResponse.Payload, nil } -func deleteInstance(apiCli *client.GarmAPI, apiAuthToken runtime.ClientAuthInfoWriter, instanceID string) error { +func deleteInstance(apiCli *client.GarmAPI, apiAuthToken runtime.ClientAuthInfoWriter, instanceID string, forceRemove bool) error { return apiCli.Instances.DeleteInstance( - clientInstances.NewDeleteInstanceParams().WithInstanceName(instanceID), + clientInstances.NewDeleteInstanceParams().WithInstanceName(instanceID).WithForceRemove(&forceRemove), apiAuthToken) } diff --git a/test/integration/e2e/e2e.go b/test/integration/e2e/e2e.go index 63cfbc5a..b94c0652 100644 --- a/test/integration/e2e/e2e.go +++ b/test/integration/e2e/e2e.go @@ -72,7 +72,7 @@ func GracefulCleanup() { panic(err) } for _, instance := range poolInstances { - if err := deleteInstance(cli, authToken, instance.Name); err != nil { + if err := deleteInstance(cli, authToken, instance.Name, false); err != nil { panic(err) } log.Printf("Instance %s deletion initiated", instance.Name) diff --git a/test/integration/e2e/instances.go b/test/integration/e2e/instances.go index db10c71b..7b0de20b 100644 --- a/test/integration/e2e/instances.go +++ b/test/integration/e2e/instances.go @@ -33,7 +33,14 @@ func waitInstanceStatus(name string, status commonParams.InstanceStatus, runnerS return nil, fmt.Errorf("timeout waiting for instance %s status to reach status %s and runner status %s", name, status, runnerStatus) } -func waitInstanceToBeRemoved(name string, timeout time.Duration) error { +func DeleteInstance(name string, forceRemove bool) { + if err := deleteInstance(cli, authToken, name, forceRemove); err != nil { + panic(err) + } + log.Printf("Instance %s deletion initiated", name) +} + +func WaitInstanceToBeRemoved(name string, timeout time.Duration) error { var timeWaited time.Duration = 0 var instance *params.Instance @@ -67,7 +74,7 @@ func waitInstanceToBeRemoved(name string, timeout time.Duration) error { return fmt.Errorf("instance %s was not removed within the timeout", name) } -func waitPoolRunningIdleInstances(poolID string, timeout time.Duration) error { +func WaitPoolInstances(poolID string, status commonParams.InstanceStatus, runnerStatus params.RunnerStatus, timeout time.Duration) error { var timeWaited time.Duration = 0 pool, err := getPool(cli, authToken, poolID) @@ -75,22 +82,22 @@ func waitPoolRunningIdleInstances(poolID string, timeout time.Duration) error { return err } - log.Printf("Waiting for pool %s to have all instances as idle running", poolID) + log.Printf("Waiting for pool %s instances to reach status: %s and runner status: %s", poolID, status, runnerStatus) for timeWaited < timeout { poolInstances, err := listPoolInstances(cli, authToken, poolID) if err != nil { return err } - runningIdleCount := 0 + instancesCount := 0 for _, instance := range poolInstances { - if instance.Status == commonParams.InstanceRunning && instance.RunnerStatus == params.RunnerIdle { - runningIdleCount++ + if instance.Status == status && instance.RunnerStatus == runnerStatus { + instancesCount++ } } - log.Printf("Pool min idle runners: %d, pool instances: %d, current pool running idle instances: %d", pool.MinIdleRunners, len(poolInstances), runningIdleCount) - if runningIdleCount == int(pool.MinIdleRunners) && runningIdleCount == len(poolInstances) { + log.Printf("Pool %s instance reached status: %s and runner status: %s: %d/%d", poolID, status, runnerStatus, instancesCount, len(poolInstances)) + if instancesCount == int(pool.MinIdleRunners) && instancesCount == len(poolInstances) { return nil } time.Sleep(5 * time.Second) @@ -99,5 +106,5 @@ func waitPoolRunningIdleInstances(poolID string, timeout time.Duration) error { _ = dumpPoolInstancesDetails(pool.ID) - return fmt.Errorf("timeout waiting for pool %s to have all idle instances running", poolID) + return fmt.Errorf("timeout waiting for pool %s instances to reach status: %s and runner status: %s", poolID, status, runnerStatus) } diff --git a/test/integration/e2e/jobs.go b/test/integration/e2e/jobs.go index 7a1c0f9d..d8abccf2 100644 --- a/test/integration/e2e/jobs.go +++ b/test/integration/e2e/jobs.go @@ -41,13 +41,13 @@ func ValidateJobLifecycle(label string) { } // wait for instance to be removed - err = waitInstanceToBeRemoved(instance.Name, 5*time.Minute) + err = WaitInstanceToBeRemoved(instance.Name, 5*time.Minute) if err != nil { panic(err) } // wait for GARM to rebuild the pool running idle instances - err = waitPoolRunningIdleInstances(instance.PoolID, 6*time.Minute) + err = WaitPoolInstances(instance.PoolID, commonParams.InstanceRunning, params.RunnerIdle, 5*time.Minute) if err != nil { panic(err) } diff --git a/test/integration/e2e/organizations.go b/test/integration/e2e/organizations.go index 277d36cc..490ca784 100644 --- a/test/integration/e2e/organizations.go +++ b/test/integration/e2e/organizations.go @@ -4,6 +4,7 @@ import ( "log" "time" + commonParams "github.com/cloudbase/garm-provider-common/params" "github.com/cloudbase/garm/params" ) @@ -100,7 +101,7 @@ func WaitOrgRunningIdleInstances(orgID string, timeout time.Duration) { panic(err) } for _, pool := range orgPools { - err := waitPoolRunningIdleInstances(pool.ID, timeout) + err := WaitPoolInstances(pool.ID, commonParams.InstanceRunning, params.RunnerIdle, timeout) if err != nil { _ = dumpOrgInstancesDetails(orgID) panic(err) diff --git a/test/integration/e2e/repositories.go b/test/integration/e2e/repositories.go index 2810393b..72c10e32 100644 --- a/test/integration/e2e/repositories.go +++ b/test/integration/e2e/repositories.go @@ -4,6 +4,7 @@ import ( "log" "time" + commonParams "github.com/cloudbase/garm-provider-common/params" "github.com/cloudbase/garm/params" ) @@ -95,13 +96,22 @@ func DeleteRepoPool(repoID, repoPoolID string) { } } +func DisableRepoPool(repoID, repoPoolID string) { + log.Printf("Disable repo %s pool %s", repoID, repoPoolID) + enabled := false + poolParams := params.UpdatePoolParams{Enabled: &enabled} + if _, err := updateRepoPool(cli, authToken, repoID, repoPoolID, poolParams); err != nil { + panic(err) + } +} + func WaitRepoRunningIdleInstances(repoID string, timeout time.Duration) { repoPools, err := listRepoPools(cli, authToken, repoID) if err != nil { panic(err) } for _, pool := range repoPools { - err := waitPoolRunningIdleInstances(pool.ID, timeout) + err := WaitPoolInstances(pool.ID, commonParams.InstanceRunning, params.RunnerIdle, timeout) if err != nil { _ = dumpRepoInstancesDetails(repoID) panic(err) diff --git a/test/integration/main.go b/test/integration/main.go index 0918d964..04b96861 100644 --- a/test/integration/main.go +++ b/test/integration/main.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "log" "os" "time" @@ -34,6 +35,17 @@ var ( Tags: []string{"repo-runner"}, Enabled: true, } + repoPoolParams2 = params.CreatePoolParams{ + MaxRunners: 2, + MinIdleRunners: 0, + Flavor: "default", + Image: "ubuntu:22.04", + OSType: commonParams.Linux, + OSArch: commonParams.Amd64, + ProviderName: "test_external", + Tags: []string{"repo-runner-2"}, + Enabled: true, + } orgName = os.Getenv("ORG_NAME") orgWebhookSecret = os.Getenv("ORG_WEBHOOK_SECRET") @@ -101,6 +113,29 @@ func main() { repoPool = e2e.CreateRepoPool(repo.ID, repoPoolParams) _ = e2e.UpdateRepoPool(repo.ID, repoPool.ID, repoPoolParams.MaxRunners, 1) + ///////////////////////////// + // Test external provider /// + ///////////////////////////// + repoPool2 := e2e.CreateRepoPool(repo.ID, repoPoolParams2) + _ = e2e.UpdateRepoPool(repo.ID, repoPool2.ID, repoPoolParams2.MaxRunners, 1) + err := e2e.WaitPoolInstances(repoPool2.ID, commonParams.InstanceRunning, params.RunnerPending, 1*time.Minute) + if err != nil { + log.Printf("Failed to wait for instance to be running: %v", err) + } + repoPool2 = e2e.GetRepoPool(repo.ID, repoPool2.ID) + e2e.DisableRepoPool(repo.ID, repoPool2.ID) + e2e.DeleteInstance(repoPool2.Instances[0].Name, false) + err = e2e.WaitPoolInstances(repoPool2.ID, commonParams.InstancePendingDelete, params.RunnerPending, 1*time.Minute) + if err != nil { + log.Printf("Failed to wait for instance to be running: %v", err) + } + e2e.DeleteInstance(repoPool2.Instances[0].Name, true) // delete instance with forceRemove + err = e2e.WaitInstanceToBeRemoved(repoPool2.Instances[0].Name, 1*time.Minute) + if err != nil { + log.Printf("Failed to wait for instance to be removed: %v", err) + } + e2e.DeleteRepoPool(repo.ID, repoPool2.ID) + /////////////////// // organizations // /////////////////// diff --git a/test/integration/provider/garm-external-provider b/test/integration/provider/garm-external-provider new file mode 100755 index 00000000..88e6f46e --- /dev/null +++ b/test/integration/provider/garm-external-provider @@ -0,0 +1,56 @@ +#!/bin/bash + +set -e +set -o pipefail + +if [ ! -t 0 ] +then + INPUT=$(cat -) +fi + +if [ -z "$GARM_PROVIDER_CONFIG_FILE" ] +then + echo "no config file specified in env" + exit 1 +fi + +source "$GARM_PROVIDER_CONFIG_FILE" + +function CreateInstance() { + if [ -z "$INPUT" ]; then + echo "expected build params in stdin" + exit 1 + fi + + jq -rnc '{"provider_id": "test-provider-id", "name": "test-instance-name", "os_type": "linux", "os_name": "ubuntu", "os_version": "20.04", "os_arch": "x86_64", "status": "running"}' +} + +case "$GARM_COMMAND" in + "CreateInstance") + CreateInstance + ;; + "DeleteInstance") + echo "RemoveAllInstances not implemented" + exit 1 + ;; + "GetInstance") + echo "Get instance with id: ${GARM_INSTANCE_ID}" + ;; + "ListInstances") + echo "List instances with pool id: ${GARM_POOL_ID}" + ;; + "StartInstance") + echo "Start instance: ${GARM_INSTANCE_NAME} with id: ${GARM_INSTANCE_ID}" + ;; + "StopInstance") + echo "Stop instance: ${GARM_INSTANCE_NAME} with id: ${GARM_INSTANCE_ID}" + ;; + "RemoveAllInstances") + echo "RemoveAllInstances not implemented" + exit 1 + ;; + *) + echo "Invalid GARM provider command: \"$GARM_COMMAND\"" + exit 1 + ;; +esac diff --git a/test/integration/scripts/setup-garm.sh b/test/integration/scripts/setup-garm.sh index 525c2f92..91dddddf 100755 --- a/test/integration/scripts/setup-garm.sh +++ b/test/integration/scripts/setup-garm.sh @@ -5,6 +5,7 @@ DIR="$(dirname $0)" BINARIES_DIR="$PWD/bin" CONTRIB_DIR="$PWD/contrib" CONFIG_DIR="$PWD/test/integration/config" +CONFIG_DIR_PROV="$PWD/test/integration/provider" if [[ ! -f $BINARIES_DIR/garm ]] || [[ ! -f $BINARIES_DIR/garm-cli ]]; then echo "ERROR: Please build GARM binaries first" @@ -46,6 +47,10 @@ sudo mkdir -p /etc/garm cat $CONFIG_DIR/config.toml | envsubst | sudo tee /etc/garm/config.toml sudo chown -R garm:garm /etc/garm +sudo mkdir /etc/garm/test-provider +sudo touch $CONFIG_DIR_PROV/config +sudo cp $CONFIG_DIR_PROV/* /etc/garm/test-provider + sudo mv $BINARIES_DIR/* /usr/local/bin/ sudo cp $CONTRIB_DIR/garm.service /etc/systemd/system/garm.service diff --git a/vendor/github.com/cloudbase/garm-provider-common/params/params.go b/vendor/github.com/cloudbase/garm-provider-common/params/params.go index 3bd2e7d4..95a6e6bb 100644 --- a/vendor/github.com/cloudbase/garm-provider-common/params/params.go +++ b/vendor/github.com/cloudbase/garm-provider-common/params/params.go @@ -39,14 +39,15 @@ const ( ) const ( - InstanceRunning InstanceStatus = "running" - InstanceStopped InstanceStatus = "stopped" - InstanceError InstanceStatus = "error" - InstancePendingDelete InstanceStatus = "pending_delete" - InstanceDeleting InstanceStatus = "deleting" - InstancePendingCreate InstanceStatus = "pending_create" - InstanceCreating InstanceStatus = "creating" - InstanceStatusUnknown InstanceStatus = "unknown" + InstanceRunning InstanceStatus = "running" + InstanceStopped InstanceStatus = "stopped" + InstanceError InstanceStatus = "error" + InstancePendingDelete InstanceStatus = "pending_delete" + InstancePendingForceDelete InstanceStatus = "pending_force_delete" + InstanceDeleting InstanceStatus = "deleting" + InstancePendingCreate InstanceStatus = "pending_create" + InstanceCreating InstanceStatus = "creating" + InstanceStatusUnknown InstanceStatus = "unknown" ) const ( diff --git a/vendor/modules.txt b/vendor/modules.txt index 098e3341..e2af7de0 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -14,7 +14,7 @@ github.com/cespare/xxhash/v2 # github.com/chzyer/readline v1.5.1 ## explicit; go 1.15 github.com/chzyer/readline -# github.com/cloudbase/garm-provider-common v0.1.0 +# github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700 ## explicit; go 1.20 github.com/cloudbase/garm-provider-common/cloudconfig github.com/cloudbase/garm-provider-common/defaults