From be8f5145d55fc66244c03f2c2419396c9b655011 Mon Sep 17 00:00:00 2001
From: Yann DEGAT
Date: Wed, 11 Jul 2018 15:30:46 +0200
Subject: [PATCH] remote/backend/swift: Add support for locking

---
 backend/remote-state/swift/backend.go       |  11 +
 backend/remote-state/swift/backend_state.go |  94 +++++-
 backend/remote-state/swift/backend_test.go  |  12 +-
 backend/remote-state/swift/client.go        | 321 ++++++++++++++++++--
 4 files changed, 406 insertions(+), 32 deletions(-)

diff --git a/backend/remote-state/swift/backend.go b/backend/remote-state/swift/backend.go
index e1e4e95b2989..fe309949cbc5 100644
--- a/backend/remote-state/swift/backend.go
+++ b/backend/remote-state/swift/backend.go
@@ -173,6 +173,13 @@ func New() backend.Backend {
 				Optional:    true,
 				Description: descriptions["expire_after"],
 			},
+
+			"lock": &schema.Schema{
+				Type:        schema.TypeBool,
+				Optional:    true,
+				Description: "Lock state access",
+				Default:     true,
+			},
 		},
 	}
 
@@ -238,6 +245,7 @@ type Backend struct {
 	archiveContainer string
 	expireSecs       int
 	container        string
+	lock             bool
 }
 
 func (b *Backend) configure(ctx context.Context) error {
@@ -276,6 +284,9 @@ func (b *Backend) configure(ctx context.Context) error {
 		b.container = data.Get("path").(string)
 	}
 
+	// Store the lock information
+	b.lock = data.Get("lock").(bool)
+
 	// Enable object archiving?
 	if archiveContainer, ok := data.GetOk("archive_container"); ok {
 		log.Printf("[DEBUG] Archive_container set, enabling object versioning")
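
Note: the new "lock" option defaults to true, so locking stays enabled unless a
configuration explicitly opts out. A minimal sketch of opting out, mirroring the
backend.TestBackendConfig pattern used in backend_test.go below (the container
name is illustrative):

    b := backend.TestBackendConfig(t, New(), map[string]interface{}{
        "container": "my-state-container", // illustrative container name
        "lock":      false,                // disable state locking
    }).(*Backend)
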
diff --git a/backend/remote-state/swift/backend_state.go b/backend/remote-state/swift/backend_state.go
index 4eb76bfac79b..27a4325f31ff 100644
--- a/backend/remote-state/swift/backend_state.go
+++ b/backend/remote-state/swift/backend_state.go
@@ -22,6 +22,7 @@ func (b *Backend) States() ([]string, error) {
 		archive:          b.archive,
 		archiveContainer: b.archiveContainer,
 		expireSecs:       b.expireSecs,
+		lockState:        b.lock,
 	}
 
 	// List our container objects
@@ -34,7 +35,6 @@ func (b *Backend) States() ([]string, error) {
 	// Find the envs, we use a map since we can get duplicates with
 	// path suffixes.
 	envs := map[string]struct{}{}
-
 	for _, object := range objectNames {
 		object = strings.TrimPrefix(object, objectEnvPrefix)
 		object = strings.TrimSuffix(object, delimiter)
@@ -45,6 +45,18 @@ func (b *Backend) States() ([]string, error) {
 			continue
 		}
 
+		// swift is eventually consistent, so a deleted object may still
+		// appear in the listing. To ensure consistency, we query each
+		// object with the "newest" arg set to true.
+		payload, err := client.get(client.container, b.objectName(object))
+		if err != nil {
+			return nil, err
+		}
+		if payload == nil {
+			// object doesn't exist anymore, skip it
+			continue
+		}
+
 		envs[object] = struct{}{}
 	}
 
@@ -71,10 +83,12 @@ func (b *Backend) DeleteState(name string) error {
 		archiveContainer: b.archiveContainer,
 		expireSecs:       b.expireSecs,
 		objectName:       b.objectName(name),
+		lockState:        b.lock,
 	}
 
-	// List our container objects
+	// Delete our object
 	err := client.Delete()
+
 	return err
 }
 
@@ -91,27 +105,85 @@ func (b *Backend) State(name string) (state.State, error) {
 		archiveContainer: b.archiveContainer,
 		expireSecs:       b.expireSecs,
 		objectName:       b.objectName(name),
+		lockState:        b.lock,
 	}
 
-	stateMgr := &remote.State{Client: client}
+	var stateMgr state.State = &remote.State{Client: client}
+
+	// If we're not locking, disable it
+	if !b.lock {
+		stateMgr = &state.LockDisabled{Inner: stateMgr}
+	}
+
+	// Check to see if this state already exists.
+	// If we're trying to force-unlock a state, we can't take the lock before
+	// fetching the state. If the state doesn't exist, we have to assume this
+	// is a normal create operation, and take the lock at that point.
+	//
+	// If we need to force-unlock, but for some reason the state no longer
+	// exists, the user will have to use openstack tools to manually fix the
+	// situation.
+	existing, err := b.States()
+	if err != nil {
+		return nil, err
+	}
+
+	exists := false
+	for _, s := range existing {
+		if s == name {
+			exists = true
+			break
+		}
+	}
+
+	// We need to create the object so it's listed by States.
+	if !exists {
+		// the default state always exists
+		if name == backend.DefaultStateName {
+			return stateMgr, nil
+		}
+
+		// Grab a lock; we use it to write an empty state if one doesn't
+		// exist already. We have to write an empty state as a sentinel value
+		// so States() knows it exists.
+		lockInfo := state.NewLockInfo()
+		lockInfo.Operation = "init"
+		lockId, err := stateMgr.Lock(lockInfo)
+		if err != nil {
+			return nil, fmt.Errorf("failed to lock state in Swift: %s", err)
+		}
+
+		// Local helper function so we can call it in multiple places
+		lockUnlock := func(parent error) error {
+			if err := stateMgr.Unlock(lockId); err != nil {
+				return fmt.Errorf(strings.TrimSpace(errStateUnlock), lockId, err)
+			}
+
+			return parent
+		}
 
-	//if this isn't the default state name, we need to create the object so
-	//it's listed by States.
-	if name != backend.DefaultStateName {
 		// Grab the value
 		if err := stateMgr.RefreshState(); err != nil {
+			err = lockUnlock(err)
 			return nil, err
 		}
 
 		// If we have no state, we have to create an empty state
 		if v := stateMgr.State(); v == nil {
 			if err := stateMgr.WriteState(terraform.NewState()); err != nil {
+				err = lockUnlock(err)
 				return nil, err
 			}
 			if err := stateMgr.PersistState(); err != nil {
+				err = lockUnlock(err)
 				return nil, err
 			}
 		}
+
+		// Unlock; the state should now be initialized
+		if err := lockUnlock(nil); err != nil {
+			return nil, err
+		}
 	}
 
 	return stateMgr, nil
@@ -126,3 +198,13 @@ func (b *Backend) objectName(name string) string {
 
 	return name
 }
+
+const errStateUnlock = `
+Error unlocking Swift state. Lock ID: %s
+
+Error: %s
+
+You may have to force-unlock this state in order to use it again.
+The Swift backend acquires a lock during initialization to ensure
+the minimum required keys are prepared.
+`
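
With this change, State() returns a manager that can hold the state lock across a
read-modify-write cycle (or a state.LockDisabled wrapper when lock = false). A rough
usage sketch, assuming a configured *Backend b and eliding most error handling:

    s, err := b.State(backend.DefaultStateName)
    if err != nil {
        return err
    }

    lockInfo := state.NewLockInfo()
    lockInfo.Operation = "apply" // illustrative operation name
    lockId, err := s.Lock(lockInfo)
    if err != nil {
        return err
    }
    defer s.Unlock(lockId)

    if err := s.RefreshState(); err != nil {
        return err
    }
    // ...mutate the state, then WriteState/PersistState as in State() above...
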
diff --git a/backend/remote-state/swift/backend_test.go b/backend/remote-state/swift/backend_test.go
index 2c2345ef3b8b..d98106c09490 100644
--- a/backend/remote-state/swift/backend_test.go
+++ b/backend/remote-state/swift/backend_test.go
@@ -61,13 +61,19 @@ func TestBackend(t *testing.T) {
 
 	container := fmt.Sprintf("terraform-state-swift-testbackend-%x", time.Now().Unix())
 
-	b := backend.TestBackendConfig(t, New(), map[string]interface{}{
+	b1 := backend.TestBackendConfig(t, New(), map[string]interface{}{
+		"container": container,
+	}).(*Backend)
+
+	b2 := backend.TestBackendConfig(t, New(), map[string]interface{}{
 		"container": container,
 	}).(*Backend)
 
-	defer deleteSwiftContainer(t, b.client, container)
+	defer deleteSwiftContainer(t, b1.client, container)
 
-	backend.TestBackendStates(t, b)
+	backend.TestBackendStates(t, b1)
+	backend.TestBackendStateLocks(t, b1, b2)
+	backend.TestBackendStateForceUnlock(t, b1, b2)
 }
 
 func TestBackendArchive(t *testing.T) {
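
The client changes below work around swift's eventual consistency by sending an
"X-Newest: true" header on every read: objects.Download accepts any options builder,
and the new GetNewestOpts type injects that header. An illustrative consistent read,
assuming swiftClient is an authenticated *gophercloud.ServiceClient and the container
name is hypothetical:

    result := objects.Download(swiftClient, "my-state-container", TFSTATE_NAME, GetNewestOpts{})
    data, err := result.ExtractContent() // bytes of the newest copy swift knows about
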
diff --git a/backend/remote-state/swift/client.go b/backend/remote-state/swift/client.go
index 94a83020d698..2e93d5c337a3 100644
--- a/backend/remote-state/swift/client.go
+++ b/backend/remote-state/swift/client.go
@@ -2,24 +2,37 @@ package swift
 
 import (
 	"bytes"
+	"context"
 	"crypto/md5"
+	"encoding/json"
 	"fmt"
 	"log"
+	"sync"
+	"time"
 
 	"github.com/gophercloud/gophercloud"
 	"github.com/gophercloud/gophercloud/openstack/objectstorage/v1/containers"
 	"github.com/gophercloud/gophercloud/openstack/objectstorage/v1/objects"
 	"github.com/gophercloud/gophercloud/pagination"
-
+	"github.com/hashicorp/terraform/state"
 	"github.com/hashicorp/terraform/state/remote"
 )
 
 const (
-	TFSTATE_NAME      = "tfstate.tf"
-	TFSTATE_LOCK_NAME = "tfstate.lock"
+	TFSTATE_NAME = "tfstate.tf"
+
+	consistencyTimeout = 15
+
+	// Suffix that will be appended to state file paths
+	// when locking
+	lockSuffix = ".lock"
+
+	// The TTL associated with this lock, in seconds.
+	lockTTL = 15
 )
 
 // RemoteClient implements the Client interface for an Openstack Swift server.
+// It also implements "state/remote".ClientLocker.
 type RemoteClient struct {
 	client           *gophercloud.ServiceClient
 	container        string
 	archive          bool
 	archiveContainer string
 	expireSecs       int
 	objectName       string
+
+	mu sync.Mutex
+	// lockState is true if we're using locks
+	lockState bool
+
+	info *state.LockInfo
+
+	// lockCancel cancels the Context used for lockRenewPeriodic, and is
+	// called when unlocking, or before creating a new lock if the lock is
+	// lost.
+	lockCancel context.CancelFunc
 }
 
 func (c *RemoteClient) ListObjectsNames(prefix string, delim string) ([]string, error) {
@@ -63,16 +87,24 @@ func (c *RemoteClient) ListObjectsNames(prefix string, delim string) ([]string,
 
 }
 
-func (c *RemoteClient) Get() (*remote.Payload, error) {
-	log.Printf("[DEBUG] Getting object %s in container %s", c.objectName, c.container)
-	if err := c.ensureContainerExists(); err != nil {
-		return nil, err
+// GetNewestOpts adds a parameter to ensure the objects returned are the most
+// recent known by swift.
+// swift is eventually consistent, yet end users may expect a consistent
+// response when querying for state or lock objects.
+type GetNewestOpts struct{}
+
+// ToObjectDownloadParams formats the download options into a map of headers
+// and a query string.
+func (opts GetNewestOpts) ToObjectDownloadParams() (map[string]string, string, error) {
+	newest := map[string]string{
+		"X-Newest": "true",
 	}
 
-	result := objects.Download(c.client, c.container, c.objectName, nil)
+	return newest, "", nil
+}
 
-	// Extract any errors from result
-	_, err := result.Extract()
+func (c *RemoteClient) Get() (*remote.Payload, error) {
+	payload, err := c.get(c.container, c.objectName)
 
 	// 404 response is to be expected if the object doesn't already exist!
 	if _, ok := err.(gophercloud.ErrDefault404); ok {
@@ -80,6 +112,110 @@ func (c *RemoteClient) Get() (*remote.Payload, error) {
 		return nil, nil
 	}
 
+	return payload, err
+}
+
+// swift is eventually consistent. Consistency
+// is ensured by the get func, which always tries
+// to retrieve the most recent object.
+func (c *RemoteClient) Put(data []byte) error {
+	if c.expireSecs != 0 {
+		log.Printf("[DEBUG] ExpireSecs = %d", c.expireSecs)
+		return c.put(c.container, c.objectName, data, c.expireSecs, "")
+	}
+
+	return c.put(c.container, c.objectName, data, -1, "")
+
+}
+
+func (c *RemoteClient) Delete() error {
+	return c.delete(c.container, c.objectName)
+}
+
+func (c *RemoteClient) Lock(info *state.LockInfo) (string, error) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if !c.lockState {
+		return "", nil
+	}
+
+	log.Printf("[DEBUG] Acquiring Lock %#v on %s/%s", info, c.container, c.objectName)
+
+	// This check is only to ensure we strictly follow the specification.
+	// Terraform shouldn't ever re-lock, so provide errors for the possible
+	// states if this is called.
+	if c.info != nil {
+		// we have an active lock already
+		return "", fmt.Errorf("state %q already locked", c.lockFilePath())
+	}
+
+	// update the path we're using
+	info.Path = c.lockFilePath()
+
+	if err := c.writeLockInfo(info, lockTTL, "*"); err != nil {
+		return "", err
+	}
+
+	log.Printf("[DEBUG] Acquired Lock %s on %s", info.ID, c.objectName)
+
+	c.info = info
+
+	ctx, cancel := context.WithCancel(context.Background())
+	c.lockCancel = cancel
+
+	// keep the lock renewed
+	go c.lockRenewPeriodic(ctx, info)
+
+	return info.ID, nil
+}
+
+func (c *RemoteClient) Unlock(id string) error {
+	c.mu.Lock()
+
+	if !c.lockState {
+		c.mu.Unlock()
+		return nil
+	}
+
+	defer func() {
+		// The periodic lock renewal is canceled here. lockCancel is
+		// normally non-nil, but it can be nil when a second client is
+		// used to ForceUnlock the state based on the same lock Id.
+		if c.lockCancel != nil {
+			c.lockCancel()
+		}
+		c.info = nil
+		c.mu.Unlock()
+	}()
+
+	log.Printf("[DEBUG] Releasing Lock %s on %s", id, c.objectName)
+
+	info, err := c.lockInfo()
+	if err != nil {
+		return c.lockError(fmt.Errorf("failed to retrieve lock info: %s", err), nil)
+	}
+
+	c.info = info
+
+	// conflicting lock
+	if info.ID != id {
+		return c.lockError(fmt.Errorf("lock id %q does not match existing lock", id), info)
+	}
+
+	return c.delete(c.container, c.lockFilePath())
+}
+
+func (c *RemoteClient) get(container, object string) (*remote.Payload, error) {
+	log.Printf("[DEBUG] Getting object %s/%s", container, object)
+	result := objects.Download(c.client, container, object, GetNewestOpts{})
+
+	// Extract any errors from result
+	_, err := result.Extract()
+	if err != nil {
+		return nil, err
+	}
+
 	bytes, err := result.ExtractContent()
 	if err != nil {
 		return nil, err
@@ -94,31 +230,170 @@ func (c *RemoteClient) Get() (*remote.Payload, error) {
 	return payload, nil
 }
 
-func (c *RemoteClient) Put(data []byte) error {
+func (c *RemoteClient) put(container, object string, data []byte, deleteAfter int, ifNoneMatch string) error {
+	log.Printf("[DEBUG] Writing object in %s/%s", container, object)
 	if err := c.ensureContainerExists(); err != nil {
 		return err
 	}
 
-	log.Printf("[DEBUG] Putting object %s in container %s", c.objectName, c.container)
-	reader := bytes.NewReader(data)
+	contentType := "application/json"
+	contentLength := int64(len(data))
+
 	createOpts := objects.CreateOpts{
-		Content: reader,
+		Content:       bytes.NewReader(data),
+		ContentType:   contentType,
+		ContentLength: contentLength,
 	}
 
-	if c.expireSecs != 0 {
-		log.Printf("[DEBUG] ExpireSecs = %d", c.expireSecs)
-		createOpts.DeleteAfter = c.expireSecs
+	if deleteAfter >= 0 {
+		createOpts.DeleteAfter = deleteAfter
 	}
 
-	result := objects.Create(c.client, c.container, c.objectName, createOpts)
+	if ifNoneMatch != "" {
+		createOpts.IfNoneMatch = ifNoneMatch
+	}
 
-	return result.Err
+	result := objects.Create(c.client, container, object, createOpts)
+	if result.Err != nil {
+		return result.Err
+	}
+
+	return nil
 }
 
-func (c *RemoteClient) Delete() error {
-	log.Printf("[DEBUG] Deleting object %s in container %s", c.objectName, c.container)
-	result := objects.Delete(c.client, c.container, c.objectName, nil)
-	return result.Err
+func (c *RemoteClient) delete(container, object string) error {
+	log.Printf("[DEBUG] Deleting object %s/%s", container, object)
+	result := objects.Delete(c.client, container, object, nil)
+
+	if result.Err != nil {
+		return result.Err
+	}
+
+	// swift is eventually consistent. As delete operations are synchronous,
+	// we check that the object is really deleted to remain consistent.
+	data, err := c.get(container, object)
+	if _, ok := err.(gophercloud.ErrDefault404); ok {
+		log.Println("[DEBUG] Object to delete doesn't exist.")
+		return nil
+	}
+
+	if err != nil {
+		return err
+	}
+
+	if data != nil {
+		return fmt.Errorf("object %s/%s hasn't been properly deleted. Data is not null: %#v", container, object, data)
+	}
+
+	return nil
+}
+
+func (c *RemoteClient) writeLockInfo(info *state.LockInfo, deleteAfter int, ifNoneMatch string) error {
+	err := c.put(c.container, c.lockFilePath(), info.Marshal(), deleteAfter, ifNoneMatch)
+
+	if httpErr, ok := err.(gophercloud.ErrUnexpectedResponseCode); ok && httpErr.Actual == 412 {
+		log.Printf("[DEBUG] Couldn't write lock %s. One already exists.", info.ID)
+		info2, err2 := c.lockInfo()
+		if err2 != nil {
+			return fmt.Errorf("Couldn't read lock info: %v", err2)
+		}
+
+		return c.lockError(err, info2)
+	}
+
+	if err != nil {
+		return c.lockError(err, nil)
+	}
+
+	return nil
+}
+
+func (c *RemoteClient) lockError(err error, conflictingLock *state.LockInfo) *state.LockError {
+	lockErr := &state.LockError{
+		Err:  err,
+		Info: conflictingLock,
+	}
+
+	return lockErr
+}
+
+// lockInfo reads the lock file, parses its contents and returns the parsed
+// LockInfo struct.
+func (c *RemoteClient) lockInfo() (*state.LockInfo, error) {
+	raw, err := c.get(c.container, c.lockFilePath())
+	if err != nil {
+		return nil, err
+	}
+
+	info := &state.LockInfo{}
+
+	if err := json.Unmarshal(raw.Data, info); err != nil {
+		return nil, err
+	}
+
+	return info, nil
+}
+
+func (c *RemoteClient) lockRenewPeriodic(ctx context.Context, info *state.LockInfo) error {
+	log.Printf("[DEBUG] Renew lock %v", info)
+
+	ttl := lockTTL * time.Second
+
+	waitDur := ttl / 2
+	lastRenewTime := time.Now()
+	var lastErr error
+	for {
+		if time.Since(lastRenewTime) > ttl {
+			return lastErr
+		}
+		select {
+		case <-time.After(waitDur):
+			c.mu.Lock()
+			// Unlock may have run while we waited on the mutex,
+			// in which case we shouldn't renew the lock
+			select {
+			case <-ctx.Done():
+				log.Printf("[DEBUG] Stopping periodic renewal of lock %v", info)
+				return nil
+			default:
+			}
+
+			info2, err := c.lockInfo()
+			if _, ok := err.(gophercloud.ErrDefault404); ok {
+				log.Println("[DEBUG] Lock has expired; trying to reacquire.")
+				err = nil
+			}
+
+			if err == nil && (info2 == nil || info.ID == info2.ID) {
+				info2 = info
+				err = c.writeLockInfo(info, lockTTL, "")
+			}
+
+			c.mu.Unlock()
+
+			if err != nil {
+				log.Printf("[ERROR] could not reacquire lock (%v): %s", info, err)
+				waitDur = time.Second
				lastErr = err
+				continue
+			}
+
+			// conflicting lock
+			if info2.ID != info.ID {
+				return c.lockError(fmt.Errorf("lock id %q does not match existing lock %q", info.ID, info2.ID), info2)
+			}
+
+			waitDur = ttl / 2
+			lastRenewTime = time.Now()
+
+		case <-ctx.Done():
+			log.Printf("[DEBUG] Stopping periodic renewal of lock %s", info.ID)
+			return nil
+		}
+	}
+}
+
+func (c *RemoteClient) lockFilePath() string {
+	return c.objectName + lockSuffix
+}
 
 func (c *RemoteClient) ensureContainerExists() error {