Skip to content

Commit

Permalink
Fix sandbox cleanup
Browse files Browse the repository at this point in the history
The driver and the sandbox have two different stores where the endpoints are saved.
It is possible for the two stores to go out of sync if the endpoint is added to the driver
but there is a crash before the sandbox join completes.
On restart, we now take the list of endpoints from the network and assign
them back to the sandbox.

(This is a balena cherry-pick of moby#1805)

Signed-off-by: Flavio Crisciani <[email protected]>
  • Loading branch information
Flavio Crisciani authored and lmbarros committed Apr 18, 2023
1 parent 4d09346 commit ace5cf5
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 23 deletions.
9 changes: 5 additions & 4 deletions endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -844,10 +844,6 @@ func (ep *endpoint) Delete(force bool) error {
}
}

if err = n.getController().deleteFromStore(ep); err != nil {
return err
}

defer func() {
if err != nil && !force {
ep.dbExists = false
Expand All @@ -864,6 +860,11 @@ func (ep *endpoint) Delete(force bool) error {
return err
}

// This has to come after the sandbox and the driver to guarantee that can be the source of truth on restart cases
if err = n.getController().deleteFromStore(ep); err != nil {
return err
}

ep.releaseAddress()

if err := n.getEpCnt().DecEndpointCnt(); err != nil {
Expand Down
57 changes: 38 additions & 19 deletions sandbox_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package libnetwork

import (
"encoding/json"
"sync"

"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/osl"
Expand Down Expand Up @@ -207,6 +206,40 @@ func (c *controller) sandboxCleanup(activeSandboxes map[string]interface{}) {
return
}

// Get all the endpoints
// Use the network as the source of truth so that if there was an issue before the sandbox registered the endpoint
// this will be taken anyway
endpointsInSandboxID := map[string][]*endpoint{}
nl, err := c.getNetworksForScope(datastore.LocalScope)
if err != nil {
logrus.Warnf("Could not get list of networks during sandbox cleanup: %v", err)
return
}

for _, n := range nl {
var epl []*endpoint
epl, err = n.getEndpointsFromStore()
if err != nil {
logrus.Warnf("Could not get list of endpoints in network %s during sandbox cleanup: %v", n.name, err)
continue
}
for _, ep := range epl {
ep, err = n.getEndpointFromStore(ep.id)
if err != nil {
logrus.Warnf("Could not get endpoint in network %s during sandbox cleanup: %v", n.name, err)
continue
}
if ep.sandboxID == "" {
logrus.Warnf("Endpoint %s not associated to any sandbox, deleting it", ep.id)
ep.Delete(true)
continue
}

// Append the endpoint to the corresponding sandboxID
endpointsInSandboxID[ep.sandboxID] = append(endpointsInSandboxID[ep.sandboxID], ep)
}
}

for _, kvo := range kvol {
sbs := kvo.(*sbState)

Expand Down Expand Up @@ -252,25 +285,11 @@ func (c *controller) sandboxCleanup(activeSandboxes map[string]interface{}) {
c.sandboxes[sb.id] = sb
c.Unlock()

for _, eps := range sbs.Eps {
n, err := c.getNetworkFromStore(eps.Nid)
var ep *endpoint
if err != nil {
logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}, persist: true}
ep = &endpoint{id: eps.Eid, network: n, sandboxID: sbs.ID}
} else {
ep, err = n.getEndpointFromStore(eps.Eid)
if err != nil {
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
ep = &endpoint{id: eps.Eid, network: n, sandboxID: sbs.ID}
}
}
if _, ok := activeSandboxes[sb.ID()]; ok && err != nil {
logrus.Errorf("failed to restore endpoint %s in %s for container %s due to %v", eps.Eid, eps.Nid, sb.ContainerID(), err)
continue
// Restore all the endpoints that are supposed to be in this sandbox
if eps, ok := endpointsInSandboxID[sb.id]; ok {
for _, ep := range eps {
sb.addEndpoint(ep)
}
sb.addEndpoint(ep)
}

if _, ok := activeSandboxes[sb.ID()]; !ok {
Expand Down

0 comments on commit ace5cf5

Please sign in to comment.