Skip to content

Commit

Permalink
Refactor locking for join/leave to avoid race
Browse files Browse the repository at this point in the history
Instead of using "sync.Once" to determine whether to initialize a
network sandbox or subnet sandbox, we use a traditional mutex +
initialization boolean.  This is because the initialization state isn't
truly a once-and-done condition.  Rather, libnetwork destroys network
and subnet sandboxes when the last endpoint leaves them.  The use of
sync.Once in this kind of scenario requires, therefore, re-initializing
the Once which is impoissible.  So the approach that libnetwork
currently takes is to use a pointer to a Once and redirect that pointer
to a new Once on reset.  This leads to nasty race conditions.

In addition to refactoring the locking, this patch merges the functions
joinSandbox(), joinSubnetSandbox() and incEndpointCount(). This makes
the code both cleaner and it also holds the join lock through the series
of read-modify-writes avoiding further potential races.  This does
reduce the potential parallelism which could be applied should there be
many joins coming in on many different subnets in the same overlay
network.  However, this should be an extremely minor performance hit for
a very obscure case.

The patch uses a separate mutex for the join/leave sandbox
initialization because the network-wide and subnet-wide locks are used
for finer-grained management of the overlay network data members.  It
turns out that there is a very subtle deadlock that can occur if
joining/leaving holds the network lock.  A join holding the network lock
requires posting an event to a synchronous channel to the peerdb.  But
the peerdb can block trying to gain access to the network lock in order
to retrieve things like the network's sandbox or vxlan IDs.  This patch
documents in the code which mutexes are used to manage which network
fields.

Signed-off-by: Chris Telfer <ctelfer@docker.com>
  • Loading branch information
ctelfer committed May 9, 2018
1 parent 13ea4bd commit 0026dc2
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 79 deletions.
10 changes: 1 addition & 9 deletions drivers/overlay/joinleave.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,10 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
}

if err := n.joinSandbox(false); err != nil {
if err := n.joinSandbox(s, false, true); err != nil {
return fmt.Errorf("network sandbox join failed: %v", err)
}

if err := n.joinSubnetSandbox(s, false); err != nil {
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
}

// joinSubnetSandbox gets called when an endpoint comes up on a new subnet in the
// overlay network. Hence the Endpoint count should be updated outside joinSubnetSandbox
n.incEndpointCount()

sbox := n.sandbox()

overlayIfName, containerIfName, err := createVethPair()
Expand Down
110 changes: 58 additions & 52 deletions drivers/overlay/ov_network.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ var (
type networkTable map[string]*network

type subnet struct {
once *sync.Once
sboxInit bool
vxlanName string
brName string
vni uint32
Expand All @@ -55,21 +55,22 @@ type subnetJSON struct {
}

type network struct {
id string
dbIndex uint64
dbExists bool
sbox osl.Sandbox
nlSocket *nl.NetlinkSocket
endpoints endpointTable
driver *driver
joinCnt int
once *sync.Once
initEpoch int
initErr error
subnets []*subnet
secure bool
mtu int
sync.Mutex
id string
dbIndex uint64
dbExists bool
sbox osl.Sandbox
nlSocket *nl.NetlinkSocket
endpoints endpointTable
driver *driver
joinCnt int
sboxInit bool
initEpoch int
initErr error
subnets []*subnet
secure bool
mtu int
joinLock sync.Mutex // protects joinCnt, sboxInit, initErr, subnets[i].sboxInit, subnets[i].initErr
sync.Mutex // protects all other fields
}

func init() {
Expand Down Expand Up @@ -150,7 +151,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
id: id,
driver: d,
endpoints: endpointTable{},
once: &sync.Once{},
subnets: []*subnet{},
}

Expand Down Expand Up @@ -193,7 +193,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
s := &subnet{
subnetIP: ipd.Pool,
gwIP: ipd.Gateway,
once: &sync.Once{},
}

if len(vnis) != 0 {
Expand Down Expand Up @@ -296,48 +295,62 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
return nil
}

func (n *network) incEndpointCount() {
n.Lock()
defer n.Unlock()
n.joinCnt++
}
func (n *network) joinSandbox(s *subnet, restore bool, incJoinCnt bool) error {
networkOnce.Do(networkOnceInit)

n.joinLock.Lock()
defer n.joinLock.Unlock()

func (n *network) joinSandbox(restore bool) error {
// If there is a race between two go routines here only one will win
// the other will wait.
n.once.Do(func() {
// save the error status of initSandbox in n.initErr so that
// all the racing go routines are able to know the status.
if !n.sboxInit {
n.initErr = n.initSandbox(restore)
})
n.sboxInit = true // we cannot recover from this error
}

return n.initErr
}
if n.initErr != nil {
return fmt.Errorf("network sandbox join failed: %v", n.initErr)
}

func (n *network) joinSubnetSandbox(s *subnet, restore bool) error {
s.once.Do(func() {
s.initErr = n.initSubnetSandbox(s, restore)
})
return s.initErr
subnetErr := s.initErr
if !s.sboxInit {
subnetErr := n.initSubnetSandbox(s, restore)
// We can recover from these errors, but not on restore
if restore || subnetErr == nil {
s.initErr = subnetErr
s.sboxInit = true
}
}
if subnetErr != nil {
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
}

if incJoinCnt {
n.joinCnt++
}

return nil
}

func (n *network) leaveSandbox() {
n.Lock()
defer n.Unlock()
n.joinLock.Lock()
defer n.joinLock.Unlock()

n.joinCnt--
if n.joinCnt != 0 {
return
}

// We are about to destroy sandbox since the container is leaving the network
// Reinitialize the once variable so that we will be able to trigger one time
// sandbox initialization(again) when another container joins subsequently.
n.once = &sync.Once{}
n.Lock()
defer n.Unlock()

n.destroySandbox()

n.sboxInit = false
n.initErr = nil
for _, s := range n.subnets {
s.once = &sync.Once{}
s.sboxInit = false
s.initErr = nil
}

n.destroySandbox()
}

// to be called while holding network lock
Expand Down Expand Up @@ -638,9 +651,6 @@ func (n *network) initSubnetSandbox(s *subnet, restore bool) error {
}
} else {
if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil {
// The error in setupSubnetSandbox could be a temporary glitch. reset the
// subnet once object to allow the setup to be retried on another endpoint join.
s.once = &sync.Once{}
return err
}
}
Expand Down Expand Up @@ -693,8 +703,6 @@ func (n *network) initSandbox(restore bool) error {
n.initEpoch++
n.Unlock()

networkOnce.Do(networkOnceInit)

if !restore {
if hostMode {
if err := addNetworkChain(n.id[:12]); err != nil {
Expand Down Expand Up @@ -848,7 +856,6 @@ func (d *driver) restoreNetworkFromStore(nid string) *network {
if n != nil {
n.driver = d
n.endpoints = endpointTable{}
n.once = &sync.Once{}
d.networks[nid] = n
}
return n
Expand Down Expand Up @@ -1009,7 +1016,6 @@ func (n *network) SetValue(value []byte) error {
subnetIP: subnetIP,
gwIP: gwIP,
vni: vni,
once: &sync.Once{},
}
n.subnets = append(n.subnets, s)
} else {
Expand Down
19 changes: 2 additions & 17 deletions drivers/overlay/overlay.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,17 +105,6 @@ func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
logrus.Warnf("Failure during overlay endpoints restore: %v", err)
}

// If an error happened when the network join the sandbox during the endpoints restore
// we should reset it now along with the once variable, so that subsequent endpoint joins
// outside of the restore path can potentially fix the network join and succeed.
for nid, n := range d.networks {
if n.initErr != nil {
logrus.Infof("resetting init error and once variable for network %s after unsuccessful endpoint restore: %v", nid, n.initErr)
n.initErr = nil
n.once = &sync.Once{}
}
}

return dc.RegisterDriver(networkType, d, c)
}

Expand Down Expand Up @@ -151,25 +140,21 @@ func (d *driver) restoreEndpoints() error {
return fmt.Errorf("could not find subnet for endpoint %s", ep.id)
}

if err := n.joinSandbox(true); err != nil {
if err := n.joinSandbox(s, true, true); err != nil {
return fmt.Errorf("restore network sandbox failed: %v", err)
}

if err := n.joinSubnetSandbox(s, true); err != nil {
return fmt.Errorf("restore subnet sandbox failed for %q: %v", s.subnetIP.String(), err)
}

Ifaces := make(map[string][]osl.IfaceOption)
vethIfaceOption := make([]osl.IfaceOption, 1)
vethIfaceOption = append(vethIfaceOption, n.sbox.InterfaceOptions().Master(s.brName))
Ifaces["veth+veth"] = vethIfaceOption

err := n.sbox.Restore(Ifaces, nil, nil, nil)
if err != nil {
n.leaveSandbox()
return fmt.Errorf("failed to restore overlay sandbox: %v", err)
}

n.incEndpointCount()
d.peerAdd(ep.nid, ep.id, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true)
}
return nil
Expand Down
2 changes: 1 addition & 1 deletion drivers/overlay/peerdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
}

if err := n.joinSubnetSandbox(s, false); err != nil {
if err := n.joinSandbox(s, false, false); err != nil {
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
}

Expand Down

0 comments on commit 0026dc2

Please sign in to comment.