diff options
author | Tsachi Herman <tsachi.herman@algorand.com> | 2022-01-14 14:16:57 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-14 14:16:57 -0500 |
commit | 4006ce219ebcebbb65c621b9f04362b45b256065 (patch) | |
tree | 5c73ed03dd6693eb2bb9b3ba2cc72361879afa68 | |
parent | 9d073692bb9617d3d4a9a892feda147a508be8fb (diff) |
network: faster node shutdown (#3416)
During the node shutdown, all the current outgoing connections are being disconnected.
Since these connections are web sockets, they require a close connection message to be sent.
However, sending this message can take a while, and in situations where the other party has already shut down, we might never get a response. That, in turn, would lead the node waiting until the deadline is reached.
The current deadline was 5 seconds. This PR changes the deadline during shutdown to be 50ms.
-rw-r--r-- | network/wsNetwork.go | 21 | ||||
-rw-r--r-- | network/wsPeer.go | 10 |
2 files changed, 21 insertions, 10 deletions
diff --git a/network/wsNetwork.go b/network/wsNetwork.go index 7768addf2..8bde3040d 100644 --- a/network/wsNetwork.go +++ b/network/wsNetwork.go @@ -125,6 +125,12 @@ var peers = metrics.MakeGauge(metrics.MetricName{Name: "algod_network_peers", De var incomingPeers = metrics.MakeGauge(metrics.MetricName{Name: "algod_network_incoming_peers", Description: "Number of active incoming peers."}) var outgoingPeers = metrics.MakeGauge(metrics.MetricName{Name: "algod_network_outgoing_peers", Description: "Number of active outgoing peers."}) +// peerDisconnectionAckDuration defines the time we would wait for the peer disconnection to compelete. +const peerDisconnectionAckDuration time.Duration = 5 * time.Second + +// peerShutdownDisconnectionAckDuration defines the time we would wait for the peer disconnection to compelete during shutdown. +const peerShutdownDisconnectionAckDuration time.Duration = 50 * time.Millisecond + // Peer opaque interface for referring to a neighbor in the network type Peer interface{} @@ -542,13 +548,13 @@ func (wn *WebsocketNetwork) disconnect(badnode Peer, reason disconnectReason) { return } peer := badnode.(*wsPeer) - peer.CloseAndWait() + peer.CloseAndWait(time.Now().Add(peerDisconnectionAckDuration)) wn.removePeer(peer, reason) } -func closeWaiter(wg *sync.WaitGroup, peer *wsPeer) { +func closeWaiter(wg *sync.WaitGroup, peer *wsPeer, deadline time.Time) { defer wg.Done() - peer.CloseAndWait() + peer.CloseAndWait(deadline) } // DisconnectPeers shuts down all connections @@ -557,8 +563,9 @@ func (wn *WebsocketNetwork) DisconnectPeers() { defer wn.peersLock.Unlock() closeGroup := sync.WaitGroup{} closeGroup.Add(len(wn.peers)) + deadline := time.Now().Add(peerDisconnectionAckDuration) for _, peer := range wn.peers { - go closeWaiter(&closeGroup, peer) + go closeWaiter(&closeGroup, peer, deadline) } wn.peers = wn.peers[:0] closeGroup.Wait() @@ -812,8 +819,12 @@ func (wn *WebsocketNetwork) innerStop() { wn.peersLock.Lock() defer wn.peersLock.Unlock() wn.wg.Add(len(wn.peers)) + // this method is called only during node shutdown. In this case, we want to send the + // shutdown message, but we don't want to wait for a long time - since we might not be lucky + // to get a response. + deadline := time.Now().Add(peerShutdownDisconnectionAckDuration) for _, peer := range wn.peers { - go closeWaiter(&wn.wg, peer) + go closeWaiter(&wn.wg, peer, deadline) } wn.peers = wn.peers[:0] } diff --git a/network/wsPeer.go b/network/wsPeer.go index e0d209c41..337dae07b 100644 --- a/network/wsPeer.go +++ b/network/wsPeer.go @@ -754,15 +754,15 @@ func (wp *wsPeer) internalClose(reason disconnectReason) { if atomic.CompareAndSwapInt32(&wp.didSignalClose, 0, 1) { wp.net.peerRemoteClose(wp, reason) } - wp.Close() + wp.Close(time.Now().Add(peerDisconnectionAckDuration)) } // called either here or from above enclosing node logic -func (wp *wsPeer) Close() { +func (wp *wsPeer) Close(deadline time.Time) { atomic.StoreInt32(&wp.didSignalClose, 1) if atomic.CompareAndSwapInt32(&wp.didInnerClose, 0, 1) { close(wp.closing) - err := wp.conn.WriteControl(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, ""), time.Now().Add(5*time.Second)) + err := wp.conn.WriteControl(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, ""), deadline) if err != nil { wp.net.log.Infof("failed to write CloseMessage to connection for %s", wp.conn.RemoteAddr().String()) } @@ -774,8 +774,8 @@ func (wp *wsPeer) Close() { } // CloseAndWait internally calls Close() then waits for all peer activity to stop -func (wp *wsPeer) CloseAndWait() { - wp.Close() +func (wp *wsPeer) CloseAndWait(deadline time.Time) { + wp.Close(deadline) wp.wg.Wait() } |