summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTsachi Herman <tsachi.herman@algorand.com>2022-01-14 14:16:57 -0500
committerGitHub <noreply@github.com>2022-01-14 14:16:57 -0500
commit4006ce219ebcebbb65c621b9f04362b45b256065 (patch)
tree5c73ed03dd6693eb2bb9b3ba2cc72361879afa68
parent9d073692bb9617d3d4a9a892feda147a508be8fb (diff)
network: faster node shutdown (#3416)
During the node shutdown, all the current outgoing connections are being disconnected. Since these connections are web sockets, they require a close connection message to be sent. However, sending this message can take a while, and in situations where the other party has already shut down, we might never get a response. That, in turn, would lead the node waiting until the deadline is reached. The current deadline was 5 seconds. This PR changes the deadline during shutdown to be 50ms.
-rw-r--r--network/wsNetwork.go21
-rw-r--r--network/wsPeer.go10
2 files changed, 21 insertions, 10 deletions
diff --git a/network/wsNetwork.go b/network/wsNetwork.go
index 7768addf2..8bde3040d 100644
--- a/network/wsNetwork.go
+++ b/network/wsNetwork.go
@@ -125,6 +125,12 @@ var peers = metrics.MakeGauge(metrics.MetricName{Name: "algod_network_peers", De
var incomingPeers = metrics.MakeGauge(metrics.MetricName{Name: "algod_network_incoming_peers", Description: "Number of active incoming peers."})
var outgoingPeers = metrics.MakeGauge(metrics.MetricName{Name: "algod_network_outgoing_peers", Description: "Number of active outgoing peers."})
+// peerDisconnectionAckDuration defines the time we would wait for the peer disconnection to compelete.
+const peerDisconnectionAckDuration time.Duration = 5 * time.Second
+
+// peerShutdownDisconnectionAckDuration defines the time we would wait for the peer disconnection to compelete during shutdown.
+const peerShutdownDisconnectionAckDuration time.Duration = 50 * time.Millisecond
+
// Peer opaque interface for referring to a neighbor in the network
type Peer interface{}
@@ -542,13 +548,13 @@ func (wn *WebsocketNetwork) disconnect(badnode Peer, reason disconnectReason) {
return
}
peer := badnode.(*wsPeer)
- peer.CloseAndWait()
+ peer.CloseAndWait(time.Now().Add(peerDisconnectionAckDuration))
wn.removePeer(peer, reason)
}
-func closeWaiter(wg *sync.WaitGroup, peer *wsPeer) {
+func closeWaiter(wg *sync.WaitGroup, peer *wsPeer, deadline time.Time) {
defer wg.Done()
- peer.CloseAndWait()
+ peer.CloseAndWait(deadline)
}
// DisconnectPeers shuts down all connections
@@ -557,8 +563,9 @@ func (wn *WebsocketNetwork) DisconnectPeers() {
defer wn.peersLock.Unlock()
closeGroup := sync.WaitGroup{}
closeGroup.Add(len(wn.peers))
+ deadline := time.Now().Add(peerDisconnectionAckDuration)
for _, peer := range wn.peers {
- go closeWaiter(&closeGroup, peer)
+ go closeWaiter(&closeGroup, peer, deadline)
}
wn.peers = wn.peers[:0]
closeGroup.Wait()
@@ -812,8 +819,12 @@ func (wn *WebsocketNetwork) innerStop() {
wn.peersLock.Lock()
defer wn.peersLock.Unlock()
wn.wg.Add(len(wn.peers))
+ // this method is called only during node shutdown. In this case, we want to send the
+ // shutdown message, but we don't want to wait for a long time - since we might not be lucky
+ // to get a response.
+ deadline := time.Now().Add(peerShutdownDisconnectionAckDuration)
for _, peer := range wn.peers {
- go closeWaiter(&wn.wg, peer)
+ go closeWaiter(&wn.wg, peer, deadline)
}
wn.peers = wn.peers[:0]
}
diff --git a/network/wsPeer.go b/network/wsPeer.go
index e0d209c41..337dae07b 100644
--- a/network/wsPeer.go
+++ b/network/wsPeer.go
@@ -754,15 +754,15 @@ func (wp *wsPeer) internalClose(reason disconnectReason) {
if atomic.CompareAndSwapInt32(&wp.didSignalClose, 0, 1) {
wp.net.peerRemoteClose(wp, reason)
}
- wp.Close()
+ wp.Close(time.Now().Add(peerDisconnectionAckDuration))
}
// called either here or from above enclosing node logic
-func (wp *wsPeer) Close() {
+func (wp *wsPeer) Close(deadline time.Time) {
atomic.StoreInt32(&wp.didSignalClose, 1)
if atomic.CompareAndSwapInt32(&wp.didInnerClose, 0, 1) {
close(wp.closing)
- err := wp.conn.WriteControl(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, ""), time.Now().Add(5*time.Second))
+ err := wp.conn.WriteControl(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, ""), deadline)
if err != nil {
wp.net.log.Infof("failed to write CloseMessage to connection for %s", wp.conn.RemoteAddr().String())
}
@@ -774,8 +774,8 @@ func (wp *wsPeer) Close() {
}
// CloseAndWait internally calls Close() then waits for all peer activity to stop
-func (wp *wsPeer) CloseAndWait() {
- wp.Close()
+func (wp *wsPeer) CloseAndWait(deadline time.Time) {
+ wp.Close(deadline)
wp.wg.Wait()
}