catchup: suspend the catchup session once the agreement service kicks in (#3299)

The catchup service stops when it is complete, i.e. when it has reached the round that is currently being agreed on. The catchup service knows it is complete, and should stop, when it finds that a block is already in the ledger before it adds it. In other words, apart from the catchup, only the agreement adds blocks to the ledger, so when the agreement adds a block to the ledger before the catchup does, the agreement is ahead and the catchup is complete. When `fetchAndWrite` detects that the block is already in the ledger, it returns, and a return value of `false` stops the catchup syncing. In previous releases, `fetchAndWrite` only checked whether the block was already in the ledger after attempting to fetch it. Since a block that has not yet been agreed on cannot be fetched, the fetch failed after multiple attempts, and `fetchAndWrite` returned `false`, ending the catchup. A recent change made this process more efficient by first checking whether the block is in the ledger before/during the fetch. However, once the block was found in the ledger, `fetchAndWrite` returned `true` instead of `false` (consistent with the long-standing existing logic, which was also wrong). This caused the catchup to continue syncing after it was complete. This change fixes the return value from `true` to `false`.
author: Shant Karakashian <55754073+algonautshant@users.noreply.github.com> 2021-12-09 22:06:47 -0500
committer: John Lee <john.lee@algorand.com> 2021-12-09 22:20:11 -0500
commit: 47917068283c6a0ffef26e442d9df0c6494005aa (patch)
tree: 802bbf7549256692a35710adea8d0b680a91f006
parent: 4860375c526218fb621749397aff39ab875b6037 (diff)
1 files changed, 9 insertions, 5 deletions
diff --git a/catchup/service.go b/catchup/service.go
index 27ce957ba..de4a52a94 100644
--- a/catchup/service.go
+++ b/catchup/service.go
@@ -241,8 +241,10 @@ func (s *Service) fetchAndWrite(r basics.Round, prevFetchCompleteChan chan bool,
 
 		if err != nil {
 			if err == errLedgerAlreadyHasBlock {
-				// ledger already has the block, no need to request this block from anyone.
-				return true
+				// ledger already has the block, no need to request this block.
+				// only the agreement could have added this block into the ledger, catchup is complete
+				s.log.Infof("fetchAndWrite(%d): the block is already in the ledger. The catchup is complete", r)
+				return false
 			}
 			s.log.Debugf("fetchAndWrite(%v): Could not fetch: %v (attempt %d)", r, err, i)
 			peerSelector.rankPeer(psp, peerRankDownloadFailed)
@@ -353,8 +355,10 @@ func (s *Service) fetchAndWrite(r basics.Round, prevFetchCompleteChan chan bool,
 						s.log.Infof("fetchAndWrite(%d): no need to re-evaluate historical block", r)
 						return true
 					case ledgercore.BlockInLedgerError:
-						s.log.Infof("fetchAndWrite(%d): block already in ledger", r)
-						return true
+						// the block was added to the ledger from elsewhere after fetching it here
+						// only the agreement could have added this block into the ledger, catchup is complete
+						s.log.Infof("fetchAndWrite(%d): after fetching the block, it is already in the ledger. The catchup is complete", r)
+						return false
 					case protocol.Error:
 						if !s.protocolErrorLogged {
 							logging.Base().Errorf("fetchAndWrite(%v): unrecoverable protocol error detected: %v", r, err)
@@ -387,7 +391,7 @@ func (s *Service) pipelineCallback(r basics.Round, thisFetchComplete chan bool,
 		thisFetchComplete <- fetchResult
 
 		if !fetchResult {
-			s.log.Infof("failed to fetch block %v", r)
+			s.log.Infof("pipelineCallback(%d): did not fetch or write the block", r)
 			return 0
 		}
 		return r
author	Shant Karakashian <55754073+algonautshant@users.noreply.github.com>	2021-12-09 22:06:47 -0500
committer	John Lee <john.lee@algorand.com>	2021-12-09 22:20:11 -0500
commit	47917068283c6a0ffef26e442d9df0c6494005aa (patch)
tree	802bbf7549256692a35710adea8d0b680a91f006
parent	4860375c526218fb621749397aff39ab875b6037 (diff)