diff options
author | Shant Karakashian <55754073+algonautshant@users.noreply.github.com> | 2021-12-09 22:06:47 -0500 |
---|---|---|
committer | John Lee <john.lee@algorand.com> | 2021-12-09 22:20:11 -0500 |
commit | 47917068283c6a0ffef26e442d9df0c6494005aa (patch) | |
tree | 802bbf7549256692a35710adea8d0b680a91f006 | |
parent | 4860375c526218fb621749397aff39ab875b6037 (diff) |
catchup: suspend the catchup session once the agreement service kicks in (#3299)
The catchup service stops when it is complete, i.e. when it has caught up to the round that is currently being agreed on.
The catchup service knows it is complete and should stop when it finds that a block is already in the ledger before the catchup service itself has added it.
In other words, apart from the catchup, only the agreement adds blocks to the ledger. And when the agreement adds a block to the ledger before the catchup, it means the agreement is ahead, and the catchup is complete.
When `fetchAndWrite` detects the block is already in the ledger, it returns. The return value of `false` stops the catchup syncing.
In previous releases, `fetchAndWrite` was only checking if the block is already in the ledger after attempting to fetch it.
Since a block that has not yet been agreed on cannot be fetched, the fetch fails after multiple attempts, and `fetchAndWrite` returns `false`, ending the catchup.
A recent change made this process more efficient by first checking if the block is in the ledger before/during the fetch.
However, once the block was found in the ledger, `fetchAndWrite` returned `true` instead of `false` (consistent with the long-standing existing logic, which was also wrong). This caused the catchup to continue syncing after the catchup was complete.
This change fixes the return value in both places, changing it from `true` to `false`.
-rw-r--r-- | catchup/service.go | 14 |
1 files changed, 9 insertions, 5 deletions
diff --git a/catchup/service.go b/catchup/service.go index 27ce957ba..de4a52a94 100644 --- a/catchup/service.go +++ b/catchup/service.go @@ -241,8 +241,10 @@ func (s *Service) fetchAndWrite(r basics.Round, prevFetchCompleteChan chan bool, if err != nil { if err == errLedgerAlreadyHasBlock { - // ledger already has the block, no need to request this block from anyone. - return true + // ledger already has the block, no need to request this block. + // only the agreement could have added this block into the ledger, catchup is complete + s.log.Infof("fetchAndWrite(%d): the block is already in the ledger. The catchup is complete", r) + return false } s.log.Debugf("fetchAndWrite(%v): Could not fetch: %v (attempt %d)", r, err, i) peerSelector.rankPeer(psp, peerRankDownloadFailed) @@ -353,8 +355,10 @@ func (s *Service) fetchAndWrite(r basics.Round, prevFetchCompleteChan chan bool, s.log.Infof("fetchAndWrite(%d): no need to re-evaluate historical block", r) return true case ledgercore.BlockInLedgerError: - s.log.Infof("fetchAndWrite(%d): block already in ledger", r) - return true + // the block was added to the ledger from elsewhere after fetching it here + // only the agreement could have added this block into the ledger, catchup is complete + s.log.Infof("fetchAndWrite(%d): after fetching the block, it is already in the ledger. The catchup is complete", r) + return false case protocol.Error: if !s.protocolErrorLogged { logging.Base().Errorf("fetchAndWrite(%v): unrecoverable protocol error detected: %v", r, err) @@ -387,7 +391,7 @@ func (s *Service) pipelineCallback(r basics.Round, thisFetchComplete chan bool, thisFetchComplete <- fetchResult if !fetchResult { - s.log.Infof("failed to fetch block %v", r) + s.log.Infof("pipelineCallback(%d): did not fetch or write the block", r) return 0 } return r |