diff options
author | John Lee <64482439+algojohnlee@users.noreply.github.com> | 2020-08-17 19:27:35 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-17 19:27:35 -0400 |
commit | f67af107affa268c524224fc7213914b5664001b (patch) | |
tree | dd5b5f2fe2b8d9487ac26f3103fc84b41e3c1093 | |
parent | 2fc191356106c0edaf09191669c0b555b7b196c9 (diff) | |
parent | 271860a8a039d9ca730bba7b0027f641f155074f (diff) |
Merge pull request #1411 from onetechnical/onetechnical/relbeta2.1.3 v2.1.3-beta
Onetechnical/relbeta2.1.3
-rw-r--r-- | buildnumber.dat | 2 | ||||
-rw-r--r-- | catchup/service.go | 12 | ||||
-rw-r--r-- | catchup/service_test.go | 4 | ||||
-rw-r--r-- | cmd/goal/node.go | 4 | ||||
-rw-r--r-- | config/config.go | 9 | ||||
-rw-r--r-- | config/local_defaults.go | 3 | ||||
-rw-r--r-- | installer/config.json.example | 3 | ||||
-rw-r--r-- | ledger/acctupdates.go | 40 | ||||
-rw-r--r-- | ledger/ledger.go | 8 | ||||
-rwxr-xr-x | scripts/compute_branch_deadlock_default.sh | 13 | ||||
-rwxr-xr-x | scripts/travis/build_test.sh | 2 | ||||
-rwxr-xr-x | scripts/travis/integration_test.sh | 3 | ||||
-rwxr-xr-x | scripts/travis/test_release.sh | 2 | ||||
-rw-r--r-- | test/testdata/configs/config-v11.json | 77 |
14 files changed, 171 insertions, 11 deletions
diff --git a/buildnumber.dat b/buildnumber.dat index 0cfbf0888..00750edc0 100644 --- a/buildnumber.dat +++ b/buildnumber.dat @@ -1 +1 @@ -2 +3 diff --git a/catchup/service.go b/catchup/service.go index d580d3992..247b48d81 100644 --- a/catchup/service.go +++ b/catchup/service.go @@ -56,6 +56,7 @@ type Ledger interface { EnsureBlock(block *bookkeeping.Block, c agreement.Certificate) LastRound() basics.Round Block(basics.Round) (bookkeeping.Block, error) + IsWritingCatchpointFile() bool } // Service represents the catchup service. Once started and until it is stopped, it ensures that the ledger is up to date with network. @@ -385,6 +386,12 @@ func (s *Service) pipelinedFetch(seedLookback uint64) { // there was an error return } + // if we're writing a catchpoint file, stop catching up to reduce the memory pressure. Once we finish writing the file we + // could resume with the catchup. + if s.ledger.IsWritingCatchpointFile() { + s.log.Info("Catchup is stopping due to catchpoint file being written") + return + } completedRounds[round] = true // fetch rounds we can validate for completedRounds[nextRound-basics.Round(parallelRequests)] { @@ -436,6 +443,11 @@ func (s *Service) periodicSync() { sleepDuration = s.deadlineTimeout continue } + // check to see if we're currently writing a catchpoint file. If so, wait longer before attempting again. + if s.ledger.IsWritingCatchpointFile() { + // keep the existing sleep duration and try again later. 
+ continue + } s.log.Info("It's been too long since our ledger advanced; resyncing") s.sync(nil) case cert := <-s.unmatchedPendingCertificates: diff --git a/catchup/service_test.go b/catchup/service_test.go index 090ce3c21..cc451c7dd 100644 --- a/catchup/service_test.go +++ b/catchup/service_test.go @@ -577,6 +577,10 @@ func (m *mockedLedger) LookupDigest(basics.Round) (crypto.Digest, error) { return crypto.Digest{}, errors.New("not needed for mockedLedger") } +func (m *mockedLedger) IsWritingCatchpointFile() bool { + return false +} + func testingenv(t testing.TB, numBlocks int) (ledger, emptyLedger Ledger) { mLedger := new(mockedLedger) mEmptyLedger := new(mockedLedger) diff --git a/cmd/goal/node.go b/cmd/goal/node.go index 95f2a6c0c..74b8daaef 100644 --- a/cmd/goal/node.go +++ b/cmd/goal/node.go @@ -422,8 +422,8 @@ func makeStatusString(stat generatedV2.NodeStatusResponse) string { *stat.Catchpoint) if stat.CatchpointTotalAccounts != nil && (*stat.CatchpointTotalAccounts > 0) && stat.CatchpointProcessedAccounts != nil { - statusString = statusString + "\n" + fmt.Sprintf(infoNodeCatchpointCatchupAccounts, *stat.CatchpointProcessedAccounts, - *stat.CatchpointTotalAccounts) + statusString = statusString + "\n" + fmt.Sprintf(infoNodeCatchpointCatchupAccounts, *stat.CatchpointTotalAccounts, + *stat.CatchpointProcessedAccounts) } if stat.CatchpointAcquiredBlocks != nil && stat.CatchpointTotalBlocks != nil && (*stat.CatchpointAcquiredBlocks+*stat.CatchpointTotalBlocks > 0) { statusString = statusString + "\n" + fmt.Sprintf(infoNodeCatchpointCatchupBlocks, *stat.CatchpointTotalBlocks, diff --git a/config/config.go b/config/config.go index 19d8b6043..86bbc6018 100644 --- a/config/config.go +++ b/config/config.go @@ -63,7 +63,7 @@ type Local struct { // Version tracks the current version of the defaults so we can migrate old -> new // This is specifically important whenever we decide to change the default value // for an existing parameter. 
This field tag must be updated any time we add a new version. - Version uint32 `version[0]:"0" version[1]:"1" version[2]:"2" version[3]:"3" version[4]:"4" version[5]:"5" version[6]:"6" version[7]:"7" version[8]:"8" version[9]:"9" version[10]:"10"` + Version uint32 `version[0]:"0" version[1]:"1" version[2]:"2" version[3]:"3" version[4]:"4" version[5]:"5" version[6]:"6" version[7]:"7" version[8]:"8" version[9]:"9" version[10]:"10" version[11]:"11"` // environmental (may be overridden) // When enabled, stores blocks indefinitally, otherwise, only the most recents blocks @@ -328,6 +328,13 @@ type Local struct { // OptimizeAccountsDatabaseOnStartup controls whether the accounts database would be optimized // on algod startup. OptimizeAccountsDatabaseOnStartup bool `version[10]:"false"` + + // CatchpointTracking determines if catchpoints are going to be tracked. The value is interpreted as follows: + // A of -1 means "don't track catchpoints". + // A value of 1 means "track catchpoints as long as CatchpointInterval is also set to a positive non-zero value". If CatchpointInterval <= 0, no catchpoint tracking would be performed. + // A value of 0 means automatic, which is the default value. In this mode, a non archival node would not track the catchpoints, and an archival node would track the catchpoints as long as CatchpointInterval > 0. + // Other values of CatchpointTracking would give a warning in the log file, and would behave as if the default value was provided. + CatchpointTracking int64 `version[11]:"0"` } // Filenames of config files within the configdir (e.g. 
~/.algorand) diff --git a/config/local_defaults.go b/config/local_defaults.go index 07a4647e4..28743e4c8 100644 --- a/config/local_defaults.go +++ b/config/local_defaults.go @@ -20,7 +20,7 @@ package config var defaultLocal = Local{ - Version: 10, + Version: 11, AnnounceParticipationKey: true, Archival: false, BaseLoggerDebugLevel: 4, @@ -28,6 +28,7 @@ var defaultLocal = Local{ CadaverSizeTarget: 1073741824, CatchpointFileHistoryLength: 365, CatchpointInterval: 10000, + CatchpointTracking: 0, CatchupBlockDownloadRetryAttempts: 1000, CatchupFailurePeerRefreshRate: 10, CatchupGossipBlockFetchTimeoutSec: 4, diff --git a/installer/config.json.example b/installer/config.json.example index 93fe6229b..c29679a17 100644 --- a/installer/config.json.example +++ b/installer/config.json.example @@ -1,5 +1,5 @@ { - "Version": 10, + "Version": 11, "AnnounceParticipationKey": true, "Archival": false, "BaseLoggerDebugLevel": 4, @@ -7,6 +7,7 @@ "CadaverSizeTarget": 1073741824, "CatchpointFileHistoryLength": 365, "CatchpointInterval": 10000, + "CatchpointTracking": 0, "CatchupBlockDownloadRetryAttempts": 1000, "CatchupFailurePeerRefreshRate": 10, "CatchupGossipBlockFetchTimeoutSec": 4, diff --git a/ledger/acctupdates.go b/ledger/acctupdates.go index c33648dee..fc2928c03 100644 --- a/ledger/acctupdates.go +++ b/ledger/acctupdates.go @@ -218,7 +218,23 @@ func (au *accountUpdates) initialize(cfg config.Local, dbPathPrefix string, gene au.initAccounts = genesisAccounts au.dbDirectory = filepath.Dir(dbPathPrefix) au.archivalLedger = cfg.Archival - au.catchpointInterval = cfg.CatchpointInterval + switch cfg.CatchpointTracking { + case -1: + au.catchpointInterval = 0 + default: + // give a warning, then fall thought + logging.Base().Warnf("accountUpdates: the CatchpointTracking field in the config.json file contains an invalid value (%d). 
The default value of 0 would be used instead.", cfg.CatchpointTracking) + fallthrough + case 0: + if au.archivalLedger { + au.catchpointInterval = cfg.CatchpointInterval + } else { + au.catchpointInterval = 0 + } + case 1: + au.catchpointInterval = cfg.CatchpointInterval + } + au.catchpointFileHistoryLength = cfg.CatchpointFileHistoryLength if cfg.CatchpointFileHistoryLength < -1 { au.catchpointFileHistoryLength = -1 @@ -275,6 +291,22 @@ func (au *accountUpdates) close() { <-au.commitSyncerClosed } +// IsWritingCatchpointFile returns true when a catchpoint file is being generated. The function is used by the catchup service +// to avoid memory pressure until the catchpoint file writing is complete. +func (au *accountUpdates) IsWritingCatchpointFile() bool { + au.accountsMu.Lock() + defer au.accountsMu.Unlock() + // if we're still writing the previous balances, we can't move forward yet. + select { + case <-au.catchpointWriting: + // the channel catchpointWriting is currently closed, meaning that we're currently not writing any + // catchpoint file. + return false + default: + return true + } +} + // Lookup returns the accound data for a given address at a given round. The withRewards indicates whether the // rewards should be added to the AccountData before returning. Note that the function doesn't update the account with the rewards, // even while it could return the AccoutData which represent the "rewarded" account data. @@ -1579,6 +1611,12 @@ func (au *accountUpdates) generateCatchpoint(committedRound basics.Round, label db.ResetTransactionWarnDeadline(ctx, tx, time.Now().Add(1*time.Second)) select { case <-time.After(100 * time.Millisecond): + // increase the time slot allocated for writing the catchpoint, but stop when we get to the longChunkExecutionDuration limit. + // this would allow the catchpoint writing speed to ramp up while still leaving some cpu available. 
+ chunkExecutionDuration *= 2 + if chunkExecutionDuration > longChunkExecutionDuration { + chunkExecutionDuration = longChunkExecutionDuration + } case <-au.ctx.Done(): retryCatchpointCreation = true err2 := catchpointWriter.Abort() diff --git a/ledger/ledger.go b/ledger/ledger.go index 98b49e5b6..8c6181c93 100644 --- a/ledger/ledger.go +++ b/ledger/ledger.go @@ -584,6 +584,14 @@ func (l *Ledger) trackerEvalVerified(blk bookkeeping.Block, accUpdatesLedger led return eval(context.Background(), accUpdatesLedger, blk, false, nil, nil) } +// IsWritingCatchpointFile returns true when a catchpoint file is being generated. The function is used by the catchup service +// to avoid memory pressure until the catchpoint file writing is complete. +func (l *Ledger) IsWritingCatchpointFile() bool { + l.trackerMu.RLock() + defer l.trackerMu.RUnlock() + return l.accts.IsWritingCatchpointFile() +} + // A txlease is a transaction (sender, lease) pair which uniquely specifies a // transaction lease. type txlease struct { diff --git a/scripts/compute_branch_deadlock_default.sh b/scripts/compute_branch_deadlock_default.sh index 3285114ca..e5f81e140 100755 --- a/scripts/compute_branch_deadlock_default.sh +++ b/scripts/compute_branch_deadlock_default.sh @@ -1,7 +1,12 @@ #!/usr/bin/env bash -if [[ "$1" =~ ^rel/ ]]; then - echo "disable" -else - echo "enable" +# if the user ( i.e. algorand developer ) has explicitly enabled the deadlock detection in his environment, we want to enable it. +if [ "$ALGORAND_DEADLOCK" != "" ]; then + echo "$ALGORAND_DEADLOCK" + exit 0 fi + +# we used to disable the deadlock on all release builds, which cause issues with individuals who compiled it on their own. +# as a result, we decided to disable it always, unless we're running on travis. If we'll ever want to make it dependent +# on the build branch, the build branch is available in $1. ( i.e. if [[ "$1" =~ ^rel/ ]]; then ... 
) +echo "disable" diff --git a/scripts/travis/build_test.sh b/scripts/travis/build_test.sh index e22bcfb5f..a52ce6478 100755 --- a/scripts/travis/build_test.sh +++ b/scripts/travis/build_test.sh @@ -9,6 +9,8 @@ # Examples: scripts/travis/build_test.sh set -e +ALGORAND_DEADLOCK=enable +export ALGORAND_DEADLOCK SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" if [ "${USER}" = "travis" ]; then diff --git a/scripts/travis/integration_test.sh b/scripts/travis/integration_test.sh index 6a7301152..ac41ba68e 100755 --- a/scripts/travis/integration_test.sh +++ b/scripts/travis/integration_test.sh @@ -9,6 +9,9 @@ # Examples: scripts/travis/integration_test.sh set -e +ALGORAND_DEADLOCK=enable +export ALGORAND_DEADLOCK + SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" export BUILD_TYPE="integration" diff --git a/scripts/travis/test_release.sh b/scripts/travis/test_release.sh index ea176a640..458186bbb 100755 --- a/scripts/travis/test_release.sh +++ b/scripts/travis/test_release.sh @@ -8,6 +8,8 @@ set -e +ALGORAND_DEADLOCK=enable +export ALGORAND_DEADLOCK BRANCH=$(./scripts/compute_branch.sh) CHANNEL=$(./scripts/compute_branch_channel.sh "$BRANCH") diff --git a/test/testdata/configs/config-v11.json b/test/testdata/configs/config-v11.json new file mode 100644 index 000000000..c29679a17 --- /dev/null +++ b/test/testdata/configs/config-v11.json @@ -0,0 +1,77 @@ +{ + "Version": 11, + "AnnounceParticipationKey": true, + "Archival": false, + "BaseLoggerDebugLevel": 4, + "BroadcastConnectionsLimit": -1, + "CadaverSizeTarget": 1073741824, + "CatchpointFileHistoryLength": 365, + "CatchpointInterval": 10000, + "CatchpointTracking": 0, + "CatchupBlockDownloadRetryAttempts": 1000, + "CatchupFailurePeerRefreshRate": 10, + "CatchupGossipBlockFetchTimeoutSec": 4, + "CatchupHTTPBlockFetchTimeoutSec": 4, + "CatchupLedgerDownloadRetryAttempts": 50, + "CatchupParallelBlocks": 16, + "ConnectionsRateLimitingCount": 60, + "ConnectionsRateLimitingWindowSeconds": 1, + "DNSBootstrapID": 
"<network>.algorand.network", + "DNSSecurityFlags": 1, + "DeadlockDetection": 0, + "DisableOutgoingConnectionThrottling": false, + "EnableAgreementReporting": false, + "EnableAgreementTimeMetrics": false, + "EnableAssembleStats": false, + "EnableBlockService": false, + "EnableDeveloperAPI": false, + "EnableGossipBlockService": true, + "EnableIncomingMessageFilter": false, + "EnableLedgerService": false, + "EnableMetricReporting": false, + "EnableOutgoingNetworkMessageFiltering": true, + "EnablePingHandler": true, + "EnableProcessBlockStats": false, + "EnableProfiler": false, + "EnableRequestLogger": false, + "EnableTopAccountsReporting": false, + "EndpointAddress": "127.0.0.1:0", + "FallbackDNSResolverAddress": "", + "ForceRelayMessages": false, + "GossipFanout": 4, + "IncomingConnectionsLimit": 10000, + "IncomingMessageFilterBucketCount": 5, + "IncomingMessageFilterBucketSize": 512, + "IsIndexerActive": false, + "LogArchiveMaxAge": "", + "LogArchiveName": "node.archive.log", + "LogSizeLimit": 1073741824, + "MaxConnectionsPerIP": 30, + "NetAddress": "", + "NetworkProtocolVersion": "", + "NodeExporterListenAddress": ":9100", + "NodeExporterPath": "./node_exporter", + "OptimizeAccountsDatabaseOnStartup": false, + "OutgoingMessageFilterBucketCount": 3, + "OutgoingMessageFilterBucketSize": 128, + "PeerConnectionsUpdateInterval": 3600, + "PeerPingPeriodSeconds": 0, + "PriorityPeers": {}, + "PublicAddress": "", + "ReconnectTime": 60000000000, + "ReservedFDs": 256, + "RestReadTimeoutSeconds": 15, + "RestWriteTimeoutSeconds": 120, + "RunHosted": false, + "SuggestedFeeBlockHistory": 3, + "SuggestedFeeSlidingWindowSize": 50, + "TLSCertFile": "", + "TLSKeyFile": "", + "TelemetryToLog": true, + "TxPoolExponentialIncreaseFactor": 2, + "TxPoolSize": 15000, + "TxSyncIntervalSeconds": 60, + "TxSyncServeResponseSize": 1000000, + "TxSyncTimeoutSeconds": 30, + "UseXForwardedForAddressField": "" +} |