summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Lee <64482439+algojohnlee@users.noreply.github.com>2020-08-17 19:27:35 -0400
committerGitHub <noreply@github.com>2020-08-17 19:27:35 -0400
commitf67af107affa268c524224fc7213914b5664001b (patch)
treedd5b5f2fe2b8d9487ac26f3103fc84b41e3c1093
parent2fc191356106c0edaf09191669c0b555b7b196c9 (diff)
parent271860a8a039d9ca730bba7b0027f641f155074f (diff)
Merge pull request #1411 from onetechnical/onetechnical/relbeta2.1.3v2.1.3-beta
Onetechnical/relbeta2.1.3
-rw-r--r--buildnumber.dat2
-rw-r--r--catchup/service.go12
-rw-r--r--catchup/service_test.go4
-rw-r--r--cmd/goal/node.go4
-rw-r--r--config/config.go9
-rw-r--r--config/local_defaults.go3
-rw-r--r--installer/config.json.example3
-rw-r--r--ledger/acctupdates.go40
-rw-r--r--ledger/ledger.go8
-rwxr-xr-xscripts/compute_branch_deadlock_default.sh13
-rwxr-xr-xscripts/travis/build_test.sh2
-rwxr-xr-xscripts/travis/integration_test.sh3
-rwxr-xr-xscripts/travis/test_release.sh2
-rw-r--r--test/testdata/configs/config-v11.json77
14 files changed, 171 insertions, 11 deletions
diff --git a/buildnumber.dat b/buildnumber.dat
index 0cfbf0888..00750edc0 100644
--- a/buildnumber.dat
+++ b/buildnumber.dat
@@ -1 +1 @@
-2
+3
diff --git a/catchup/service.go b/catchup/service.go
index d580d3992..247b48d81 100644
--- a/catchup/service.go
+++ b/catchup/service.go
@@ -56,6 +56,7 @@ type Ledger interface {
EnsureBlock(block *bookkeeping.Block, c agreement.Certificate)
LastRound() basics.Round
Block(basics.Round) (bookkeeping.Block, error)
+ IsWritingCatchpointFile() bool
}
// Service represents the catchup service. Once started and until it is stopped, it ensures that the ledger is up to date with network.
@@ -385,6 +386,12 @@ func (s *Service) pipelinedFetch(seedLookback uint64) {
// there was an error
return
}
+ // if we're writing a catchpoint file, stop catching up to reduce the memory pressure. Once we finish writing the file we
+ // could resume with the catchup.
+ if s.ledger.IsWritingCatchpointFile() {
+ s.log.Info("Catchup is stopping due to catchpoint file being written")
+ return
+ }
completedRounds[round] = true
// fetch rounds we can validate
for completedRounds[nextRound-basics.Round(parallelRequests)] {
@@ -436,6 +443,11 @@ func (s *Service) periodicSync() {
sleepDuration = s.deadlineTimeout
continue
}
+ // check to see if we're currently writing a catchpoint file. If so, wait longer before attempting again.
+ if s.ledger.IsWritingCatchpointFile() {
+ // keep the existing sleep duration and try again later.
+ continue
+ }
s.log.Info("It's been too long since our ledger advanced; resyncing")
s.sync(nil)
case cert := <-s.unmatchedPendingCertificates:
diff --git a/catchup/service_test.go b/catchup/service_test.go
index 090ce3c21..cc451c7dd 100644
--- a/catchup/service_test.go
+++ b/catchup/service_test.go
@@ -577,6 +577,10 @@ func (m *mockedLedger) LookupDigest(basics.Round) (crypto.Digest, error) {
return crypto.Digest{}, errors.New("not needed for mockedLedger")
}
+func (m *mockedLedger) IsWritingCatchpointFile() bool {
+ return false
+}
+
func testingenv(t testing.TB, numBlocks int) (ledger, emptyLedger Ledger) {
mLedger := new(mockedLedger)
mEmptyLedger := new(mockedLedger)
diff --git a/cmd/goal/node.go b/cmd/goal/node.go
index 95f2a6c0c..74b8daaef 100644
--- a/cmd/goal/node.go
+++ b/cmd/goal/node.go
@@ -422,8 +422,8 @@ func makeStatusString(stat generatedV2.NodeStatusResponse) string {
*stat.Catchpoint)
if stat.CatchpointTotalAccounts != nil && (*stat.CatchpointTotalAccounts > 0) && stat.CatchpointProcessedAccounts != nil {
- statusString = statusString + "\n" + fmt.Sprintf(infoNodeCatchpointCatchupAccounts, *stat.CatchpointProcessedAccounts,
- *stat.CatchpointTotalAccounts)
+ statusString = statusString + "\n" + fmt.Sprintf(infoNodeCatchpointCatchupAccounts, *stat.CatchpointTotalAccounts,
+ *stat.CatchpointProcessedAccounts)
}
if stat.CatchpointAcquiredBlocks != nil && stat.CatchpointTotalBlocks != nil && (*stat.CatchpointAcquiredBlocks+*stat.CatchpointTotalBlocks > 0) {
statusString = statusString + "\n" + fmt.Sprintf(infoNodeCatchpointCatchupBlocks, *stat.CatchpointTotalBlocks,
diff --git a/config/config.go b/config/config.go
index 19d8b6043..86bbc6018 100644
--- a/config/config.go
+++ b/config/config.go
@@ -63,7 +63,7 @@ type Local struct {
// Version tracks the current version of the defaults so we can migrate old -> new
// This is specifically important whenever we decide to change the default value
// for an existing parameter. This field tag must be updated any time we add a new version.
- Version uint32 `version[0]:"0" version[1]:"1" version[2]:"2" version[3]:"3" version[4]:"4" version[5]:"5" version[6]:"6" version[7]:"7" version[8]:"8" version[9]:"9" version[10]:"10"`
+ Version uint32 `version[0]:"0" version[1]:"1" version[2]:"2" version[3]:"3" version[4]:"4" version[5]:"5" version[6]:"6" version[7]:"7" version[8]:"8" version[9]:"9" version[10]:"10" version[11]:"11"`
// environmental (may be overridden)
// When enabled, stores blocks indefinitely; otherwise, only the most recent blocks
@@ -328,6 +328,13 @@ type Local struct {
// OptimizeAccountsDatabaseOnStartup controls whether the accounts database would be optimized
// on algod startup.
OptimizeAccountsDatabaseOnStartup bool `version[10]:"false"`
+
+ // CatchpointTracking determines if catchpoints are going to be tracked. The value is interpreted as follows:
+ // A value of -1 means "don't track catchpoints".
+ // A value of 1 means "track catchpoints as long as CatchpointInterval is also set to a positive non-zero value". If CatchpointInterval <= 0, no catchpoint tracking would be performed.
+ // A value of 0 means automatic, which is the default value. In this mode, a non archival node would not track the catchpoints, and an archival node would track the catchpoints as long as CatchpointInterval > 0.
+ // Other values of CatchpointTracking would give a warning in the log file, and would behave as if the default value was provided.
+ CatchpointTracking int64 `version[11]:"0"`
}
// Filenames of config files within the configdir (e.g. ~/.algorand)
diff --git a/config/local_defaults.go b/config/local_defaults.go
index 07a4647e4..28743e4c8 100644
--- a/config/local_defaults.go
+++ b/config/local_defaults.go
@@ -20,7 +20,7 @@
package config
var defaultLocal = Local{
- Version: 10,
+ Version: 11,
AnnounceParticipationKey: true,
Archival: false,
BaseLoggerDebugLevel: 4,
@@ -28,6 +28,7 @@ var defaultLocal = Local{
CadaverSizeTarget: 1073741824,
CatchpointFileHistoryLength: 365,
CatchpointInterval: 10000,
+ CatchpointTracking: 0,
CatchupBlockDownloadRetryAttempts: 1000,
CatchupFailurePeerRefreshRate: 10,
CatchupGossipBlockFetchTimeoutSec: 4,
diff --git a/installer/config.json.example b/installer/config.json.example
index 93fe6229b..c29679a17 100644
--- a/installer/config.json.example
+++ b/installer/config.json.example
@@ -1,5 +1,5 @@
{
- "Version": 10,
+ "Version": 11,
"AnnounceParticipationKey": true,
"Archival": false,
"BaseLoggerDebugLevel": 4,
@@ -7,6 +7,7 @@
"CadaverSizeTarget": 1073741824,
"CatchpointFileHistoryLength": 365,
"CatchpointInterval": 10000,
+ "CatchpointTracking": 0,
"CatchupBlockDownloadRetryAttempts": 1000,
"CatchupFailurePeerRefreshRate": 10,
"CatchupGossipBlockFetchTimeoutSec": 4,
diff --git a/ledger/acctupdates.go b/ledger/acctupdates.go
index c33648dee..fc2928c03 100644
--- a/ledger/acctupdates.go
+++ b/ledger/acctupdates.go
@@ -218,7 +218,23 @@ func (au *accountUpdates) initialize(cfg config.Local, dbPathPrefix string, gene
au.initAccounts = genesisAccounts
au.dbDirectory = filepath.Dir(dbPathPrefix)
au.archivalLedger = cfg.Archival
- au.catchpointInterval = cfg.CatchpointInterval
+ switch cfg.CatchpointTracking {
+ case -1:
+ au.catchpointInterval = 0
+ default:
+ // give a warning, then fall through
+ logging.Base().Warnf("accountUpdates: the CatchpointTracking field in the config.json file contains an invalid value (%d). The default value of 0 would be used instead.", cfg.CatchpointTracking)
+ fallthrough
+ case 0:
+ if au.archivalLedger {
+ au.catchpointInterval = cfg.CatchpointInterval
+ } else {
+ au.catchpointInterval = 0
+ }
+ case 1:
+ au.catchpointInterval = cfg.CatchpointInterval
+ }
+
au.catchpointFileHistoryLength = cfg.CatchpointFileHistoryLength
if cfg.CatchpointFileHistoryLength < -1 {
au.catchpointFileHistoryLength = -1
@@ -275,6 +291,22 @@ func (au *accountUpdates) close() {
<-au.commitSyncerClosed
}
+// IsWritingCatchpointFile returns true when a catchpoint file is being generated. The function is used by the catchup service
+// to avoid memory pressure until the catchpoint file writing is complete.
+func (au *accountUpdates) IsWritingCatchpointFile() bool {
+ au.accountsMu.Lock()
+ defer au.accountsMu.Unlock()
+ // check, without blocking, whether the catchpointWriting channel has been closed.
+ select {
+ case <-au.catchpointWriting:
+ // the channel catchpointWriting is currently closed, meaning that we're currently not writing any
+ // catchpoint file.
+ return false
+ default:
+ return true
+ }
+}
+
// Lookup returns the account data for a given address at a given round. The withRewards indicates whether the
// rewards should be added to the AccountData before returning. Note that the function doesn't update the account with the rewards,
// even though it could return the AccountData which represents the "rewarded" account data.
@@ -1579,6 +1611,12 @@ func (au *accountUpdates) generateCatchpoint(committedRound basics.Round, label
db.ResetTransactionWarnDeadline(ctx, tx, time.Now().Add(1*time.Second))
select {
case <-time.After(100 * time.Millisecond):
+ // increase the time slot allocated for writing the catchpoint, but stop when we get to the longChunkExecutionDuration limit.
+ // this would allow the catchpoint writing speed to ramp up while still leaving some cpu available.
+ chunkExecutionDuration *= 2
+ if chunkExecutionDuration > longChunkExecutionDuration {
+ chunkExecutionDuration = longChunkExecutionDuration
+ }
case <-au.ctx.Done():
retryCatchpointCreation = true
err2 := catchpointWriter.Abort()
diff --git a/ledger/ledger.go b/ledger/ledger.go
index 98b49e5b6..8c6181c93 100644
--- a/ledger/ledger.go
+++ b/ledger/ledger.go
@@ -584,6 +584,14 @@ func (l *Ledger) trackerEvalVerified(blk bookkeeping.Block, accUpdatesLedger led
return eval(context.Background(), accUpdatesLedger, blk, false, nil, nil)
}
+// IsWritingCatchpointFile returns true when a catchpoint file is being generated. The function is used by the catchup service
+// to avoid memory pressure until the catchpoint file writing is complete.
+func (l *Ledger) IsWritingCatchpointFile() bool {
+ l.trackerMu.RLock()
+ defer l.trackerMu.RUnlock()
+ return l.accts.IsWritingCatchpointFile()
+}
+
// A txlease is a transaction (sender, lease) pair which uniquely specifies a
// transaction lease.
type txlease struct {
diff --git a/scripts/compute_branch_deadlock_default.sh b/scripts/compute_branch_deadlock_default.sh
index 3285114ca..e5f81e140 100755
--- a/scripts/compute_branch_deadlock_default.sh
+++ b/scripts/compute_branch_deadlock_default.sh
@@ -1,7 +1,12 @@
#!/usr/bin/env bash
-if [[ "$1" =~ ^rel/ ]]; then
- echo "disable"
-else
- echo "enable"
+# if the user ( i.e. an algorand developer ) has explicitly set the deadlock detection in their environment, we want to honor it.
+if [ "$ALGORAND_DEADLOCK" != "" ]; then
+ echo "$ALGORAND_DEADLOCK"
+ exit 0
fi
+
+# we used to disable the deadlock detection on all release builds, which caused issues for individuals who compiled it on their own.
+# as a result, we decided to always disable it, unless we're running on travis. If we ever want to make it dependent
+# on the build branch, the build branch is available in $1. ( i.e. if [[ "$1" =~ ^rel/ ]]; then ... )
+echo "disable"
diff --git a/scripts/travis/build_test.sh b/scripts/travis/build_test.sh
index e22bcfb5f..a52ce6478 100755
--- a/scripts/travis/build_test.sh
+++ b/scripts/travis/build_test.sh
@@ -9,6 +9,8 @@
# Examples: scripts/travis/build_test.sh
set -e
+ALGORAND_DEADLOCK=enable
+export ALGORAND_DEADLOCK
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
if [ "${USER}" = "travis" ]; then
diff --git a/scripts/travis/integration_test.sh b/scripts/travis/integration_test.sh
index 6a7301152..ac41ba68e 100755
--- a/scripts/travis/integration_test.sh
+++ b/scripts/travis/integration_test.sh
@@ -9,6 +9,9 @@
# Examples: scripts/travis/integration_test.sh
set -e
+ALGORAND_DEADLOCK=enable
+export ALGORAND_DEADLOCK
+
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
export BUILD_TYPE="integration"
diff --git a/scripts/travis/test_release.sh b/scripts/travis/test_release.sh
index ea176a640..458186bbb 100755
--- a/scripts/travis/test_release.sh
+++ b/scripts/travis/test_release.sh
@@ -8,6 +8,8 @@
set -e
+ALGORAND_DEADLOCK=enable
+export ALGORAND_DEADLOCK
BRANCH=$(./scripts/compute_branch.sh)
CHANNEL=$(./scripts/compute_branch_channel.sh "$BRANCH")
diff --git a/test/testdata/configs/config-v11.json b/test/testdata/configs/config-v11.json
new file mode 100644
index 000000000..c29679a17
--- /dev/null
+++ b/test/testdata/configs/config-v11.json
@@ -0,0 +1,77 @@
+{
+ "Version": 11,
+ "AnnounceParticipationKey": true,
+ "Archival": false,
+ "BaseLoggerDebugLevel": 4,
+ "BroadcastConnectionsLimit": -1,
+ "CadaverSizeTarget": 1073741824,
+ "CatchpointFileHistoryLength": 365,
+ "CatchpointInterval": 10000,
+ "CatchpointTracking": 0,
+ "CatchupBlockDownloadRetryAttempts": 1000,
+ "CatchupFailurePeerRefreshRate": 10,
+ "CatchupGossipBlockFetchTimeoutSec": 4,
+ "CatchupHTTPBlockFetchTimeoutSec": 4,
+ "CatchupLedgerDownloadRetryAttempts": 50,
+ "CatchupParallelBlocks": 16,
+ "ConnectionsRateLimitingCount": 60,
+ "ConnectionsRateLimitingWindowSeconds": 1,
+ "DNSBootstrapID": "<network>.algorand.network",
+ "DNSSecurityFlags": 1,
+ "DeadlockDetection": 0,
+ "DisableOutgoingConnectionThrottling": false,
+ "EnableAgreementReporting": false,
+ "EnableAgreementTimeMetrics": false,
+ "EnableAssembleStats": false,
+ "EnableBlockService": false,
+ "EnableDeveloperAPI": false,
+ "EnableGossipBlockService": true,
+ "EnableIncomingMessageFilter": false,
+ "EnableLedgerService": false,
+ "EnableMetricReporting": false,
+ "EnableOutgoingNetworkMessageFiltering": true,
+ "EnablePingHandler": true,
+ "EnableProcessBlockStats": false,
+ "EnableProfiler": false,
+ "EnableRequestLogger": false,
+ "EnableTopAccountsReporting": false,
+ "EndpointAddress": "127.0.0.1:0",
+ "FallbackDNSResolverAddress": "",
+ "ForceRelayMessages": false,
+ "GossipFanout": 4,
+ "IncomingConnectionsLimit": 10000,
+ "IncomingMessageFilterBucketCount": 5,
+ "IncomingMessageFilterBucketSize": 512,
+ "IsIndexerActive": false,
+ "LogArchiveMaxAge": "",
+ "LogArchiveName": "node.archive.log",
+ "LogSizeLimit": 1073741824,
+ "MaxConnectionsPerIP": 30,
+ "NetAddress": "",
+ "NetworkProtocolVersion": "",
+ "NodeExporterListenAddress": ":9100",
+ "NodeExporterPath": "./node_exporter",
+ "OptimizeAccountsDatabaseOnStartup": false,
+ "OutgoingMessageFilterBucketCount": 3,
+ "OutgoingMessageFilterBucketSize": 128,
+ "PeerConnectionsUpdateInterval": 3600,
+ "PeerPingPeriodSeconds": 0,
+ "PriorityPeers": {},
+ "PublicAddress": "",
+ "ReconnectTime": 60000000000,
+ "ReservedFDs": 256,
+ "RestReadTimeoutSeconds": 15,
+ "RestWriteTimeoutSeconds": 120,
+ "RunHosted": false,
+ "SuggestedFeeBlockHistory": 3,
+ "SuggestedFeeSlidingWindowSize": 50,
+ "TLSCertFile": "",
+ "TLSKeyFile": "",
+ "TelemetryToLog": true,
+ "TxPoolExponentialIncreaseFactor": 2,
+ "TxPoolSize": 15000,
+ "TxSyncIntervalSeconds": 60,
+ "TxSyncServeResponseSize": 1000000,
+ "TxSyncTimeoutSeconds": 30,
+ "UseXForwardedForAddressField": ""
+}