From f10f76fe1ecd7842f517882060ab1e23764adb22 Mon Sep 17 00:00:00 2001 From: Tsachi Herman Date: Thu, 3 Mar 2022 12:29:36 -0500 Subject: [PATCH] fast catchup: retry peers fetching (#3711) Summary The existing code was failing the fast catchup on node startup if the peers could not be retrieved from the network package. Unfortunately, this is almost always the case. This change allows the node to retry fetching the peers list from the network package multiple times, while having a short delay between the iterations. This issue was reported by an Algorand Forum member. Test Plan Test the change manually. --- catchup/catchpointService.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/catchup/catchpointService.go b/catchup/catchpointService.go index 3f221015fe..87bd19b123 100644 --- a/catchup/catchpointService.go +++ b/catchup/catchpointService.go @@ -33,6 +33,12 @@ import ( "github.com/algorand/go-algorand/network" ) +const ( + // noPeersAvailableSleepInterval is the sleep interval that the node would wait if no peers are available to download the next block from. + // this delay is intended to give the network package some time to download the list of relays. + noPeersAvailableSleepInterval = 50 * time.Millisecond +) + // CatchpointCatchupNodeServices defines the extenal node support needed // for the catchpoint service to switch the node between "regular" operational mode and catchup mode. 
type CatchpointCatchupNodeServices interface { @@ -592,7 +598,13 @@ func (cs *CatchpointCatchupService) processStageBlocksDownload() (err error) { func (cs *CatchpointCatchupService) fetchBlock(round basics.Round, retryCount uint64) (blk *bookkeeping.Block, downloadDuration time.Duration, psp *peerSelectorPeer, stop bool, err error) { psp, err = cs.blocksDownloadPeerSelector.getNextPeer() if err != nil { - err = fmt.Errorf("fetchBlock: unable to obtain a list of peers to retrieve the latest block from") + if err == errPeerSelectorNoPeerPoolsAvailable { + cs.log.Infof("fetchBlock: unable to obtain a list of peers to retrieve the latest block from; will retry shortly.") + // this is a possible on startup, since the network package might have yet to retrieve the list of peers. + time.Sleep(noPeersAvailableSleepInterval) + return nil, time.Duration(0), psp, false, nil + } + err = fmt.Errorf("fetchBlock: unable to obtain a list of peers to retrieve the latest block from : %w", err) return nil, time.Duration(0), psp, true, cs.abort(err) } peer := psp.Peer