Skip to content

Commit 67b8dd7

Browse files
authored
Beacon sync bookkeeping update and bug fix (#3051)
* Update unprocessed blocks bookkeeping avoiding race condition details: Instead of keeping track in `borrowed` of the sum of a set of blocks fetched from the `unprocessed` list, the particular range details are stored. When committing a particular range, the range itself (rather than its length) is removed from the `borrowed` data. why: The function `blocksUnprocAmend()` may produce a race condition, so it was re-implemented as `blocksUnprocAppend()` considering range overlaps with borrowed. * Clean up/re-org `blocks_unproc` module why: The `borrowed` data structure can be fully maintained with the `blocksUnprocXxx()` functions. * Update/simplify unprocessed headers bookkeeping (similar as for blocks) * Removing stashed headers right after successfully assembled blocks why: They were previously deleted after importing blocks, although they are not needed anymore after blocks have been assembled using bodies fetched over the ethXX network. * Always curb the blocks to the initialised length why: Some error exit directives missed some clean up, leaving the blocks list with empty/stale block bodies. * Enter reorg-mode right away after block import error why: There is not much one can do. Typically, this type of error is due to a switch to a different canonical chain. When this happens, the block batch is expected to be relatively short as the cause for a chain switch is an RPC instruction. This in turn is effective if some of the blocks on the `FC` database are maintained by the `CL`.
1 parent daebbfa commit 67b8dd7

File tree

9 files changed

+236
-179
lines changed

9 files changed

+236
-179
lines changed

nimbus/sync/beacon/worker.nim

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import
2424
# ------------------------------------------------------------------------------
2525

2626
proc headersToFetchOk(buddy: BeaconBuddyRef): bool =
27-
0 < buddy.ctx.headersUnprocTotal() and
27+
0 < buddy.ctx.headersUnprocAvail() and
2828
buddy.ctrl.running and
2929
not buddy.ctx.poolMode
3030

@@ -127,8 +127,8 @@ proc runDaemon*(
127127

128128
# Import from staged queue.
129129
while await ctx.blocksStagedImport(info):
130-
if not ctx.daemon:
131-
# Implied by external sync shutdown?
130+
if not ctx.daemon or # Implied by external sync shutdown?
131+
ctx.poolMode: # Oops, re-org needed?
132132
return
133133

134134
# At the end of the cycle, leave time to trigger refill headers/blocks

nimbus/sync/beacon/worker/blocks_staged.nim

Lines changed: 63 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,20 @@ proc fetchAndCheck(
5858
for n in 1u ..< ivReq.len:
5959
let header = ctx.dbHeaderPeek(ivReq.minPt + n).valueOr:
6060
# There is nothing one can do here
61-
info "Block header missing, requesting reorg", ivReq, n,
61+
info "Block header missing (reorg triggered)", ivReq, n,
6262
nth=(ivReq.minPt + n).bnStr
6363
# So require reorg
64+
blk.blocks.setLen(offset)
6465
ctx.poolMode = true
6566
return false
6667
blockHash[n - 1] = header.parentHash
6768
blk.blocks[offset + n].header = header
6869
blk.blocks[offset].header = ctx.dbHeaderPeek(ivReq.minPt).valueOr:
6970
# There is nothing one can do here
70-
info "Block header missing, requesting reorg", ivReq, n=0,
71+
info "Block header missing (reorg triggered)", ivReq, n=0,
7172
nth=ivReq.minPt.bnStr
7273
# So require reorg
74+
blk.blocks.setLen(offset)
7375
ctx.poolMode = true
7476
return false
7577
blockHash[ivReq.len - 1] =
@@ -100,15 +102,17 @@ proc fetchAndCheck(
100102
# Oops, cut off the rest
101103
blk.blocks.setLen(offset + n)
102104
buddy.fetchRegisterError()
103-
trace info & ": fetch bodies cut off junk", peer=buddy.peer, ivReq,
104-
n, nTxs=bodies[n].transactions.len, nBodies,
105-
nRespErrors=buddy.only.nBdyRespErrors
105+
trace info & ": cut off fetched junk", peer=buddy.peer, ivReq, n,
106+
nTxs=bodies[n].transactions.len, nBodies, bdyErrors=buddy.bdyErrors
106107
break loop
107108

108109
blk.blocks[offset + n].transactions = bodies[n].transactions
109110
blk.blocks[offset + n].uncles = bodies[n].uncles
110111
blk.blocks[offset + n].withdrawals = bodies[n].withdrawals
111112

113+
# Remove stashed header
114+
ctx.dbHeaderUnstash blk.blocks[offset + n].header.number
115+
112116
if offset < blk.blocks.len.uint64:
113117
return true
114118

@@ -122,33 +126,46 @@ proc fetchAndCheck(
122126
func blocksStagedCanImportOk*(ctx: BeaconCtxRef): bool =
123127
## Check whether the queue is at its maximum size so import can start with
124128
## a full queue.
125-
if ctx.pool.blocksStagedQuLenMax <= ctx.blk.staged.len:
126-
return true
129+
##
130+
if ctx.poolMode:
131+
# Re-org is scheduled
132+
return false
127133

128134
if 0 < ctx.blk.staged.len:
129-
# Import if what is on the queue is all we have got.
130-
if ctx.blocksUnprocIsEmpty() and ctx.blocksUnprocBorrowed() == 0:
131-
return true
132-
# Import if there is currently no peer active
133-
if ctx.pool.nBuddies == 0:
135+
# Import if what is on the queue is all we have got. Note that the function
136+
# `blocksUnprocIsEmpty()` returns `true` only if all blocks possible have
137+
# been fetched (i.e. the `borrowed` list is also empty.)
138+
if ctx.blocksUnprocIsEmpty():
134139
return true
135140

141+
# Make sure that the lowest block is available, already. Or the other way
142+
# round: no unprocessed block number range precedes the least staged block.
143+
if ctx.blk.staged.ge(0).value.key < ctx.blocksUnprocTotalBottom():
144+
# Also suggest importing blocks if there is currently no peer active.
145+
# The `unprocessed` ranges will contain some higher number block ranges,
146+
# but these can be fetched later.
147+
if ctx.pool.nBuddies == 0:
148+
return true
149+
150+
# Start importing if the queue is long enough.
151+
if ctx.pool.blocksStagedQuLenMax <= ctx.blk.staged.len:
152+
return true
153+
136154
false
137155

138156

139157
func blocksStagedFetchOk*(ctx: BeaconCtxRef): bool =
140158
## Check whether body records can be fetched and stored on the `staged` queue.
141159
##
142-
let uBottom = ctx.blocksUnprocBottom()
143-
if uBottom < high(BlockNumber):
160+
if 0 < ctx.blocksUnprocAvail():
144161
# Not to start fetching while the queue is busy (i.e. larger than Lwm)
145162
# so that import might still be running strong.
146163
if ctx.blk.staged.len < ctx.pool.blocksStagedQuLenMax:
147164
return true
148165

149166
# Make sure that there is no gap at the bottom which needs to be
150-
# addressed regardless of the length of the queue.
151-
if uBottom < ctx.blk.staged.ge(0).value.key:
167+
# fetched regardless of the length of the queue.
168+
if ctx.blocksUnprocAvailBottom() < ctx.blk.staged.ge(0).value.key:
152169
return true
153170

154171
false
@@ -160,7 +177,7 @@ proc blocksStagedCollect*(
160177
): Future[bool] {.async: (raises: []).} =
161178
## Collect bodies and stage them.
162179
##
163-
if buddy.ctx.blocksUnprocIsEmpty():
180+
if buddy.ctx.blocksUnprocAvail() == 0:
164181
# Nothing to do
165182
return false
166183

@@ -203,7 +220,7 @@ proc blocksStagedCollect*(
203220
if not await buddy.fetchAndCheck(ivReq, blk, info):
204221
if ctx.poolMode:
205222
# Reorg requested?
206-
ctx.blocksUnprocCommit(iv.len, iv)
223+
ctx.blocksUnprocCommit(iv, iv)
207224
return false
208225

209226
haveError = true
@@ -222,7 +239,7 @@ proc blocksStagedCollect*(
222239
nStaged=ctx.blk.staged.len, ctrl=buddy.ctrl.state,
223240
bdyErrors=buddy.bdyErrors
224241

225-
ctx.blocksUnprocCommit(iv.len, iv)
242+
ctx.blocksUnprocCommit(iv, iv)
226243
# At this stage allow a task switch so that some other peer might try
227244
# to work on the currently returned interval.
228245
try: await sleepAsync asyncThreadSwitchTimeSlot
@@ -234,22 +251,22 @@ proc blocksStagedCollect*(
234251
trace info & ": list partially failed", peer, iv, ivReq,
235252
unused=BnRange.new(ivBottom,iv.maxPt)
236253
# There is some left over to store back
237-
ctx.blocksUnprocCommit(iv.len, ivBottom, iv.maxPt)
254+
ctx.blocksUnprocCommit(iv, ivBottom, iv.maxPt)
238255
break
239256

240257
# Update remaining interval
241258
let ivRespLen = blk.blocks.len - nBlkBlocks
242259
if iv.maxPt < ivBottom + ivRespLen.uint64:
243260
# All collected
244-
ctx.blocksUnprocCommit(iv.len)
261+
ctx.blocksUnprocCommit(iv)
245262
break
246263

247264
ivBottom += ivRespLen.uint64 # will mostly result into `ivReq.maxPt+1`
248265

249266
if buddy.ctrl.stopped:
250267
# There is some left over to store back. And `ivBottom <= iv.maxPt`
251268
# because of the check against `ivRespLen` above.
252-
ctx.blocksUnprocCommit(iv.len, ivBottom, iv.maxPt)
269+
ctx.blocksUnprocCommit(iv, ivBottom, iv.maxPt)
253270
break
254271

255272
# Store `blk` chain on the `staged` queue
@@ -276,21 +293,16 @@ proc blocksStagedImport*(
276293
## Import/execute blocks record from staged queue
277294
##
278295
let qItem = ctx.blk.staged.ge(0).valueOr:
296+
# Empty queue
279297
return false
280298

281-
# Fetch least record, accept only if it matches the global ledger state
282-
block:
283-
let imported = ctx.chain.latestNumber()
284-
if imported + 1 < qItem.key:
285-
# If there is a gap, the `FC` module data area might have been re-set (or
286-
# some problem occured due to concurrent collection.) In any case, the
287-
# missing block numbers are added to the range of blocks that need to be
288-
# fetched.
289-
ctx.blocksUnprocAmend(imported + 1, qItem.key - 1)
290-
trace info & ": there is a gap L vs. staged",
291-
B=ctx.chain.baseNumber.bnStr, L=imported.bnStr, staged=qItem.key.bnStr,
292-
C=ctx.layout.coupler.bnStr
293-
return false
299+
# Make sure that the lowest block is available, already. Or the other way
300+
# round: no unprocessed block number range precedes the least staged block.
301+
let uBottom = ctx.blocksUnprocTotalBottom()
302+
if uBottom < qItem.key:
303+
trace info & ": block queue not ready yet", nBuddies=ctx.pool.nBuddies,
304+
unprocBottom=uBottom.bnStr, least=qItem.key.bnStr
305+
return false
294306

295307
# Remove from queue
296308
discard ctx.blk.staged.delete qItem.key
@@ -306,32 +318,27 @@ proc blocksStagedImport*(
306318
var maxImport = iv.maxPt
307319
block importLoop:
308320
for n in 0 ..< nBlocks:
309-
# It is known that `key <= imported + 1`. This means that some blocks
310-
# potentally overlap with what is already known by `FC` (e.g. due to
311-
# concurrently running `importBlock()` by a `newPayload` RPC requests.)
312-
#
313-
# It is not left to `FC` to ignore this record. Passing a block before
314-
# the `base` (which also might have changed) is responded by `FC` with
315-
# an error. This would cause throwing away all `nBlocks` rather than
316-
# ignoring the first some.
317-
#
318321
let nBn = qItem.data.blocks[n].header.number
319322
if nBn <= ctx.chain.baseNumber:
320-
trace info & ": ignoring block <= base", n, iv,
323+
trace info & ": ignoring block less eq. base", n, iv,
321324
B=ctx.chain.baseNumber.bnStr, L=ctx.chain.latestNumber.bnStr,
322325
nthBn=nBn.bnStr, nthHash=qItem.data.getNthHash(n).short
323326
continue
324327
ctx.pool.chain.importBlock(qItem.data.blocks[n]).isOkOr:
325-
warn info & ": import block error", n, iv,
328+
# The way out here is simply to re-compile the block queue. At any
329+
# point, the `FC` module data area might have been moved to a new
330+
# canonocal branch.
331+
#
332+
ctx.poolMode = true
333+
warn info & ": import block error (reorg triggered)", n, iv,
326334
B=ctx.chain.baseNumber.bnStr, L=ctx.chain.latestNumber.bnStr,
327-
nthBn=nBn.bnStr, nthHash=qItem.data.getNthHash(n).short, `error`=error
328-
# Restore what is left over below
329-
maxImport = ctx.chain.latestNumber()
335+
nthBn=nBn.bnStr, nthHash=qItem.data.getNthHash(n).short,
336+
`error`=error
330337
break importLoop
331338

332339
# Allow pseudo/async thread switch.
333340
(await ctx.updateAsyncTasks()).isOkOr:
334-
maxImport = ctx.chain.latestNumber()
341+
maxImport = nBn # shutdown?
335342
break importLoop
336343

337344
# Occasionally mark the chain finalized
@@ -343,29 +350,26 @@ proc blocksStagedImport*(
343350

344351
doAssert nBn == ctx.chain.latestNumber()
345352
ctx.pool.chain.forkChoice(nthHash, finHash).isOkOr:
346-
warn info & ": fork choice error", n, iv,
353+
warn info & ": fork choice error (reorg triggered)", n, iv,
347354
B=ctx.chain.baseNumber.bnStr, L=ctx.chain.latestNumber.bnStr,
348355
F=ctx.layout.final.bnStr, nthBn=nBn.bnStr, nthHash=nthHash.short,
349356
finHash=(if finHash == nthHash: "nthHash" else: "F"), `error`=error
350357
# Restore what is left over below
351-
maxImport = ctx.chain.latestNumber()
358+
ctx.poolMode = true
352359
break importLoop
353360

354361
# Allow pseudo/async thread switch.
355362
(await ctx.updateAsyncTasks()).isOkOr:
356-
maxImport = ctx.chain.latestNumber()
363+
maxImport = nBn # shutdown?
357364
break importLoop
358365

359366
# Import probably incomplete, so a partial roll back may be needed
360367
if maxImport < iv.maxPt:
361-
ctx.blocksUnprocCommit(0, maxImport+1, qItem.data.blocks[^1].header.number)
362-
363-
# Remove stashed headers for imported blocks
364-
for bn in iv.minPt .. maxImport:
365-
ctx.dbHeaderUnstash bn
368+
ctx.blocksUnprocAppend(maxImport+1, iv.maxPt)
366369

367370
info "Import done", iv, nBlocks, base=ctx.chain.baseNumber.bnStr,
368371
head=ctx.chain.latestNumber.bnStr, target=ctx.layout.final.bnStr
372+
369373
return true
370374

371375

@@ -382,7 +386,7 @@ proc blocksStagedReorg*(ctx: BeaconCtxRef; info: static[string]) =
382386
## the block queues as there won't be mant data cached, then.
383387
##
384388
if ctx.blk.staged.len == 0 and
385-
ctx.blocksUnprocChunks() == 0:
389+
ctx.blocksUnprocIsEmpty():
386390
# nothing to do
387391
return
388392

0 commit comments

Comments
 (0)