Skip to content

Commit 7a17903

Browse files
committed
MB-59601: Fix data race in CheckpointManager::takeAndResetCursors
The method did not take a queueLock and could mutate the CheckpointManager while it is being accessed, e.g. in CheckpointManager::getListOfCursorsToDrop. CheckpointMemRecoveryTask calls getListOfCursorsToDrop which iterates CM::cursors. A concurrent RollbackTask can result in resetting the vbucket and calling CM::takeAndResetCursors, which among others mutates CM::cursors. WARNING: ThreadSanitizer: data race (pid=47061) Write of size 8 at 0x00010d3b77a8 by main thread (mutexes: write M0, write M1, write M2): #0 CheckpointManager::takeAndResetCursors(CheckpointManager&) checkpoint_manager.cc:1754 (ep-engine_ep_unit_tests:arm64+0x1003c7dd8) #1 KVBucket::resetVBucket_UNLOCKED(LockedVBucketPtr&, std::__1::unique_lock<std::__1::mutex>&) kv_bucket.cc:1273 (ep-engine_ep_unit_tests:arm64+0x1001fc414) #2 KVBucket::rollback(Vbid, unsigned long long) kv_bucket.cc:2634 (ep-engine_ep_unit_tests:arm64+0x10020a910) #3 CheckpointRemoverTest_MB59601_Test::TestBody() checkpoint_remover_test.cc:518 (ep-engine_ep_unit_tests:arm64+0x1005d2224) #4 virtual thunk to CheckpointRemoverTest_MB59601_Test::TestBody() checkpoint_remover_test.cc (ep-engine_ep_unit_tests:arm64+0x1005d24e8) #5 void testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) gtest.cc:2648 (ep-engine_ep_unit_tests:arm64+0x101b8f6bc) #6 <null> <null> (0x000186e390e0) Previous read of size 8 at 0x00010d3b77a8 by thread T1 (mutexes: write M3): #0 CheckpointManager::getListOfCursorsToDrop() checkpoint_manager.cc:842 (ep-engine_ep_unit_tests:arm64+0x1003c1af0) #1 CheckpointMemRecoveryTask::attemptCursorDropping() checkpoint_remover.cc:183 (ep-engine_ep_unit_tests:arm64+0x1003caf8c) #2 CheckpointMemRecoveryTask::runInner(bool) checkpoint_remover.cc:245 (ep-engine_ep_unit_tests:arm64+0x1003cb77c) #3 EpNotifiableTask::run() ep_task.cc:56 (ep-engine_ep_unit_tests:arm64+0x10028763c) #4 void* std::__1::__thread_proxy[abi:v160006]<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, CheckpointRemoverTest_MB59601_Test::TestBody()::$_2::operator()() const::'lambda0'()>>(void*) thread:299 (ep-engine_ep_unit_tests:arm64+0x100600c30) Change-Id: I15c1e9ccc6f45f3251ebd7f78649c8a446d65b54 Reviewed-on: https://review.couchbase.org/c/kv_engine/+/203302 Reviewed-by: Vesko Karaganev <vesko.karaganev@couchbase.com> Tested-by: Build Bot <build@couchbase.com> Reviewed-by: Paolo Cocchi <paolo.cocchi@couchbase.com>
1 parent e0e4d9d commit 7a17903

File tree

3 files changed

+90
-5
lines changed

3 files changed

+90
-5
lines changed

engines/ep/src/checkpoint_manager.cc

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,8 @@ std::vector<Cursor> CheckpointManager::getListOfCursorsToDrop() {
836836
? *backupFound->second
837837
: *persistenceCursor;
838838

839+
getListOfCursorsToDropHook();
840+
839841
for (const auto& pair : cursors) {
840842
const auto cursor = pair.second;
841843
// Note: Strict condition here.
@@ -1734,13 +1736,23 @@ void CheckpointManager::addStats(const AddStatFn& add_stat,
17341736
}
17351737

17361738
void CheckpointManager::takeAndResetCursors(CheckpointManager& other) {
1737-
pCursor = other.pCursor;
1738-
persistenceCursor = pCursor.lock().get();
1739-
for (auto& cursor : other.cursors) {
1740-
cursors[cursor.second->getName()] = cursor.second;
1739+
other.takeAndResetCursorsHook();
1740+
1741+
Cursor otherPCursor;
1742+
cursor_index otherCursors;
1743+
{
1744+
std::lock_guard<std::mutex> otherLH(other.queueLock);
1745+
otherPCursor = other.pCursor;
1746+
otherCursors = std::move(other.cursors);
1747+
other.cursors.clear();
17411748
}
1742-
other.cursors.clear();
17431749

1750+
std::lock_guard<std::mutex> lh(queueLock);
1751+
pCursor = std::move(otherPCursor);
1752+
persistenceCursor = pCursor.lock().get();
1753+
for (auto& cursor : otherCursors) {
1754+
cursors[cursor.second->getName()] = std::move(cursor.second);
1755+
}
17441756
resetCursors();
17451757
}
17461758

engines/ep/src/checkpoint_manager.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,14 @@ class CheckpointManager {
599599
// (and not yet re-acquired) the CM::lock. Introduced in MB-56644.
600600
TestingHook<> expelHook;
601601

602+
/// Testing hook called at the start of CM::takeAndResetCursors.
603+
/// Introduced in MB-59601.
604+
TestingHook<> takeAndResetCursorsHook;
605+
606+
/// Testing hook called just before iterating CM::cursors in
607+
/// CM::getListOfCursorsToDrop. Introduced in MB-59601.
608+
TestingHook<> getListOfCursorsToDropHook;
609+
602610
protected:
603611
/**
604612
* @param lh, the queueLock held

engines/ep/tests/module_tests/checkpoint_remover_test.cc

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "collections/vbucket_manifest_handles.h"
2323
#include "dcp/response.h"
2424
#include "test_helpers.h"
25+
#include "thread_gate.h"
2526
#include "vbucket.h"
2627

2728
#include <folly/portability/GMock.h>
@@ -454,6 +455,70 @@ TEST_P(CheckpointRemoverTest, MemRecoveryByCheckpointCreation) {
454455
EXPECT_EQ(0, store->getRequiredCMMemoryReduction());
455456
}
456457

458+
// Without the fix, there is a data race in
459+
// CheckpointManager::takeAndResetCursors which did not take a queueLock,
460+
// and could mutate the CheckpointManager while it is being accessed,
461+
// e.g. in CheckpointManager::getListOfCursorsToDrop.
462+
TEST_P(CheckpointRemoverTest, MB59601) {
463+
if (!isPersistent()) {
464+
GTEST_SKIP();
465+
}
466+
467+
setVBucketStateAndRunPersistTask(vbid, vbucket_state_active);
468+
auto& config = engine->getConfiguration();
469+
config.setChkExpelEnabled(false);
470+
config.setMaxSize(100UL * 1024 * 1024);
471+
// Disable the mem-based checkpoint creation in this test, we would end up
472+
// doing straight CheckpointRemoval rather than ItemExpel/CursorDrop
473+
config.setCheckpointMaxSize(std::numeric_limits<size_t>::max());
474+
const auto chkptMemRecoveryLimit =
475+
config.getMaxSize() * store->getCheckpointMemoryRatio() *
476+
store->getCheckpointMemoryRecoveryUpperMark();
477+
auto& stats = engine->getEpStats();
478+
stats.mem_low_wat.store(1);
479+
480+
int numItems = 0;
481+
const std::string value(1024 * 1024, 'x');
482+
while (stats.getCheckpointManagerEstimatedMemUsage() <
483+
chkptMemRecoveryLimit) {
484+
auto docKey = "key_" + std::to_string(++numItems);
485+
store_item(vbid, makeStoredDocKey(docKey), value);
486+
}
487+
flushVBucketToDiskIfPersistent(vbid, numItems);
488+
489+
// VB needs to be replica to rollback
490+
store->setVBucketState(vbid, vbucket_state_replica);
491+
492+
EXPECT_GT(stats.getNumCheckpoints(), 0);
493+
EXPECT_GT(store->getRequiredCMMemoryReduction(), 0);
494+
495+
/// Synchronises just before accessing and mutating CM::cursors
496+
ThreadGate tg(2);
497+
std::thread bgThread;
498+
499+
auto& oldManager = *store->getVBucket(vbid)->checkpointManager;
500+
oldManager.takeAndResetCursorsHook = [this, &tg, &bgThread]() {
501+
// Note: takeAndResetCursorsHook is executed *after* the new VBucket
502+
// has already been created
503+
504+
auto& newManager = *store->getVBucket(vbid)->checkpointManager;
505+
newManager.getListOfCursorsToDropHook = [&tg]() { tg.threadUp(); };
506+
bgThread = std::thread([this]() {
507+
auto remover = std::make_shared<CheckpointMemRecoveryTask>(
508+
*engine,
509+
engine->getEpStats(),
510+
engine->getConfiguration().getChkRemoverStime(),
511+
0);
512+
remover->run();
513+
});
514+
515+
tg.threadUp();
516+
};
517+
518+
store->rollback(vbid, 0);
519+
bgThread.join();
520+
}
521+
457522
// Test written for MB-36366. With the fix removed this test failed because
458523
// post expel, we continued onto cursor dropping.
459524
// MB-36447 - unreliable test, disabling for now

0 commit comments

Comments
 (0)