Skip to content

Commit 2413a06

Browse files
committed
Improve stability of db_stress
Summary: Currently, whenever DB verification fails we bail out by calling `exit(1)`. This is bad because it causes an unclean shutdown and a spew of error log messages like: 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument 05:03:27 pthread lock: Invalid argument This diff adds a new parameter (an atomic flag in SharedState) that is set to true when verification fails. The test threads can then check the parameter to bail out safely. Test Plan: Caused an artificial failure. Verified that the exit was clean. Reviewers: dhruba, haobo, ljin Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D18243
1 parent d8fe006 commit 2413a06

File tree

1 file changed

+61
-23
lines changed

1 file changed

+61
-23
lines changed

tools/db_stress.cc

+61-23
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,10 @@ class SharedState {
651651
return start_verify_;
652652
}
653653

654+
// Records that a verification check has failed by atomically setting the
// shared verification_failure_ flag to true. Nothing in this class resets it.
void SetVerificationFailure() { verification_failure_.store(true); }
655+
656+
// Returns the current value of the atomic verification_failure_ flag, i.e.
// true once SetVerificationFailure() has been called. Callers poll this to
// stop their work early instead of terminating the process.
bool HasVerificationFailedYet() { return verification_failure_.load(); }
657+
654658
port::Mutex* GetMutexForKey(int cf, long key) {
655659
return &key_locks_[cf][key >> log2_keys_per_lock_];
656660
}
@@ -695,6 +699,7 @@ class SharedState {
695699
bool start_;
696700
bool start_verify_;
697701
StressTest* stress_test_;
702+
std::atomic<bool> verification_failure_;
698703

699704
std::vector<std::vector<uint32_t>> values_;
700705
std::vector<std::vector<port::Mutex>> key_locks_;
@@ -752,7 +757,7 @@ class StressTest {
752757
delete filter_policy_;
753758
}
754759

755-
void Run() {
760+
bool Run() {
756761
PrintEnv();
757762
Open();
758763
SharedState shared(this);
@@ -814,6 +819,12 @@ class StressTest {
814819
FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
815820
}
816821
PrintStatistics();
822+
823+
if (shared.HasVerificationFailedYet()) {
824+
printf("Verification failed :(\n");
825+
return false;
826+
}
827+
return true;
817828
}
818829

819830
private:
@@ -1101,7 +1112,10 @@ class StressTest {
11011112

11021113
thread->stats.Start();
11031114
for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
1104-
if(i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) {
1115+
if (thread->shared->HasVerificationFailedYet()) {
1116+
break;
1117+
}
1118+
if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) {
11051119
{
11061120
thread->stats.FinishedSingleOp();
11071121
MutexLock l(thread->shared->GetMutex());
@@ -1211,8 +1225,10 @@ class StressTest {
12111225
std::string keystr2 = Key(rand_key);
12121226
Slice k = keystr2;
12131227
Status s = db_->Get(read_opts, column_family, k, &from_db);
1214-
VerifyValue(rand_column_family, rand_key, read_opts,
1215-
*(thread->shared), from_db, s, true);
1228+
if (VerifyValue(rand_column_family, rand_key, read_opts,
1229+
thread->shared, from_db, s, true) == false) {
1230+
break;
1231+
}
12161232
}
12171233
thread->shared->Put(rand_column_family, rand_key, value_base);
12181234
if (FLAGS_use_merge) {
@@ -1246,22 +1262,28 @@ class StressTest {
12461262

12471263
void VerifyDb(ThreadState* thread) const {
12481264
ReadOptions options(FLAGS_verify_checksum, true);
1249-
const SharedState& shared = *(thread->shared);
1250-
static const long max_key = shared.GetMaxKey();
1251-
static const long keys_per_thread = max_key / shared.GetNumThreads();
1265+
auto shared = thread->shared;
1266+
static const long max_key = shared->GetMaxKey();
1267+
static const long keys_per_thread = max_key / shared->GetNumThreads();
12521268
long start = keys_per_thread * thread->tid;
12531269
long end = start + keys_per_thread;
1254-
if (thread->tid == shared.GetNumThreads() - 1) {
1270+
if (thread->tid == shared->GetNumThreads() - 1) {
12551271
end = max_key;
12561272
}
12571273
for (size_t cf = 0; cf < column_families_.size(); ++cf) {
1274+
if (thread->shared->HasVerificationFailedYet()) {
1275+
break;
1276+
}
12581277
if (!thread->rand.OneIn(2)) {
12591278
// Use iterator to verify this range
12601279
options.prefix_seek = FLAGS_prefix_size > 0;
12611280
unique_ptr<Iterator> iter(
12621281
db_->NewIterator(options, column_families_[cf]));
12631282
iter->Seek(Key(start));
12641283
for (long i = start; i < end; i++) {
1284+
if (thread->shared->HasVerificationFailedYet()) {
1285+
break;
1286+
}
12651287
// TODO(ljin): update "long" to uint64_t
12661288
// Reseek when the prefix changes
12671289
if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) ==
@@ -1279,7 +1301,7 @@ class StressTest {
12791301
from_db = iter->value().ToString();
12801302
iter->Next();
12811303
} else if (iter->key().compare(k) < 0) {
1282-
VerificationAbort("An out of range key was found", cf, i);
1304+
VerificationAbort(shared, "An out of range key was found", cf, i);
12831305
}
12841306
} else {
12851307
// The iterator found no value for the key in question, so do not
@@ -1294,6 +1316,9 @@ class StressTest {
12941316
} else {
12951317
// Use Get to verify this range
12961318
for (long i = start; i < end; i++) {
1319+
if (thread->shared->HasVerificationFailedYet()) {
1320+
break;
1321+
}
12971322
std::string from_db;
12981323
std::string keystr = Key(i);
12991324
Slice k = keystr;
@@ -1307,38 +1332,48 @@ class StressTest {
13071332
}
13081333
}
13091334

1310-
void VerificationAbort(std::string msg, int cf, long key) const {
1311-
fprintf(stderr, "Verification failed for column family %d key %ld: %s\n",
1312-
cf, key, msg.c_str());
1313-
exit(1);
1335+
void VerificationAbort(SharedState* shared, std::string msg, int cf,
1336+
long key) const {
1337+
printf("Verification failed for column family %d key %ld: %s\n", cf, key,
1338+
msg.c_str());
1339+
shared->SetVerificationFailure();
13141340
}
13151341

1316-
void VerifyValue(int cf, long key, const ReadOptions& opts,
1317-
const SharedState& shared, const std::string& value_from_db,
1342+
bool VerifyValue(int cf, long key, const ReadOptions& opts,
1343+
SharedState* shared, const std::string& value_from_db,
13181344
Status s, bool strict = false) const {
1345+
if (shared->HasVerificationFailedYet()) {
1346+
return false;
1347+
}
13191348
// compare value_from_db with the value in the shared state
13201349
char value[100];
1321-
uint32_t value_base = shared.Get(cf, key);
1350+
uint32_t value_base = shared->Get(cf, key);
13221351
if (value_base == SharedState::SENTINEL && !strict) {
1323-
return;
1352+
return true;
13241353
}
13251354

13261355
if (s.ok()) {
13271356
if (value_base == SharedState::SENTINEL) {
1328-
VerificationAbort("Unexpected value found", cf, key);
1357+
VerificationAbort(shared, "Unexpected value found", cf, key);
1358+
return false;
13291359
}
13301360
size_t sz = GenerateValue(value_base, value, sizeof(value));
13311361
if (value_from_db.length() != sz) {
1332-
VerificationAbort("Length of value read is not equal", cf, key);
1362+
VerificationAbort(shared, "Length of value read is not equal", cf, key);
1363+
return false;
13331364
}
13341365
if (memcmp(value_from_db.data(), value, sz) != 0) {
1335-
VerificationAbort("Contents of value read don't match", cf, key);
1366+
VerificationAbort(shared, "Contents of value read don't match", cf,
1367+
key);
1368+
return false;
13361369
}
13371370
} else {
13381371
if (value_base != SharedState::SENTINEL) {
1339-
VerificationAbort("Value not found", cf, key);
1372+
VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key);
1373+
return false;
13401374
}
13411375
}
1376+
return true;
13421377
}
13431378

13441379
static void PrintKeyValue(int cf, uint32_t key, const char* value,
@@ -1693,6 +1728,9 @@ int main(int argc, char** argv) {
16931728
}
16941729

16951730
rocksdb::StressTest stress;
1696-
stress.Run();
1697-
return 0;
1731+
if (stress.Run()) {
1732+
return 0;
1733+
} else {
1734+
return 1;
1735+
}
16981736
}

0 commit comments

Comments
 (0)