Skip to content

Commit 71e6a34

Browse files
committed
Add a DB property to indicate number of background errors encountered
Summary: Add a property that reports the number of background errors encountered, to help users build their monitoring. Test Plan: Add a unit test; run `make all check`. Reviewers: haobo, igor, dhruba. Reviewed By: igor. CC: ljin, nkg-, yhchiang, leveldb. Differential Revision: https://reviews.facebook.net/D16959
1 parent 1ec72b3 commit 71e6a34

File tree

5 files changed

+88
-14
lines changed

5 files changed

+88
-14
lines changed

db/db_impl.cc

+16-6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
#include "db/db_impl.h"
1111

12+
#define __STDC_FORMAT_MACROS
13+
#include <inttypes.h>
1214
#include <algorithm>
1315
#include <climits>
1416
#include <cstdio>
@@ -1806,8 +1808,10 @@ Status DBImpl::WaitForFlushMemTable() {
18061808
return s;
18071809
}
18081810

1809-
Status DBImpl::TEST_FlushMemTable() {
1810-
return FlushMemTable(FlushOptions());
1811+
Status DBImpl::TEST_FlushMemTable(bool wait) {
1812+
FlushOptions fo;
1813+
fo.wait = wait;
1814+
return FlushMemTable(fo);
18111815
}
18121816

18131817
Status DBImpl::TEST_WaitForFlushMemTable() {
@@ -1904,10 +1908,13 @@ void DBImpl::BackgroundCallFlush() {
19041908
// case this is an environmental problem and we do not want to
19051909
// chew up resources for failed compactions for the duration of
19061910
// the problem.
1911+
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
19071912
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
1908-
Log(options_.info_log, "Waiting after background flush error: %s",
1909-
s.ToString().c_str());
19101913
mutex_.Unlock();
1914+
// Report the flush failure and the running background-error total. The ", "
// after %s keeps the status text from running into the counter label.
Log(options_.info_log,
1915+
"Waiting after background flush error: %s, "
1916+
"Accumulated background error counts: %" PRIu64,
1917+
s.ToString().c_str(), error_cnt);
19111918
log_buffer.FlushBufferToLog();
19121919
LogFlush(options_.info_log);
19131920
env_->SleepForMicroseconds(1000000);
@@ -1978,11 +1985,14 @@ void DBImpl::BackgroundCallCompaction() {
19781985
// case this is an environmental problem and we do not want to
19791986
// chew up resources for failed compactions for the duration of
19801987
// the problem.
1988+
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
19811989
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
19821990
mutex_.Unlock();
19831991
log_buffer.FlushBufferToLog();
1984-
Log(options_.info_log, "Waiting after background compaction error: %s",
1985-
s.ToString().c_str());
1992+
Log(options_.info_log,
1993+
"Waiting after background compaction error: %s, "
1994+
"Accumulated background error counts: %" PRIu64,
1995+
s.ToString().c_str(), error_cnt);
19861996
LogFlush(options_.info_log);
19871997
env_->SleepForMicroseconds(1000000);
19881998
mutex_.Lock();

db/db_impl.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ class DBImpl : public DB {
109109
const Slice* end);
110110

111111
// Force current memtable contents to be flushed.
112-
Status TEST_FlushMemTable();
112+
Status TEST_FlushMemTable(bool wait = true);
113113

114114
// Wait for memtable compaction
115115
Status TEST_WaitForFlushMemTable();

db/db_test.cc

+42
Original file line numberDiff line numberDiff line change
@@ -4188,6 +4188,11 @@ TEST(DBTest, NoSpace) {
41884188
dbfull()->TEST_CompactRange(level, nullptr, nullptr);
41894189
}
41904190
}
4191+
4192+
std::string property_value;
4193+
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
4194+
ASSERT_EQ("5", property_value);
4195+
41914196
env_->no_space_.Release_Store(nullptr);
41924197
ASSERT_LT(CountFiles(), num_files + 3);
41934198

@@ -4196,6 +4201,43 @@ TEST(DBTest, NoSpace) {
41964201
} while (ChangeCompactOptions());
41974202
}
41984203

4204+
// Check background error counter bumped on flush failures.
4205+
TEST(DBTest, NoSpaceFlush) {
4206+
do {
4207+
Options options = CurrentOptions();
4208+
options.env = env_;
4209+
options.max_background_flushes = 1;
4210+
Reopen(&options);
4211+
4212+
ASSERT_OK(Put("foo", "v1"));
4213+
env_->no_space_.Release_Store(env_); // Force out-of-space errors
4214+
4215+
std::string property_value;
4216+
// Background error count is 0 now.
4217+
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
4218+
ASSERT_EQ("0", property_value);
4219+
4220+
dbfull()->TEST_FlushMemTable(false);
4221+
4222+
// Wait 300 milliseconds or background-errors turned 1 from 0.
4223+
int time_to_sleep_limit = 300000;
4224+
while (time_to_sleep_limit > 0) {
4225+
int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit;
4226+
time_to_sleep_limit -= to_sleep;
4227+
env_->SleepForMicroseconds(to_sleep);
4228+
4229+
ASSERT_TRUE(
4230+
db_->GetProperty("rocksdb.background-errors", &property_value));
4231+
if (property_value == "1") {
4232+
break;
4233+
}
4234+
}
4235+
ASSERT_EQ("1", property_value);
4236+
4237+
env_->no_space_.Release_Store(nullptr);
4238+
} while (ChangeCompactOptions());
4239+
}
4240+
41994241
TEST(DBTest, NonWritableFileSystem) {
42004242
do {
42014243
Options options = CurrentOptions();

db/internal_stats.cc

+12-4
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@ DBPropertyType GetPropertyType(const Slice& property) {
3030
} else if (in == "num-immutable-mem-table") {
3131
return kNumImmutableMemTable;
3232
} else if (in == "mem-table-flush-pending") {
33-
return MemtableFlushPending;
33+
return kMemtableFlushPending;
3434
} else if (in == "compaction-pending") {
35-
return CompactionPending;
35+
return kCompactionPending;
36+
} else if (in == "background-errors") {
37+
return kBackgroundErrors;
3638
}
3739
return kUnknown;
3840
}
@@ -330,15 +332,21 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
330332
case kNumImmutableMemTable:
331333
*value = std::to_string(imm.size());
332334
return true;
333-
case MemtableFlushPending:
335+
case kMemtableFlushPending:
334336
// Return number of mem tables that are ready to flush (made immutable)
335337
*value = std::to_string(imm.IsFlushPending() ? 1 : 0);
336338
return true;
337-
case CompactionPending:
339+
case kCompactionPending:
338340
// 1 if the system has already determined that at least one compaction is needed.
339341
// 0 otherwise,
340342
*value = std::to_string(current->NeedsCompaction() ? 1 : 0);
341343
return true;
344+
/////////////
345+
case kBackgroundErrors:
346+
// Accumulated number of errors in background flushes or compactions.
347+
*value = std::to_string(GetBackgroundErrorCount());
348+
return true;
349+
/////////
342350
default:
343351
return false;
344352
}

db/internal_stats.h

+17-3
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@ enum DBPropertyType {
2626
kStats, // Return general statistics of DB
2727
kSsTables, // Return a human readable string of current SST files
2828
kNumImmutableMemTable, // Return number of immutable mem tables
29-
MemtableFlushPending, // Return 1 if mem table flushing is pending, otherwise
30-
// 0.
31-
CompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
29+
kMemtableFlushPending, // Return 1 if mem table flushing is pending,
30+
// otherwise
31+
// 0.
32+
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
33+
kBackgroundErrors, // Return accumulated background errors encountered.
3234
kUnknown,
3335
};
3436

@@ -49,6 +51,7 @@ class InternalStats {
4951
stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
5052
stall_leveln_slowdown_(num_levels, 0),
5153
stall_leveln_slowdown_count_(num_levels, 0),
54+
bg_error_count_(0),
5255
number_levels_(num_levels),
5356
statistics_(statistics),
5457
env_(env),
@@ -116,6 +119,10 @@ class InternalStats {
116119
stall_leveln_slowdown_count_[level] += micros;
117120
}
118121

122+
uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
123+
124+
uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
125+
119126
bool GetProperty(DBPropertyType property_type, const Slice& property,
120127
std::string* value, VersionSet* version_set,
121128
const MemTableList& imm);
@@ -158,6 +165,13 @@ class InternalStats {
158165
std::vector<uint64_t> stall_leveln_slowdown_;
159166
std::vector<uint64_t> stall_leveln_slowdown_count_;
160167

168+
// Total number of background errors encountered. Every time a flush task
169+
// or compaction task fails, this counter is incremented. The failure can
170+
// be caused by any possible reason, including file system errors, out of
171+
// resources, or input file corruption. Failing when retrying the same flush
172+
// or compaction will cause the counter to increase too.
173+
uint64_t bg_error_count_;
174+
161175
int number_levels_;
162176
Statistics* statistics_;
163177
Env* env_;

0 commit comments

Comments
 (0)