
Commit dff9214 (2 parents: fb2346f + 45ad75d)

Merge branch 'master' into columnfamilies

Conflicts:
    db/db_impl.cc
    tools/db_stress.cc

11 files changed: +162 -49 lines

HISTORY.md  (+1)

@@ -20,6 +20,7 @@
 * Added new option -- verify_checksums_in_compaction
 * Chagned Options.prefix_extractor from raw pointer to shared_ptr (take ownership)
   Changed HashSkipListRepFactory and HashLinkListRepFactory constructor to not take SliceTransform object (use Options.prefix_extractor implicitly)
+* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
 
 ### New Features
 * If we find one truncated record at the end of the MANIFEST or WAL files,
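As context for the new HISTORY.md entry, here is a minimal sketch of how Env::GetThreadPoolQueueLen() might be polled from application code. The ReportQueueLengths helper is illustrative and not part of this commit; the priority-to-pool mapping matches the Env::Schedule() calls in db/db_impl.cc below.

#include <cstdio>
#include "rocksdb/env.h"

// Illustrative helper: report how many jobs are waiting in each pool.
// Env::Priority::LOW is used for compactions and HIGH for flushes,
// matching how DBImpl schedules background work in this commit.
void ReportQueueLengths(rocksdb::Env* env) {
  unsigned int low  = env->GetThreadPoolQueueLen(rocksdb::Env::Priority::LOW);
  unsigned int high = env->GetThreadPoolQueueLen(rocksdb::Env::Priority::HIGH);
  std::fprintf(stderr, "queued LOW (compaction) jobs: %u, HIGH (flush) jobs: %u\n",
               low, high);
}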

db/db_impl.cc  (+47 -12)

@@ -226,6 +226,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
       logfile_number_(0),
       default_cf_handle_(nullptr),
       tmp_batch_(),
+      bg_schedule_needed_(false),
       bg_compaction_scheduled_(0),
       bg_manual_only_(0),
       bg_flush_scheduled_(0),
@@ -1732,6 +1733,7 @@ Status DBImpl::TEST_WaitForCompact() {
 
 void DBImpl::MaybeScheduleFlushOrCompaction() {
   mutex_.AssertHeld();
+  bg_schedule_needed_ = false;
   if (bg_work_gate_closed_) {
     // gate closed for backgrond work
   } else if (shutting_down_.Acquire_Load()) {
@@ -1752,6 +1754,8 @@
       if (bg_flush_scheduled_ < options_.max_background_flushes) {
         bg_flush_scheduled_++;
         env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
+      } else {
+        bg_schedule_needed_ = true;
       }
     }
     bool is_compaction_needed = false;
@@ -1767,11 +1771,13 @@
     // Do it only if max_background_compactions hasn't been reached and, in case
     // bg_manual_only_ > 0, if it's a manual compaction.
     if ((manual_compaction_ || is_compaction_needed) &&
-        bg_compaction_scheduled_ < options_.max_background_compactions &&
         (!bg_manual_only_ || manual_compaction_)) {
-
-      bg_compaction_scheduled_++;
-      env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
+      if (bg_compaction_scheduled_ < options_.max_background_compactions) {
+        bg_compaction_scheduled_++;
+        env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
+      } else {
+        bg_schedule_needed_ = true;
+      }
     }
   }
 }
@@ -1850,20 +1856,34 @@ void DBImpl::BackgroundCallFlush() {
     // to delete all obsolete files and we force FindObsoleteFiles()
     FindObsoleteFiles(deletion_state, !s.ok());
     // delete unnecessary files if any, this is done outside the mutex
-    if (deletion_state.HaveSomethingToDelete()) {
+    if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
       mutex_.Unlock();
+      // Have to flush the info logs before bg_flush_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the deconstructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
       log_buffer.FlushBufferToLog();
-      PurgeObsoleteFiles(deletion_state);
+      if (deletion_state.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(deletion_state);
+      }
       mutex_.Lock();
     }
 
     bg_flush_scheduled_--;
-    if (madeProgress) {
+    // Any time the mutex is released After finding the work to do, another
+    // thread might execute MaybeScheduleFlushOrCompaction(). It is possible
+    // that there is a pending job but it is not scheduled because of the
+    // max thread limit.
+    if (madeProgress || bg_schedule_needed_) {
       MaybeScheduleFlushOrCompaction();
     }
     bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be dealloacated and referencing them
+    // will cause trouble.
   }
-  log_buffer.FlushBufferToLog();
 }
 
 
@@ -1913,10 +1933,17 @@ void DBImpl::BackgroundCallCompaction() {
     FindObsoleteFiles(deletion_state, !s.ok());
 
     // delete unnecessary files if any, this is done outside the mutex
-    if (deletion_state.HaveSomethingToDelete()) {
+    if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
       mutex_.Unlock();
+      // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the deconstructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
       log_buffer.FlushBufferToLog();
-      PurgeObsoleteFiles(deletion_state);
+      if (deletion_state.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(deletion_state);
+      }
       mutex_.Lock();
     }
 
@@ -1927,12 +1954,20 @@
     // Previous compaction may have produced too many files in a level,
     // So reschedule another compaction if we made progress in the
     // last compaction.
-    if (madeProgress) {
+    //
+    // Also, any time the mutex is released After finding the work to do,
+    // another thread might execute MaybeScheduleFlushOrCompaction(). It is
+    // possible that there is a pending job but it is not scheduled because of
+    // the max thread limit.
+    if (madeProgress || bg_schedule_needed_) {
       MaybeScheduleFlushOrCompaction();
     }
     bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be dealloacated and referencing them
+    // will cause trouble.
   }
-  log_buffer.FlushBufferToLog();
 }
 
 Status DBImpl::BackgroundCompaction(bool* madeProgress,
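The db_impl.cc changes implement a deferred-reschedule pattern: when the background thread cap blocks scheduling, bg_schedule_needed_ records that a job was skipped, and the next worker to finish re-runs the scheduler. A simplified, standalone sketch of that idea, using hypothetical names (Scheduler, max_workers_) rather than the real DBImpl members:

// Hypothetical, simplified model of the bg_schedule_needed_ mechanism.
// All methods assume the caller holds the DB mutex, mirroring
// mutex_.AssertHeld() in the real code.
class Scheduler {
 public:
  // Mirrors MaybeScheduleFlushOrCompaction(): clear the flag, then either
  // schedule a worker or remember that one was skipped at the cap.
  void MaybeSchedule(bool work_pending) {
    schedule_needed_ = false;
    if (!work_pending) return;
    if (scheduled_ < max_workers_) {
      scheduled_++;
      // env_->Schedule(...) would be issued here in the real code.
    } else {
      schedule_needed_ = true;
    }
  }

  // Mirrors the tail of BackgroundCallFlush()/BackgroundCallCompaction():
  // after finishing, reschedule if progress was made or a job was deferred.
  void WorkerFinished(bool made_progress, bool work_pending) {
    scheduled_--;
    if (made_progress || schedule_needed_) {
      MaybeSchedule(work_pending);
    }
    // Signal waiters here; nothing may touch members afterwards, because a
    // waiter might be the destructor (the reason for the IMPORTANT comments).
  }

 private:
  int scheduled_ = 0;             // like bg_flush_scheduled_ / bg_compaction_scheduled_
  int max_workers_ = 1;           // like max_background_flushes / compactions
  bool schedule_needed_ = false;  // like bg_schedule_needed_
};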

db/db_impl.h  (+4)

@@ -410,6 +410,10 @@ class DBImpl : public DB {
   // part of ongoing compactions.
   std::set<uint64_t> pending_outputs_;
 
+  // At least one compaction or flush job is pending but not yet scheduled
+  // because of the max background thread limit.
+  bool bg_schedule_needed_;
+
   // count how many background compactions are running or have been scheduled
   int bg_compaction_scheduled_;
 

hdfs/env_hdfs.h  (+9)

@@ -110,6 +110,11 @@ class HdfsEnv : public Env {
 
   virtual void WaitForJoin() { posixEnv->WaitForJoin(); }
 
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const
+      override {
+    return posixEnv->GetThreadPoolQueueLen(pri);
+  }
+
   virtual Status GetTestDirectory(std::string* path) {
     return posixEnv->GetTestDirectory(path);
   }
@@ -292,6 +297,10 @@ class HdfsEnv : public Env {
 
   virtual void WaitForJoin() {}
 
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return 0;
+  }
+
   virtual Status GetTestDirectory(std::string* path) {return notsup;}
 
   virtual uint64_t NowMicros() {return 0;}

include/rocksdb/env.h  (+8)

@@ -210,6 +210,11 @@ class Env {
   // Wait for all threads started by StartThread to terminate.
   virtual void WaitForJoin() = 0;
 
+  // Get thread pool queue length for specific thrad pool.
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return 0;
+  }
+
   // *path is set to a temporary directory that can be used for testing. It may
   // or many not have just been created. The directory may or may not differ
   // between runs of the same process, but subsequent calls will return the
@@ -702,6 +707,9 @@ class EnvWrapper : public Env {
     return target_->StartThread(f, a);
   }
   void WaitForJoin() { return target_->WaitForJoin(); }
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return target_->GetThreadPoolQueueLen(pri);
+  }
   virtual Status GetTestDirectory(std::string* path) {
     return target_->GetTestDirectory(path);
   }

tools/db_crashtest.py  (+17 -8)

@@ -93,6 +93,8 @@ def main(argv):
               --max_background_compactions=20
               --max_bytes_for_level_base=10485760
               --filter_deletes=%s
+              --memtablerep=prefix_hash
+              --prefix_size=7
               """ % (ops_per_thread,
                      threads,
                      write_buf_size,
@@ -108,16 +110,23 @@ def main(argv):
         print("Running db_stress with pid=%d: %s\n\n"
               % (child.pid, cmd))
 
+        stop_early = False
         while time.time() < killtime:
-            time.sleep(10)
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+                stop_early = True
+                break
+            time.sleep(1)
 
-        if child.poll() is not None:
-            print("WARNING: db_stress ended before kill: exitcode=%d\n"
-                  % child.returncode)
-        else:
-            child.kill()
-            print("KILLED %d\n" % child.pid)
-            time.sleep(1)  # time to stabilize after a kill
+        if not stop_early:
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+            else:
+                child.kill()
+                print("KILLED %d\n" % child.pid)
+                time.sleep(1)  # time to stabilize after a kill
 
         while True:
             line = child.stderr.readline().strip()
tools/db_crashtest2.py  (+2)

@@ -107,6 +107,8 @@ def main(argv):
               --max_background_compactions=20
               --max_bytes_for_level_base=10485760
               --filter_deletes=%s
+              --memtablerep=prefix_hash
+              --prefix_size=7
               %s
               """ % (random.randint(0, 1),
                      threads,

tools/db_stress.cc  (+39 -23)

@@ -334,19 +334,19 @@ enum RepFactory StringToRepFactory(const char* ctype) {
   return kSkipList;
 }
 static enum RepFactory FLAGS_rep_factory;
-DEFINE_string(memtablerep, "skip_list", "");
+DEFINE_string(memtablerep, "prefix_hash", "");
 
 static bool ValidatePrefixSize(const char* flagname, int32_t value) {
-  if (value < 0 || value>=2000000000) {
-    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
+  if (value < 0 || value > 8) {
+    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize <= 8\n",
             flagname, value);
     return false;
   }
   return true;
 }
-DEFINE_int32(prefix_size, 0, "Control the prefix size for HashSkipListRep");
-static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep");
+static const bool FLAGS_prefix_size_dummy =
+    google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
 
 DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
             "that behaves like a Put");
@@ -951,15 +951,15 @@ class StressTest {
     return s;
   }
 
-  // Given a prefix P, this does prefix scans for "0"+P, "1"+P,..."9"+P
-  // in the same snapshot. Each of these 10 scans returns a series of
-  // values; each series should be the same length, and it is verified
-  // for each index i that all the i'th values are of the form "0"+V,
-  // "1"+V,..."9"+V.
+  // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
+  // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
+  // of the key. Each of these 10 scans returns a series of values;
+  // each series should be the same length, and it is verified for each
+  // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
   // ASSUMES that MultiPut was used to put (K, V)
   Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
                          ColumnFamilyHandle* column_family,
-                         const Slice& prefix) {
+                         const Slice& key) {
     std::string prefixes[10] = {"0", "1", "2", "3", "4",
                                 "5", "6", "7", "8", "9"};
     Slice prefix_slices[10];
@@ -968,8 +968,9 @@
     Iterator* iters[10];
     Status s = Status::OK();
     for (int i = 0; i < 10; i++) {
-      prefixes[i] += prefix.ToString();
-      prefix_slices[i] = prefixes[i];
+      prefixes[i] += key.ToString();
+      prefixes[i].resize(FLAGS_prefix_size);
+      prefix_slices[i] = Slice(prefixes[i]);
       readoptionscopy[i] = readoptions;
       readoptionscopy[i].prefix = &prefix_slices[i];
       readoptionscopy[i].snapshot = snapshot;
@@ -1000,7 +1001,7 @@
     for (int i = 0; i < 10; i++) {
       if (values[i] != values[0]) {
         fprintf(stderr, "error : inconsistent values for prefix %s: %s, %s\n",
-                prefix.ToString().c_str(), values[0].c_str(),
+                prefixes[i].c_str(), values[0].c_str(),
                 values[i].c_str());
         // we continue after error rather than exiting so that we can
         // find more errors if any
@@ -1035,6 +1036,7 @@
     const Snapshot* snapshot = db_->GetSnapshot();
     ReadOptions readoptionscopy = readoptions;
     readoptionscopy.snapshot = snapshot;
+    readoptionscopy.prefix_seek = FLAGS_prefix_size > 0;
     unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
 
     iter->Seek(key);
@@ -1149,27 +1151,29 @@
       }
     } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
       // OPERATION prefix scan
-      // keys are longs (e.g., 8 bytes), so we let prefixes be
-      // everything except the last byte. So there will be 2^8=256
-      // keys per prefix.
-      Slice prefix = Slice(key.data(), key.size() - 1);
+      // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are
+      // (8 - FLAGS_prefix_size) bytes besides the prefix. So there will
+      // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same
+      // prefix
       if (!FLAGS_test_batches_snapshots) {
+        Slice prefix = Slice(key.data(), FLAGS_prefix_size);
         read_opts.prefix = &prefix;
         Iterator* iter = db_->NewIterator(read_opts, column_family);
-        int count = 0;
+        int64_t count = 0;
        for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
           assert(iter->key().starts_with(prefix));
-          count++;
+          ++count;
         }
-        assert(count <= 256);
+        assert(count <=
+               (static_cast<int64_t>(1) << ((8 - FLAGS_prefix_size) * 8)));
         if (iter->status().ok()) {
           thread->stats.AddPrefixes(1, count);
         } else {
           thread->stats.AddErrors(1);
         }
         delete iter;
       } else {
-        MultiPrefixScan(thread, read_opts, column_family, prefix);
+        MultiPrefixScan(thread, read_opts, column_family, key);
       }
       read_opts.prefix = nullptr;
     } else if (prefixBound <= prob_op && prob_op < writeBound) {
@@ -1617,6 +1621,18 @@ int main(int argc, char** argv) {
   // max number of concurrent compactions.
   FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
 
+  if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) {
+    fprintf(stderr,
+            "Error: prefixpercent is non-zero while prefix_size is "
+            "not positive!\n");
+    exit(1);
+  }
+  if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) {
+    fprintf(stderr,
+            "Error: please specify prefix_size for "
+            "test_batches_snapshots test!\n");
+    exit(1);
+  }
   if ((FLAGS_readpercent + FLAGS_prefixpercent +
        FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) {
     fprintf(stderr,