Skip to content

Commit 8454cfe

Browse files
committed
Add read/modify/write functionality to Put() api
Summary: The application can set a callback function, which is applied to the previous value and calculates the new value. This new value is written in place if the previous value exists in the memtable and the new value is no larger than the previous value; otherwise the new value is added normally. Test Plan: fbmake. Added unit tests. All unit tests pass. Reviewers: dhruba, haobo Reviewed By: haobo CC: sdong, kailiu, xinyaohu, sumeet, leveldb Differential Revision: https://reviews.facebook.net/D14745
1 parent aa0ef66 commit 8454cfe

File tree

6 files changed

+309
-70
lines changed

6 files changed

+309
-70
lines changed

db/db_test.cc

+115-35
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ class DBTest {
555555
case kTypeDeletion:
556556
result += "DEL";
557557
break;
558-
case kTypeLogData:
558+
default:
559559
assert(false);
560560
break;
561561
}
@@ -705,6 +705,44 @@ class DBTest {
705705
ASSERT_EQ(IterStatus(iter), expected_key);
706706
delete iter;
707707
}
708+
709+
710+
// Used to test InplaceUpdate
711+
712+
// If previous value is nullptr or delta is larger than previous value,
713+
// sets newValue with delta
714+
// If previous value is not empty,
715+
// updates previous value with 'b' string of previous value size
716+
static bool updateInPlace(char* prevValue, size_t prevSize,
717+
Slice delta, std::string* newValue) {
718+
if (prevValue == nullptr || delta.size() > prevSize) {
719+
*newValue = std::string(delta.size(), 'c');
720+
return false;
721+
} else {
722+
std::string str_b = std::string(prevSize, 'b');
723+
memcpy(prevValue, str_b.c_str(), str_b.size());
724+
return true;
725+
}
726+
}
727+
728+
// Used to test InplaceUpdate
729+
void validateNumberOfEntries(int numValues) {
730+
Iterator* iter = dbfull()->TEST_NewInternalIterator();
731+
iter->SeekToFirst();
732+
ASSERT_EQ(iter->status().ok(), true);
733+
int seq = numValues;
734+
while (iter->Valid()) {
735+
ParsedInternalKey ikey;
736+
ikey.sequence = -1;
737+
ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
738+
739+
// checks sequence number for updates
740+
ASSERT_EQ(ikey.sequence, (unsigned)seq--);
741+
iter->Next();
742+
}
743+
delete iter;
744+
ASSERT_EQ(0, seq);
745+
}
708746
};
709747
std::unique_ptr<const SliceTransform> DBTest::prefix_1_transform(
710748
NewFixedPrefixTransform(1));
@@ -2391,60 +2429,102 @@ TEST(DBTest, InPlaceUpdate) {
23912429
options.inplace_update_support = true;
23922430
options.env = env_;
23932431
options.write_buffer_size = 100000;
2432+
Reopen(&options);
23942433

23952434
// Update key with values of smaller size
2396-
Reopen(&options);
23972435
int numValues = 10;
23982436
for (int i = numValues; i > 0; i--) {
23992437
std::string value = DummyString(i, 'a');
24002438
ASSERT_OK(Put("key", value));
24012439
ASSERT_EQ(value, Get("key"));
24022440
}
24032441

2404-
int count = 0;
2405-
Iterator* iter = dbfull()->TEST_NewInternalIterator();
2406-
iter->SeekToFirst();
2407-
ASSERT_EQ(iter->status().ok(), true);
2408-
while (iter->Valid()) {
2409-
ParsedInternalKey ikey(Slice(), 0, kTypeValue);
2410-
ikey.sequence = -1;
2411-
ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
2412-
count++;
2413-
// All updates with the same sequence number.
2414-
ASSERT_EQ(ikey.sequence, (unsigned)1);
2415-
iter->Next();
2416-
}
24172442
// Only 1 instance for that key.
2418-
ASSERT_EQ(count, 1);
2419-
delete iter;
2443+
validateNumberOfEntries(1);
2444+
2445+
} while (ChangeCompactOptions());
2446+
}
2447+
2448+
TEST(DBTest, InPlaceUpdateLargeNewValue) {
2449+
do {
2450+
Options options = CurrentOptions();
2451+
options.create_if_missing = true;
2452+
options.inplace_update_support = true;
2453+
options.env = env_;
2454+
options.write_buffer_size = 100000;
2455+
Reopen(&options);
24202456

24212457
// Update key with values of larger size
2422-
DestroyAndReopen(&options);
2423-
numValues = 10;
2458+
int numValues = 10;
24242459
for (int i = 0; i < numValues; i++) {
24252460
std::string value = DummyString(i, 'a');
24262461
ASSERT_OK(Put("key", value));
24272462
ASSERT_EQ(value, Get("key"));
24282463
}
24292464

2430-
count = 0;
2431-
iter = dbfull()->TEST_NewInternalIterator();
2432-
iter->SeekToFirst();
2433-
ASSERT_EQ(iter->status().ok(), true);
2434-
int seq = numValues;
2435-
while (iter->Valid()) {
2436-
ParsedInternalKey ikey(Slice(), 0, kTypeValue);
2437-
ikey.sequence = -1;
2438-
ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
2439-
count++;
2440-
// No inplace updates. All updates are puts with new seq number
2441-
ASSERT_EQ(ikey.sequence, (unsigned)seq--);
2442-
iter->Next();
2443-
}
24442465
// All 10 updates exist in the internal iterator
2445-
ASSERT_EQ(count, numValues);
2446-
delete iter;
2466+
validateNumberOfEntries(numValues);
24472467

2468+
} while (ChangeCompactOptions());
2469+
}
2470+
2471+
2472+
TEST(DBTest, InPlaceUpdateCallback) {
2473+
do {
2474+
Options options = CurrentOptions();
2475+
options.create_if_missing = true;
2476+
options.inplace_update_support = true;
2477+
2478+
options.env = env_;
2479+
options.write_buffer_size = 100000;
2480+
options.inplace_callback =
2481+
rocksdb::DBTest::updateInPlace;
2482+
Reopen(&options);
2483+
2484+
// Update key with values of smaller size
2485+
int numValues = 10;
2486+
ASSERT_OK(Put("key", DummyString(numValues, 'a')));
2487+
ASSERT_EQ(DummyString(numValues, 'c'), Get("key"));
2488+
2489+
for (int i = numValues; i > 0; i--) {
2490+
ASSERT_OK(Put("key", DummyString(i, 'a')));
2491+
ASSERT_EQ(DummyString(numValues, 'b'), Get("key"));
2492+
}
2493+
2494+
// Only 1 instance for that key.
2495+
validateNumberOfEntries(1);
2496+
2497+
} while (ChangeCompactOptions());
2498+
}
2499+
2500+
TEST(DBTest, InPlaceUpdateCallbackNotFound) {
2501+
do {
2502+
//Test sst get/update/put
2503+
} while (ChangeCompactOptions());
2504+
}
2505+
2506+
TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) {
2507+
do {
2508+
Options options = CurrentOptions();
2509+
options.create_if_missing = true;
2510+
options.inplace_update_support = true;
2511+
2512+
options.env = env_;
2513+
options.write_buffer_size = 100000;
2514+
options.inplace_callback =
2515+
rocksdb::DBTest::updateInPlace;
2516+
Reopen(&options);
2517+
2518+
// Update key with values of larger size
2519+
int numValues = 10;
2520+
for (int i = 1; i <= numValues; i++) {
2521+
ASSERT_OK(Put("key", DummyString(i, 'a')));
2522+
ASSERT_EQ(DummyString(i, 'c'), Get("key"));
2523+
}
2524+
2525+
// No inplace updates. All updates are puts with new seq number
2526+
// All 10 updates exist in the internal iterator
2527+
validateNumberOfEntries(numValues);
24482528

24492529
} while (ChangeCompactOptions());
24502530
}

db/memtable.cc

+95-18
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
302302
}
303303
break;
304304
}
305-
case kTypeLogData:
305+
default:
306306
assert(false);
307307
break;
308308
}
@@ -322,7 +322,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
322322
return found_final_value;
323323
}
324324

325-
bool MemTable::Update(SequenceNumber seq, ValueType type,
325+
void MemTable::Update(SequenceNumber seq,
326326
const Slice& key,
327327
const Slice& value) {
328328
LookupKey lkey(key, seq);
@@ -335,7 +335,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
335335

336336
if (iter->Valid()) {
337337
// entry format is:
338-
// klength varint32
338+
// key_length varint32
339339
// userkey char[klength-8]
340340
// tag uint64
341341
// vlength varint32
@@ -352,37 +352,114 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
352352
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
353353
switch (static_cast<ValueType>(tag & 0xff)) {
354354
case kTypeValue: {
355-
uint32_t vlength;
356-
GetVarint32Ptr(key_ptr + key_length,
357-
key_ptr + key_length+5, &vlength);
355+
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
356+
uint32_t prev_value_size = prev_value.size();
357+
uint32_t new_value_size = value.size();
358+
358359
// Update value, if newValue size <= curValue size
359-
if (value.size() <= vlength) {
360+
if (new_value_size <= prev_value_size ) {
360361
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
361-
value.size());
362+
new_value_size);
362363
WriteLock wl(GetLock(lkey.user_key()));
363-
memcpy(p, value.data(), value.size());
364+
memcpy(p, value.data(), new_value_size);
364365
assert(
365-
(p + value.size()) - entry ==
366+
(p + new_value_size) - entry ==
366367
(unsigned) (VarintLength(key_length) +
367368
key_length +
368-
VarintLength(value.size()) +
369-
value.size())
369+
VarintLength(new_value_size) +
370+
new_value_size)
370371
);
371372
// no need to update bloom, as user key does not change.
372-
return true;
373+
return;
373374
}
374375
}
375376
default:
376377
// If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
377-
// then we probably don't have enough space to update in-place
378-
// Maybe do something later
379-
// Return false, and do normal Add()
380-
return false;
378+
// we don't have enough space for update inplace
379+
Add(seq, kTypeValue, key, value);
380+
return;
381381
}
382382
}
383383
}
384384

385-
// Key doesn't exist
385+
// key doesn't exist
386+
Add(seq, kTypeValue, key, value);
387+
}
388+
389+
bool MemTable::UpdateCallback(SequenceNumber seq,
390+
const Slice& key,
391+
const Slice& delta,
392+
const Options& options) {
393+
LookupKey lkey(key, seq);
394+
Slice memkey = lkey.memtable_key();
395+
396+
std::shared_ptr<MemTableRep::Iterator> iter(
397+
table_->GetIterator(lkey.user_key()));
398+
iter->Seek(key, memkey.data());
399+
400+
if (iter->Valid()) {
401+
// entry format is:
402+
// key_length varint32
403+
// userkey char[key_length-8]
404+
// tag uint64
405+
// vlength varint32
406+
// value char[vlength]
407+
// Check that it belongs to same user key. We do not check the
408+
// sequence number since the Seek() call above should have skipped
409+
// all entries with overly large sequence numbers.
410+
const char* entry = iter->key();
411+
uint32_t key_length;
412+
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
413+
if (comparator_.comparator.user_comparator()->Compare(
414+
Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
415+
// Correct user key
416+
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
417+
switch (static_cast<ValueType>(tag & 0xff)) {
418+
case kTypeValue: {
419+
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
420+
uint32_t prev_value_size = prev_value.size();
421+
422+
WriteLock wl(GetLock(lkey.user_key()));
423+
std::string str_value;
424+
if (options.inplace_callback(const_cast<char*>(prev_value.data()),
425+
prev_value_size, delta, &str_value)) {
426+
// Value already updated by callback.
427+
// TODO: Change size of value in memtable slice.
428+
// This works for leaf, since size is already encoded in the
429+
// value. It doesn't depend on rocksdb buffer size.
430+
return true;
431+
}
432+
Slice slice_value = Slice(str_value);
433+
uint32_t new_value_size = slice_value.size();
434+
435+
// Update value, if newValue size <= curValue size
436+
if (new_value_size <= prev_value_size ) {
437+
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
438+
new_value_size);
439+
440+
memcpy(p, slice_value.data(), new_value_size);
441+
assert(
442+
(p + new_value_size) - entry ==
443+
(unsigned) (VarintLength(key_length) +
444+
key_length +
445+
VarintLength(new_value_size) +
446+
new_value_size)
447+
);
448+
return true;
449+
} else {
450+
// If we don't have enough space to update in-place
451+
// do a normal Add() of the new value and report the key as handled
452+
Add(seq, kTypeValue, key, slice_value);
453+
return true;
454+
}
455+
}
456+
default:
457+
break;
458+
}
459+
}
460+
}
461+
// If the latest value is not kTypeValue
462+
// or key doesn't exist
386463
return false;
387464
}
388465
} // namespace rocksdb

db/memtable.h

+23-8
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,31 @@ class MemTable {
9898
bool Get(const LookupKey& key, std::string* value, Status* s,
9999
MergeContext& merge_context, const Options& options);
100100

101-
// Update the value and return status ok,
102-
// if key exists in current memtable
103-
// if new sizeof(new_value) <= sizeof(old_value) &&
104-
// old_value for that key is a put i.e. kTypeValue
105-
// else return false, and status - NotUpdatable()
106-
// else return false, and status - NotFound()
107-
bool Update(SequenceNumber seq, ValueType type,
101+
// Attempts to update the new_value inplace, else does normal Add
102+
// Pseudocode
103+
// if key exists in current memtable && prev_value is of type kTypeValue
104+
// if sizeof(new_value) <= sizeof(prev_value)
105+
// update inplace
106+
// else add(key, new_value)
107+
// else add(key, new_value)
108+
void Update(SequenceNumber seq,
108109
const Slice& key,
109110
const Slice& value);
110111

112+
// If prev_value for key exists, attempts to update it inplace.
113+
// else returns false
114+
// Pseudocode
115+
// if key exists in current memtable && prev_value is of type kTypeValue
116+
// new_value = delta(prev_value)
117+
// if sizeof(new_value) <= sizeof(prev_value)
118+
// update inplace
119+
// else add(key, new_value)
120+
// else return false
121+
bool UpdateCallback(SequenceNumber seq,
122+
const Slice& key,
123+
const Slice& delta,
124+
const Options& options);
125+
111126
// Returns the edits area that is needed for flushing the memtable
112127
VersionEdit* GetEdits() { return &edit_; }
113128

@@ -149,7 +164,7 @@ class MemTable {
149164
bool flush_completed_; // finished the flush
150165
uint64_t file_number_; // filled up after flush is complete
151166

152-
// The udpates to be applied to the transaction log when this
167+
// The updates to be applied to the transaction log when this
153168
// memtable is flushed to storage.
154169
VersionEdit edit_;
155170

0 commit comments

Comments
 (0)