Skip to content

Commit 5ffd938

Browse files
author
moonshadow565
committed
automatic bundle partitioning
1 parent 6398838 commit 5ffd938

File tree

6 files changed

+174
-58
lines changed

6 files changed

+174
-58
lines changed

lib/rlib/common.hpp

+5
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,11 @@
5555
}()
5656

5757
namespace rlib {
58+
// Binary size units, expressed in bytes.
// `inline constexpr` gives one immutable definition shared by every translation unit that
// includes this header and makes the values usable in constant expressions.
// Note: TiB (2^40) only fits when std::size_t is 64 bits wide.
inline constexpr std::size_t KiB = 1024;
inline constexpr std::size_t MiB = KiB * 1024;
inline constexpr std::size_t GiB = MiB * 1024;
inline constexpr std::size_t TiB = GiB * 1024;
62+
5863
namespace fs = std::filesystem;
5964

6065
[[noreturn]] extern void throw_error(char const* from, char const* msg);

lib/rlib/rcache.cpp

+105-30
Original file line numberDiff line numberDiff line change
@@ -9,35 +9,82 @@
99

1010
using namespace rlib;
1111

12-
static constexpr auto rcache_file_flags(RCache::Options const& options) -> IO::Flags {
13-
return (options.readonly ? IO::READ : IO::WRITE) | IO::NO_INTERUPT | IO::NO_OVERGROW;
12+
static constexpr auto rcache_file_flags(bool readonly) -> IO::Flags {
13+
return (readonly ? IO::READ : IO::WRITE) | IO::NO_INTERUPT | IO::NO_OVERGROW;
1414
}
1515

16-
RCache::RCache(Options const& options) : file_(options.path, rcache_file_flags(options)), options_(options) {
17-
auto file_size = file_.size();
18-
if (file_size == 0 && !options.readonly) {
19-
flush();
20-
return;
16+
// Builds the path of the bundle file with the given index.
// Index 0 is the base path itself; any other index replaces the base path's extension
// with ".NNNNN.bundle", where NNNNN is the index zero-padded to five digits.
static auto rcache_file_path(fs::path base, std::size_t index) -> fs::path {
    if (index != 0) {
        auto const extension = fmt::format(".{:05d}.bundle", index);
        base.replace_extension(extension);
    }
    return base;
}
20+
21+
RCache::RCache(Options const& options) : options_(options) {
22+
if (!options_.readonly) {
23+
options_.flush_size = std::max(32 * MiB, options_.flush_size);
24+
options_.max_size = std::max(options_.flush_size * 2, options_.max_size) - options_.flush_size;
25+
}
26+
for (fs::path path = options_.path;;) {
27+
auto const index = files_.size();
28+
auto next_path = rcache_file_path(options_.path, index + 1);
29+
auto const next_exists = fs::exists(next_path);
30+
auto const flags = rcache_file_flags(options_.readonly || next_exists);
31+
32+
auto file = std::make_unique<IO::File>(path, flags);
33+
auto const is_empty = file->size() == 0;
34+
auto bundle = !is_empty ? RBUN::read(*file) : RBUN{};
35+
files_.push_back(std::move(file));
36+
for (auto& chunk : bundle.lookup) {
37+
chunk.second.bundleId = (BundleID)index;
38+
}
39+
lookup_.merge(std::move(bundle.lookup));
40+
41+
if (flags & IO::WRITE) {
42+
writer_ = {
43+
.toc_offset = bundle.toc_offset,
44+
.end_offset = bundle.toc_offset + sizeof(RBUN::Footer),
45+
.chunks = std::move(bundle.chunks),
46+
};
47+
writer_.end_offset += sizeof(RChunk) * writer_.chunks.size();
48+
writer_.buffer.reserve(options_.flush_size * 2);
49+
can_write_ = true;
50+
if (is_empty) {
51+
this->flush();
52+
} else {
53+
this->check_space(options_.flush_size);
54+
}
55+
}
56+
57+
if (!next_exists) {
58+
break;
59+
}
60+
61+
path = std::move(next_path);
2162
}
22-
bundle_ = RBUN::read(file_);
2363
}
2464

2565
RCache::~RCache() { this->flush(); }
2666

2767
auto RCache::add(RChunk const& chunk, std::span<char const> data) -> bool {
2868
rlib_assert(chunk.compressed_size == data.size());
29-
if (!can_write() || bundle_.lookup.contains(chunk.chunkId)) {
69+
if (!can_write() || lookup_.contains(chunk.chunkId)) {
3070
return false;
3171
}
3272
if (chunk.chunkId == ChunkID::None) {
3373
return false;
3474
}
35-
bundle_.chunks.push_back(chunk);
36-
bundle_.lookup[chunk.chunkId] = {chunk, BundleID::None, buffer_.size() + bundle_.toc_offset};
37-
buffer_.insert(buffer_.end(), data.begin(), data.end());
38-
if (buffer_.size() > options_.flush_size) {
75+
76+
// check if we hit chunk limit
77+
auto const extra_data = sizeof(RChunk) + data.size();
78+
this->check_space(extra_data);
79+
80+
writer_.chunks.push_back(chunk);
81+
// Tag the entry with the index of the file currently being written (files_.back()):
// bundleId is used as an index into files_ when the chunk is read back, so it must not
// stay BundleID::None once the writer has rolled over to a later bundle file.
lookup_[chunk.chunkId] = {chunk, (BundleID)(files_.size() - 1), writer_.buffer.size() + writer_.toc_offset};
82+
writer_.buffer.insert(writer_.buffer.end(), data.begin(), data.end());
83+
if (writer_.buffer.size() > options_.flush_size) {
3984
this->flush();
4085
}
86+
writer_.end_offset += extra_data;
87+
4188
return true;
4289
}
4390

@@ -58,11 +105,11 @@ auto RCache::add_uncompressed(std::span<char const> src, int level) -> RChunk::S
58105
return chunk;
59106
}
60107

61-
auto RCache::contains(ChunkID chunkId) const noexcept -> bool { return bundle_.lookup.contains(chunkId); }
108+
auto RCache::contains(ChunkID chunkId) const noexcept -> bool { return lookup_.contains(chunkId); }
62109

63110
auto RCache::find(ChunkID chunkId) const noexcept -> RChunk::Src {
64-
auto i = bundle_.lookup.find(chunkId);
65-
if (i == bundle_.lookup.end()) {
111+
auto i = lookup_.find(chunkId);
112+
if (i == lookup_.end()) {
66113
return {};
67114
}
68115
return i->second;
@@ -82,6 +129,7 @@ auto RCache::uncache(std::vector<RChunk::Dst> chunks, RChunk::Dst::data_cb on_da
82129
rlib_assert(c.uncompressed_size == chunk.uncompressed_size);
83130
chunk.compressed_offset = c.compressed_offset;
84131
chunk.compressed_size = c.compressed_size;
132+
chunk.bundleId = c.bundleId;
85133
found.push_back(chunk);
86134
return true;
87135
});
@@ -95,11 +143,12 @@ auto RCache::uncache(std::vector<RChunk::Dst> chunks, RChunk::Dst::data_cb on_da
95143
on_data(chunk, dst);
96144
continue;
97145
}
98-
auto src = std::span(buffer_);
99-
if (chunk.compressed_offset > bundle_.toc_offset) {
100-
src = src.subspan(chunk.compressed_offset - bundle_.toc_offset, chunk.compressed_size);
146+
auto src = std::span<char const>{};
147+
auto const& file = files_.at((std::size_t)chunk.bundleId);
148+
// Data at or beyond writer_.toc_offset in the last file has not been flushed yet and only
// exists in writer_.buffer (add() places new data at toc_offset + buffer.size()), so it is
// sliced out of that buffer. Slicing the still-empty `src` span would be out of range.
if (can_write() && &file == &files_.back() && chunk.compressed_offset >= writer_.toc_offset) {
    src = std::span<char const>(writer_.buffer).subspan(chunk.compressed_offset - writer_.toc_offset, chunk.compressed_size);
101150
} else {
102-
src = file_.copy(chunk.compressed_offset, chunk.compressed_size);
151+
src = file->copy(chunk.compressed_offset, chunk.compressed_size);
103152
}
104153
dst = zstd_decompress(src, chunk.uncompressed_size);
105154
on_data(chunk, dst);
@@ -108,23 +157,49 @@ auto RCache::uncache(std::vector<RChunk::Dst> chunks, RChunk::Dst::data_cb on_da
108157
return std::move(chunks);
109158
}
110159

160+
auto RCache::check_space(std::size_t extra) -> bool {
161+
// ensure we can allways at least write one file
162+
if (writer_.end_offset <= sizeof(RBUN::Footer)) {
163+
return false;
164+
}
165+
// still have space
166+
if (writer_.end_offset + extra < options_.max_size) {
167+
return false;
168+
}
169+
this->flush(); // flush anything that we have atm
170+
auto const index = files_.size();
171+
auto const path = rcache_file_path(options_.path, index);
172+
auto const flags = rcache_file_flags(false);
173+
auto file = std::make_unique<IO::File>(path, flags);
174+
file->resize(0, 0);
175+
files_.push_back(std::move(file));
176+
writer_.toc_offset = 0;
177+
writer_.end_offset = sizeof(RBUN::Footer);
178+
writer_.chunks.clear();
179+
writer_.buffer.clear();
180+
this->flush();
181+
return true;
182+
}
183+
111184
auto RCache::flush() -> bool {
112185
// Dont reflush when there is nothing to flush.
113-
if (!can_write() || (buffer_.empty() && bundle_.toc_offset != 0)) {
186+
if (!can_write() || (writer_.buffer.empty() && writer_.toc_offset != 0)) {
114187
return false;
115188
}
116-
auto toc_size = sizeof(RChunk) * bundle_.chunks.size();
189+
auto toc_size = sizeof(RChunk) * writer_.chunks.size();
117190
RBUN::Footer footer = {
118-
.checksum = std::bit_cast<std::array<char, 8>>(XXH64((char const*)bundle_.chunks.data(), toc_size, 0)),
119-
.entry_count = (std::uint32_t)bundle_.chunks.size(),
191+
.checksum = std::bit_cast<std::array<char, 8>>(XXH64((char const*)writer_.chunks.data(), toc_size, 0)),
192+
.entry_count = (std::uint32_t)writer_.chunks.size(),
120193
.version = RBUN::Footer::VERSION,
121194
.magic = {'R', 'B', 'U', 'N'},
122195
};
123-
auto new_toc_offset = bundle_.toc_offset + buffer_.size();
124-
buffer_.insert(buffer_.end(), (char const*)bundle_.chunks.data(), (char const*)bundle_.chunks.data() + toc_size);
125-
buffer_.insert(buffer_.end(), (char const*)&footer, (char const*)&footer + sizeof(footer));
126-
rlib_assert(file_.write(bundle_.toc_offset, buffer_));
127-
buffer_.clear();
128-
bundle_.toc_offset = new_toc_offset;
196+
auto new_toc_offset = writer_.toc_offset + writer_.buffer.size();
197+
writer_.buffer.insert(writer_.buffer.end(),
198+
(char const*)writer_.chunks.data(),
199+
(char const*)writer_.chunks.data() + toc_size);
200+
writer_.buffer.insert(writer_.buffer.end(), (char const*)&footer, (char const*)&footer + sizeof(footer));
201+
rlib_assert(files_.back()->write(writer_.toc_offset, writer_.buffer));
202+
writer_.buffer.clear();
203+
writer_.toc_offset = new_toc_offset;
129204
return true;
130205
}

lib/rlib/rcache.hpp

+17-6
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ namespace rlib {
1212
struct Options {
1313
std::string path;
1414
bool readonly;
15-
std::uint32_t flush_size;
15+
std::size_t flush_size;
16+
std::size_t max_size;
1617
};
1718

1819
RCache(Options const& options);
@@ -30,11 +31,21 @@ namespace rlib {
3031

3132
auto flush() -> bool;
3233

33-
auto can_write() const noexcept -> bool { return file_.fd() && !options_.readonly; }
34+
auto can_write() const noexcept -> bool { return can_write_; }
35+
3436
private:
35-
IO::File file_;
36-
Options options_;
37-
std::vector<char> buffer_;
38-
RBUN bundle_;
37+
struct Writer {  // Bookkeeping for the bundle file that is currently being written.
38+
std::size_t toc_offset;  // File offset where flush() writes the buffer; advanced by the flushed data size.
39+
std::size_t end_offset;  // Projected file size (data + TOC entries + footer); compared against max_size.
40+
std::vector<RChunk> chunks;  // TOC entries of this bundle, serialized after the data on every flush().
41+
std::vector<char> buffer;  // Chunk data appended by add() that has not been flushed to the file yet.
42+
};
43+
bool can_write_ = {};
44+
Options options_ = {};
45+
Writer writer_ = {};
46+
std::vector<std::unique_ptr<IO::File>> files_;
47+
std::unordered_map<ChunkID, RChunk::Src> lookup_ = {};
48+
49+
auto check_space(std::size_t extra) -> bool;
3950
};
4051
}

src/rbun_merge.cpp

+16-9
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@ using namespace rlib;
99

1010
struct Main {
1111
struct CLI {
12-
std::string output = {};
12+
RCache::Options output = {};
1313
std::vector<std::string> inputs = {};
1414
bool no_hash = {};
1515
bool no_extract = {};
1616
bool no_progress = {};
17-
std::uint32_t buffer = {};
1817
} cli = {};
1918

2019
auto parse_args(int argc, char** argv) -> void {
@@ -32,22 +31,30 @@ struct Main {
3231
.help("Do not print progress to cerr.")
3332
.default_value(false)
3433
.implicit_value(true);
35-
3634
program.add_argument("--buffer")
37-
.help("Size for buffer before flush to disk in killobytes [64, 1048576]")
38-
.default_value(std::uint32_t{32 * 1024 * 1024u})
35+
.help("Size for buffer before flush to disk in megabytes [1, 1048576]")
36+
.default_value(std::uint32_t{32})
37+
.action([](std::string const& value) -> std::uint32_t {
38+
return std::clamp((std::uint32_t)std::stoul(value), 1u, 1024u * 1024);
39+
});
40+
program.add_argument("--limit")
41+
.help("Size for bundle limit in gigabytes [0, 4096]")
42+
.default_value(std::uint32_t{4096})
3943
.action([](std::string const& value) -> std::uint32_t {
40-
return std::clamp((std::uint32_t)std::stoul(value), 64u, 1024u * 1024) * 1024u;
44+
return std::clamp((std::uint32_t)std::stoul(value), 0u, 4096u);
4145
});
4246

4347
program.parse_args(argc, argv);
4448

45-
cli.output = program.get<std::string>("output");
49+
cli.output = {
50+
.path = program.get<std::string>("output"),
51+
.flush_size = program.get<std::uint32_t>("--buffer") * MiB,
52+
.max_size = program.get<std::uint32_t>("--limit") * GiB,
53+
};
4654
cli.inputs = program.get<std::vector<std::string>>("input");
4755
// Read each flag from the command-line option of the same name; the two lookups were
// previously crossed, so --no-hash toggled no_extract and vice versa.
cli.no_hash = program.get<bool>("--no-hash");
cli.no_extract = program.get<bool>("--no-extract");
4957
cli.no_progress = program.get<bool>("--no-progress");
50-
cli.buffer = program.get<std::uint32_t>("--buffer");
5158
}
5259

5360
auto run() {
@@ -57,7 +64,7 @@ struct Main {
5764
return;
5865
}
5966
std::cerr << "Processing output bundle ... " << std::endl;
60-
auto output = RCache(RCache::Options{.path = cli.output, .readonly = false, .flush_size = cli.buffer});
67+
auto output = RCache(cli.output);
6168
std::cerr << "Processing input bundles ... " << std::endl;
6269
for (std::uint32_t index = paths.size(); auto const& path : paths) {
6370
add_bundle(path, output, index--);

src/rman_dl.cpp

+11-5
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,16 @@ struct Main {
6363
.default_value(false)
6464
.implicit_value(true);
6565
program.add_argument("--cache-buffer")
66-
.help("Size for cache buffer in killobytes [64, 1048576]")
67-
.default_value(std::uint32_t{32 * 1024 * 1024})
68-
.scan<'u', std::uint32_t>()
66+
.help("Size for cache buffer in megabytes [1, 1048576]")
67+
.default_value(std::uint32_t{32})
68+
.action([](std::string const& value) -> std::uint32_t {
69+
return std::clamp((std::uint32_t)std::stoul(value), 1u, 1024u * 1024);
70+
});
71+
program.add_argument("--cache-limit")
72+
.help("Size for cache bundle limit in gigabytes [0, 4096]")
73+
.default_value(std::uint32_t{4})
6974
.action([](std::string const& value) -> std::uint32_t {
70-
return std::clamp((std::uint32_t)std::stoul(value), 64u, 1024u * 1024) * 1024u;
75+
return std::clamp((std::uint32_t)std::stoul(value), 0u, 4096u);
7176
});
7277

7378
// CDN options
@@ -117,7 +122,8 @@ struct Main {
117122
cli.cache = {
118123
.path = program.get<std::string>("--cache"),
119124
.readonly = program.get<bool>("--cache-readonly"),
120-
.flush_size = program.get<std::uint32_t>("--cache-buffer"),
125+
.flush_size = program.get<std::uint32_t>("--cache-buffer") * MiB,
126+
.max_size = program.get<std::uint32_t>("--cache-limit") * GiB,
121127
};
122128

123129
cli.cdn = {

0 commit comments

Comments
 (0)