Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Differantial encoding for name offsets #1069

Merged
merged 11 commits into from
Jun 15, 2014
231 changes: 231 additions & 0 deletions DataStructures/RangeTable.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
#ifndef __RANGE_TABLE_H__
#define __RANGE_TABLE_H__

#include "SharedMemoryFactory.h"
#include "SharedMemoryVectorWrapper.h"

#include <boost/range/irange.hpp>

#include <fstream>
#include <vector>
#include <array>

/*
* These pre-declarations are needed because parsing C++ is hard
* and otherwise the compiler gets confused.
*/

template<unsigned BLOCK_SIZE=16, bool USE_SHARED_MEMORY = false> class RangeTable;

template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
std::ostream& operator<<(std::ostream &out, const RangeTable<BLOCK_SIZE, USE_SHARED_MEMORY> &table);

template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
std::istream& operator>>(std::istream &in, RangeTable<BLOCK_SIZE, USE_SHARED_MEMORY> &table);

/**
* Stores adjacent ranges in a compressed format.
*
* Maximum supported length of a range is 255.
*
* Note: BLOCK_SIZE is the number of differential encodoed values.
* But each block consists of an absolute value and BLOCK_SIZE differential values.
* So the effective block size is sizeof(unsigned) + BLOCK_SIZE.
*/
template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
class RangeTable
{
public:

typedef std::array<unsigned char, BLOCK_SIZE> BlockT;
typedef typename ShM<BlockT, USE_SHARED_MEMORY>::vector BlockContainerT;
typedef typename ShM<unsigned, USE_SHARED_MEMORY>::vector OffsetContainerT;
typedef decltype(boost::irange(0u,0u)) RangeT;

friend std::ostream& operator<< <>(std::ostream &out, const RangeTable &table);
friend std::istream& operator>> <>(std::istream &in, RangeTable &table);

RangeTable() {}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can probably omit this c'tor or delete it if not needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That one is actually needed, because otherwise you can't use the stream operators. (And I won't get the default constructor because I already defined constructors)


// for loading from shared memory
explicit RangeTable(OffsetContainerT& external_offsets, BlockContainerT& external_blocks, const unsigned sum_lengths)
: sum_lengths(sum_lengths)
{
block_offsets.swap(external_offsets);
diff_blocks.swap(external_blocks);
}

// construct table from length vector
explicit RangeTable(const std::vector<unsigned>& lengths)
{
const unsigned number_of_blocks = [&lengths]() {
unsigned num = (lengths.size() + 1) / (BLOCK_SIZE + 1);
if ((lengths.size() + 1) % (BLOCK_SIZE + 1) != 0)
{
num += 1;
}
return num;
}();

block_offsets.reserve(number_of_blocks);
diff_blocks.reserve(number_of_blocks);

unsigned last_length = 0;
unsigned lengths_prefix_sum = 0;
unsigned block_idx = 0;
unsigned block_counter = 0;
BlockT block;
unsigned block_sum = 0;
for (const unsigned l : lengths)
{
// first entry of a block: encode absolute offset
if (block_idx == 0)
{
block_offsets.push_back(lengths_prefix_sum);
block_sum = 0;
}
else
{
block[block_idx - 1] = last_length;
block_sum += last_length;
}

BOOST_ASSERT((block_idx == 0 && block_offsets[block_counter] == lengths_prefix_sum)
|| lengths_prefix_sum == (block_offsets[block_counter]+block_sum));

// block is full
if (BLOCK_SIZE == block_idx)
{
diff_blocks.push_back(block);
block_counter++;
}

// we can only store strings with length 255
BOOST_ASSERT(l <= 255);

lengths_prefix_sum += l;
last_length = l;

block_idx = (block_idx + 1) % (BLOCK_SIZE + 1);
}

// Last block can't be finished because we didn't add the sentinel
BOOST_ASSERT (block_counter == (number_of_blocks - 1));

// one block missing: starts with guard value
if (0 == block_idx)
{
// the last value is used as sentinel
block_offsets.push_back(lengths_prefix_sum);
block_idx = (block_idx + 1) % BLOCK_SIZE;
}

while (0 != block_idx)
{
block[block_idx - 1] = last_length;
last_length = 0;
block_idx = (block_idx + 1) % (BLOCK_SIZE + 1);
}
diff_blocks.push_back(block);

BOOST_ASSERT(diff_blocks.size() == number_of_blocks && block_offsets.size() == number_of_blocks);

sum_lengths = lengths_prefix_sum;
}

inline RangeT GetRange(const unsigned id) const
{
BOOST_ASSERT(id < block_offsets.size() + diff_blocks.size() * BLOCK_SIZE);
// internal_idx 0 is implicitly stored in block_offsets[block_idx]
const unsigned internal_idx = id % (BLOCK_SIZE + 1);
const unsigned block_idx = id / (BLOCK_SIZE + 1);

BOOST_ASSERT(block_idx < diff_blocks.size());

unsigned begin_idx = 0;
unsigned end_idx = 0;
begin_idx = block_offsets[block_idx];
const BlockT& block = diff_blocks[block_idx];
if (internal_idx > 0)
{
begin_idx += PrefixSumAtIndex(internal_idx - 1, block);
}

// next index inside current block
if (internal_idx < BLOCK_SIZE)
{
// note internal_idx - 1 is the *current* index for uint8_blocks
end_idx = begin_idx + block[internal_idx];
}
else
{
BOOST_ASSERT(block_idx < block_offsets.size() - 1);
end_idx = block_offsets[block_idx + 1];
}

BOOST_ASSERT(begin_idx < sum_lengths && end_idx <= sum_lengths);
BOOST_ASSERT(begin_idx <= end_idx);

return boost::irange(begin_idx, end_idx);
}
private:

inline unsigned PrefixSumAtIndex(int index, const BlockT& block) const;

// contains offset for each differential block
OffsetContainerT block_offsets;
// blocks of differential encoded offsets, should be aligned
BlockContainerT diff_blocks;
unsigned sum_lengths;
};

template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
unsigned RangeTable<BLOCK_SIZE, USE_SHARED_MEMORY>::PrefixSumAtIndex(int index, const BlockT& block) const
{
// this loop looks inefficent, but a modern compiler
// will emit nice SIMD here, at least for sensible block sizes. (I checked.)
unsigned sum = 0;
for (int i = 0; i <= index; ++i)
{
sum += block[i];
}

return sum;
}

template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
std::ostream& operator<<(std::ostream &out, const RangeTable<BLOCK_SIZE, USE_SHARED_MEMORY> &table)
{
// write number of block
const unsigned number_of_blocks = table.diff_blocks.size();
out.write((char *) &number_of_blocks, sizeof(unsigned));
// write total length
out.write((char *) &table.sum_lengths, sizeof(unsigned));
// write block offsets
out.write((char *) table.block_offsets.data(), sizeof(unsigned) * table.block_offsets.size());
// write blocks
out.write((char *) table.diff_blocks.data(), BLOCK_SIZE * table.diff_blocks.size());

return out;
}

template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
std::istream& operator>>(std::istream &in, RangeTable<BLOCK_SIZE, USE_SHARED_MEMORY> &table)
{
// read number of block
unsigned number_of_blocks;
in.read((char *) &number_of_blocks, sizeof(unsigned));
// read total length
in.read((char *) &table.sum_lengths, sizeof(unsigned));

table.block_offsets.resize(number_of_blocks);
table.diff_blocks.resize(number_of_blocks);

// read block offsets
in.read((char *) table.block_offsets.data(), sizeof(unsigned) * number_of_blocks);
// read blocks
in.read((char *) table.diff_blocks.data(), BLOCK_SIZE * number_of_blocks);
return in;
}

#endif
2 changes: 2 additions & 0 deletions DataStructures/StaticRTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,9 @@ class StaticRTree
}
const uint64_t seek_pos = sizeof(uint64_t) + leaf_id * sizeof(LeafNode);
thread_local_rtree_stream->seekg(seek_pos);
BOOST_ASSERT_MSG(thread_local_rtree_stream->good(), "Seeking to position in leaf file failed.");
thread_local_rtree_stream->read((char *)&result_node, sizeof(LeafNode));
BOOST_ASSERT_MSG(thread_local_rtree_stream->good(), "Reading from leaf file failed.");
}

inline bool EdgesAreEquivalent(const FixedPointCoordinate &a,
Expand Down
31 changes: 12 additions & 19 deletions Extractor/ExtractionContainers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "../Util/OSRMException.h"
#include "../Util/SimpleLogger.h"
#include "../Util/TimingUtil.h"
#include "../DataStructures/RangeTable.h"

#include <boost/assert.hpp>
#include <boost/filesystem.hpp>
Expand Down Expand Up @@ -64,6 +65,7 @@ void ExtractionContainers::PrepareData(const std::string &output_file_name,
{
unsigned number_of_used_nodes = 0;
unsigned number_of_used_edges = 0;

std::cout << "[extractor] Sorting used nodes ... " << std::flush;
TIMER_START(sorting_used_nodes);
stxxl::sort(used_node_id_list.begin(), used_node_id_list.end(), Cmp(), stxxl_memory);
Expand Down Expand Up @@ -395,32 +397,23 @@ void ExtractionContainers::PrepareData(const std::string &output_file_name,
std::string name_file_streamName = (output_file_name + ".names");
boost::filesystem::ofstream name_file_stream(name_file_streamName, std::ios::binary);

// write number of names
const unsigned number_of_names = name_list.size() + 1;
name_file_stream.write((char *)&(number_of_names), sizeof(unsigned));

// compute total number of chars
unsigned total_number_of_chars = 0;
for (const std::string &temp_string : name_list)
{
total_number_of_chars += temp_string.length();
}
// write total number of chars
name_file_stream.write((char *)&(total_number_of_chars), sizeof(unsigned));
// write prefixe sums
unsigned name_lengths_prefix_sum = 0;
unsigned total_length = 0;
std::vector<unsigned> name_lengths;
for (const std::string &temp_string : name_list)
{
name_file_stream.write((char *)&(name_lengths_prefix_sum), sizeof(unsigned));
name_lengths_prefix_sum += temp_string.length();
const unsigned string_length = std::min(static_cast<unsigned>(temp_string.length()), 255u);
name_lengths.push_back(string_length);
total_length += string_length;
}
// duplicate on purpose!
name_file_stream.write((char *)&(name_lengths_prefix_sum), sizeof(unsigned));

RangeTable<> table(name_lengths);
name_file_stream << table;

name_file_stream.write((char*) &total_length, sizeof(unsigned));
// write all chars consecutively
for (const std::string &temp_string : name_list)
{
const unsigned string_length = temp_string.length();
const unsigned string_length = std::min(static_cast<unsigned>(temp_string.length()), 255u);
name_file_stream.write(temp_string.c_str(), string_length);
}

Expand Down
31 changes: 13 additions & 18 deletions Server/DataStructures/InternalDataFacade.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "../../DataStructures/SharedMemoryVectorWrapper.h"
#include "../../DataStructures/StaticGraph.h"
#include "../../DataStructures/StaticRTree.h"
#include "../../DataStructures/RangeTable.h"
#include "../../Util/BoostFileSystemFix.h"
#include "../../Util/GraphLoader.h"
#include "../../Util/ProgramOptions.h"
Expand Down Expand Up @@ -66,13 +67,13 @@ template <class EdgeDataT> class InternalDataFacade : public BaseDataFacade<Edge
ShM<unsigned, false>::vector m_name_ID_list;
ShM<TurnInstruction, false>::vector m_turn_instruction_list;
ShM<char, false>::vector m_names_char_list;
ShM<unsigned, false>::vector m_name_begin_indices;
ShM<bool, false>::vector m_egde_is_compressed;
ShM<unsigned, false>::vector m_geometry_indices;
ShM<unsigned, false>::vector m_geometry_list;

std::shared_ptr<StaticRTree<RTreeLeaf, ShM<FixedPointCoordinate, false>::vector, false>>
m_static_rtree;
RangeTable<16, false> m_name_table;

void LoadTimestamp(const boost::filesystem::path &timestamp_path)
{
Expand Down Expand Up @@ -203,16 +204,12 @@ template <class EdgeDataT> class InternalDataFacade : public BaseDataFacade<Edge
void LoadStreetNames(const boost::filesystem::path &names_file)
{
boost::filesystem::ifstream name_stream(names_file, std::ios::binary);
unsigned number_of_names = 0;

name_stream >> m_name_table;

unsigned number_of_chars = 0;
name_stream.read((char *)&number_of_names, sizeof(unsigned));
name_stream.read((char *)&number_of_chars, sizeof(unsigned));
BOOST_ASSERT_MSG(0 != number_of_names, "name file broken");
BOOST_ASSERT_MSG(0 != number_of_chars, "name file broken");

m_name_begin_indices.resize(number_of_names);
name_stream.read((char *)&m_name_begin_indices[0], number_of_names * sizeof(unsigned));

m_names_char_list.resize(number_of_chars + 1); //+1 gives sentinel element
name_stream.read((char *)&m_names_char_list[0], number_of_chars * sizeof(char));
BOOST_ASSERT_MSG(0 != m_names_char_list.size(), "could not load any names");
Expand Down Expand Up @@ -384,18 +381,16 @@ template <class EdgeDataT> class InternalDataFacade : public BaseDataFacade<Edge
result = "";
return;
}
BOOST_ASSERT_MSG(name_id < m_name_begin_indices.size(), "name id too high");
const unsigned begin_index = m_name_begin_indices[name_id];
const unsigned end_index = m_name_begin_indices[name_id + 1];
BOOST_ASSERT_MSG(begin_index < m_names_char_list.size(), "begin index of name too high");
BOOST_ASSERT_MSG(end_index < m_names_char_list.size(), "end index of name too high");
auto range = m_name_table.GetRange(name_id);

BOOST_ASSERT_MSG(begin_index <= end_index, "string ends before begin");
result.clear();
result.resize(end_index - begin_index);
std::copy(m_names_char_list.begin() + begin_index,
m_names_char_list.begin() + end_index,
result.begin());
if (range.begin() != range.end())
{
result.resize(range.back() - range.front() + 1);
std::copy(m_names_char_list.begin() + range.front(),
m_names_char_list.begin() + range.back() + 1,
result.begin());
}
}

virtual unsigned GetGeometryIndexForEdgeID(const unsigned id) const
Expand Down
Loading