Skip to content

Commit a25d890

Browse files
committed
add pgm test, update output of all
1 parent 54d8e66 commit a25d890

13 files changed

+14078
-32
lines changed

include/pgm/morton_nd.hpp

+574
Large diffs are not rendered by default.

include/pgm/pgm_index.hpp

+262
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
// This file is part of PGM-index <https://github.com/gvinciguerra/PGM-index>.
2+
// Copyright (c) 2018 Giorgio Vinciguerra.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
16+
#pragma once
17+
18+
#include "piecewise_linear_model.hpp"
19+
#include <algorithm>
20+
#include <cstddef>
21+
#include <cstdint>
22+
#include <iterator>
23+
#include <limits>
24+
#include <stdexcept>
25+
#include <utility>
26+
#include <vector>
27+
28+
namespace pgm {
29+
30+
#define PGM_SUB_EPS(x, epsilon) ((x) <= (epsilon) ? 0 : ((x) - (epsilon)))
31+
#define PGM_ADD_EPS(x, epsilon, size) ((x) + (epsilon) + 2 >= (size) ? (size) : (x) + (epsilon) + 2)
32+
33+
/**
34+
* A struct that stores the result of a query to a @ref PGMIndex, that is, a range [@ref lo, @ref hi)
35+
* centered around an approximate position @ref pos of the sought key.
36+
*/
37+
struct ApproxPos {
38+
size_t pos; ///< The approximate position of the key.
39+
size_t lo; ///< The lower bound of the range.
40+
size_t hi; ///< The upper bound of the range.
41+
};
42+
43+
/**
44+
* A space-efficient index that enables fast search operations on a sorted sequence of @c n numbers.
45+
*
46+
* A search returns a struct @ref ApproxPos containing an approximate position of the sought key in the sequence and
47+
* the bounds of a range of size 2*Epsilon+1 where the sought key is guaranteed to be found if present.
48+
* If the key is not present, the range is guaranteed to contain a key that is not less than (i.e. greater or equal to)
49+
* the sought key, or @c n if no such key is found.
50+
* In the case of repeated keys, the index finds the position of the first occurrence of a key.
51+
*
52+
* The @p Epsilon template parameter should be set according to the desired space-time trade-off. A smaller value
53+
* makes the estimation more precise and the range smaller but at the cost of increased space usage.
54+
*
55+
* Internally the index uses a succinct piecewise linear mapping from keys to their position in the sorted order.
56+
* This mapping is represented as a sequence of linear models (segments) which, if @p EpsilonRecursive is not zero, are
57+
* themselves recursively indexed by other piecewise linear mappings.
58+
*
59+
* @tparam K the type of the indexed keys
60+
* @tparam Epsilon controls the size of the returned search range
61+
* @tparam EpsilonRecursive controls the size of the search range in the internal structure
62+
* @tparam Floating the floating-point type to use for slopes
63+
*/
64+
template<typename K, size_t Epsilon = 64, size_t EpsilonRecursive = 4, typename Floating = float>
65+
class PGMIndex {
66+
protected:
67+
template<typename, size_t, size_t, uint8_t, typename>
68+
friend class BucketingPGMIndex;
69+
70+
template<typename, size_t, typename>
71+
friend class EliasFanoPGMIndex;
72+
73+
static_assert(Epsilon > 0);
74+
struct Segment;
75+
76+
size_t n; ///< The number of elements this index was built on.
77+
K first_key; ///< The smallest element.
78+
std::vector<Segment> segments; ///< The segments composing the index.
79+
std::vector<size_t> levels_offsets; ///< The starting position of each level in segments[], in reverse order.
80+
81+
template<typename RandomIt>
82+
static void build(RandomIt first, RandomIt last,
83+
size_t epsilon, size_t epsilon_recursive,
84+
std::vector<Segment> &segments,
85+
std::vector<size_t> &levels_offsets) {
86+
auto n = (size_t) std::distance(first, last);
87+
if (n == 0)
88+
return;
89+
90+
levels_offsets.push_back(0);
91+
segments.reserve(n / (epsilon * epsilon));
92+
93+
auto ignore_last = *std::prev(last) == std::numeric_limits<K>::max(); // max() is the sentinel value
94+
auto last_n = n - ignore_last;
95+
last -= ignore_last;
96+
97+
auto build_level = [&](auto epsilon, auto in_fun, auto out_fun) {
98+
auto n_segments = internal::make_segmentation_par(last_n, epsilon, in_fun, out_fun);
99+
if (last_n > 1 && segments.back().slope == 0) {
100+
// Here we need to ensure that keys > *(last-1) are approximated to a position == prev_level_size
101+
segments.emplace_back(*std::prev(last) + 1, 0, last_n);
102+
++n_segments;
103+
}
104+
segments.emplace_back(last_n); // Add the sentinel segment
105+
return n_segments;
106+
};
107+
108+
// Build first level
109+
auto in_fun = [&](auto i) {
110+
auto x = first[i];
111+
// Here there is an adjustment for inputs with duplicate keys: at the end of a run of duplicate keys equal
112+
// to x=first[i] such that x+1!=first[i+1], we map the values x+1,...,first[i+1]-1 to their correct rank i
113+
auto flag = i > 0 && i + 1u < n && x == first[i - 1] && x != first[i + 1] && x + 1 != first[i + 1];
114+
return std::pair<K, size_t>(x + flag, i);
115+
};
116+
auto out_fun = [&](auto cs) { segments.emplace_back(cs); };
117+
last_n = build_level(epsilon, in_fun, out_fun);
118+
levels_offsets.push_back(levels_offsets.back() + last_n + 1);
119+
120+
// Build upper levels
121+
while (epsilon_recursive && last_n > 1) {
122+
auto offset = levels_offsets[levels_offsets.size() - 2];
123+
auto in_fun_rec = [&](auto i) { return std::pair<K, size_t>(segments[offset + i].key, i); };
124+
last_n = build_level(epsilon_recursive, in_fun_rec, out_fun);
125+
levels_offsets.push_back(levels_offsets.back() + last_n + 1);
126+
}
127+
}
128+
129+
/**
130+
* Returns the segment responsible for a given key, that is, the rightmost segment having key <= the sought key.
131+
* @param key the value of the element to search for
132+
* @return an iterator to the segment responsible for the given key
133+
*/
134+
auto segment_for_key(const K &key) const {
135+
if constexpr (EpsilonRecursive == 0) {
136+
return std::prev(std::upper_bound(segments.begin(), segments.begin() + segments_count(), key));
137+
}
138+
139+
auto it = segments.begin() + *(levels_offsets.end() - 2);
140+
for (auto l = int(height()) - 2; l >= 0; --l) {
141+
auto level_begin = segments.begin() + levels_offsets[l];
142+
auto pos = std::min<size_t>((*it)(key), std::next(it)->intercept);
143+
auto lo = level_begin + PGM_SUB_EPS(pos, EpsilonRecursive + 1);
144+
145+
static constexpr size_t linear_search_threshold = 8 * 64 / sizeof(Segment);
146+
if constexpr (EpsilonRecursive <= linear_search_threshold) {
147+
for (; std::next(lo)->key <= key; ++lo)
148+
continue;
149+
it = lo;
150+
} else {
151+
auto level_size = levels_offsets[l + 1] - levels_offsets[l] - 1;
152+
auto hi = level_begin + PGM_ADD_EPS(pos, EpsilonRecursive, level_size);
153+
it = std::prev(std::upper_bound(lo, hi, key));
154+
}
155+
}
156+
return it;
157+
}
158+
159+
public:
160+
161+
static constexpr size_t epsilon_value = Epsilon;
162+
163+
/**
164+
* Constructs an empty index.
165+
*/
166+
PGMIndex() = default;
167+
168+
/**
169+
* Constructs the index on the given sorted vector.
170+
* @param data the vector of keys to be indexed, must be sorted
171+
*/
172+
explicit PGMIndex(const std::vector<K> &data) : PGMIndex(data.begin(), data.end()) {}
173+
174+
/**
175+
* Constructs the index on the sorted keys in the range [first, last).
176+
* @param first, last the range containing the sorted keys to be indexed
177+
*/
178+
template<typename RandomIt>
179+
PGMIndex(RandomIt first, RandomIt last)
180+
: n(std::distance(first, last)),
181+
first_key(n ? *first : K(0)),
182+
segments(),
183+
levels_offsets() {
184+
build(first, last, Epsilon, EpsilonRecursive, segments, levels_offsets);
185+
}
186+
187+
/**
188+
* Returns the approximate position and the range where @p key can be found.
189+
* @param key the value of the element to search for
190+
* @return a struct with the approximate position and bounds of the range
191+
*/
192+
ApproxPos search(const K &key) const {
193+
auto k = std::max(first_key, key);
194+
auto it = segment_for_key(k);
195+
auto pos = std::min<size_t>((*it)(k), std::next(it)->intercept);
196+
auto lo = PGM_SUB_EPS(pos, Epsilon);
197+
auto hi = PGM_ADD_EPS(pos, Epsilon, n);
198+
return {pos, lo, hi};
199+
}
200+
201+
/**
202+
* Returns the number of segments in the last level of the index.
203+
* @return the number of segments
204+
*/
205+
size_t segments_count() const { return segments.empty() ? 0 : levels_offsets[1] - 1; }
206+
207+
/**
208+
* Returns the number of levels of the index.
209+
* @return the number of levels of the index
210+
*/
211+
size_t height() const { return levels_offsets.size() - 1; }
212+
213+
/**
214+
* Returns the size of the index in bytes.
215+
* @return the size of the index in bytes
216+
*/
217+
size_t size_in_bytes() const { return segments.size() * sizeof(Segment) + levels_offsets.size() * sizeof(size_t); }
218+
};
219+
220+
#pragma pack(push, 1)
221+
222+
template<typename K, size_t Epsilon, size_t EpsilonRecursive, typename Floating>
223+
struct PGMIndex<K, Epsilon, EpsilonRecursive, Floating>::Segment {
224+
K key; ///< The first key that the segment indexes.
225+
Floating slope; ///< The slope of the segment.
226+
int32_t intercept; ///< The intercept of the segment.
227+
228+
Segment() = default;
229+
230+
Segment(K key, Floating slope, int32_t intercept) : key(key), slope(slope), intercept(intercept) {};
231+
232+
explicit Segment(size_t n) : key(std::numeric_limits<K>::max()), slope(), intercept(n) {};
233+
234+
explicit Segment(const typename internal::OptimalPiecewiseLinearModel<K, size_t>::CanonicalSegment &cs)
235+
: key(cs.get_first_x()) {
236+
auto[cs_slope, cs_intercept] = cs.get_floating_point_segment(key);
237+
if (cs_intercept > std::numeric_limits<decltype(intercept)>::max())
238+
throw std::overflow_error("Change the type of Segment::intercept to int64");
239+
slope = cs_slope;
240+
intercept = cs_intercept;
241+
}
242+
243+
friend inline bool operator<(const Segment &s, const K &k) { return s.key < k; }
244+
friend inline bool operator<(const K &k, const Segment &s) { return k < s.key; }
245+
friend inline bool operator<(const Segment &s, const Segment &t) { return s.key < t.key; }
246+
247+
operator K() { return key; };
248+
249+
/**
250+
* Returns the approximate position of the specified key.
251+
* @param k the key whose position must be approximated
252+
* @return the approximate position of the specified key
253+
*/
254+
inline size_t operator()(const K &k) const {
255+
auto pos = int64_t(slope * (k - key)) + intercept;
256+
return pos > 0 ? size_t(pos) : 0ull;
257+
}
258+
};
259+
260+
#pragma pack(pop)
261+
262+
}

0 commit comments

Comments
 (0)