1
+ // This file is part of PGM-index <https://github.com/gvinciguerra/PGM-index>.
2
+ // Copyright (c) 2018 Giorgio Vinciguerra.
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+
16
+ #pragma once
17
+
18
+ #include " piecewise_linear_model.hpp"
19
+ #include < algorithm>
20
+ #include < cstddef>
21
+ #include < cstdint>
22
+ #include < iterator>
23
+ #include < limits>
24
+ #include < stdexcept>
25
+ #include < utility>
26
+ #include < vector>
27
+
28
+ namespace pgm {
29
+
30
+ #define PGM_SUB_EPS (x, epsilon ) ((x) <= (epsilon) ? 0 : ((x) - (epsilon)))
31
+ #define PGM_ADD_EPS (x, epsilon, size ) ((x) + (epsilon) + 2 >= (size) ? (size) : (x) + (epsilon) + 2 )
32
+
33
+ /* *
34
+ * A struct that stores the result of a query to a @ref PGMIndex, that is, a range [@ref lo, @ref hi)
35
+ * centered around an approximate position @ref pos of the sought key.
36
+ */
37
+ struct ApproxPos {
38
+ size_t pos; // /< The approximate position of the key.
39
+ size_t lo; // /< The lower bound of the range.
40
+ size_t hi; // /< The upper bound of the range.
41
+ };
42
+
43
+ /* *
44
+ * A space-efficient index that enables fast search operations on a sorted sequence of @c n numbers.
45
+ *
46
+ * A search returns a struct @ref ApproxPos containing an approximate position of the sought key in the sequence and
47
+ * the bounds of a range of size 2*Epsilon+1 where the sought key is guaranteed to be found if present.
48
+ * If the key is not present, the range is guaranteed to contain a key that is not less than (i.e. greater or equal to)
49
+ * the sought key, or @c n if no such key is found.
50
+ * In the case of repeated keys, the index finds the position of the first occurrence of a key.
51
+ *
52
+ * The @p Epsilon template parameter should be set according to the desired space-time trade-off. A smaller value
53
+ * makes the estimation more precise and the range smaller but at the cost of increased space usage.
54
+ *
55
+ * Internally the index uses a succinct piecewise linear mapping from keys to their position in the sorted order.
56
+ * This mapping is represented as a sequence of linear models (segments) which, if @p EpsilonRecursive is not zero, are
57
+ * themselves recursively indexed by other piecewise linear mappings.
58
+ *
59
+ * @tparam K the type of the indexed keys
60
+ * @tparam Epsilon controls the size of the returned search range
61
+ * @tparam EpsilonRecursive controls the size of the search range in the internal structure
62
+ * @tparam Floating the floating-point type to use for slopes
63
+ */
64
+ template <typename K, size_t Epsilon = 64 , size_t EpsilonRecursive = 4 , typename Floating = float >
65
+ class PGMIndex {
66
+ protected:
67
+ template <typename , size_t , size_t , uint8_t , typename >
68
+ friend class BucketingPGMIndex ;
69
+
70
+ template <typename , size_t , typename >
71
+ friend class EliasFanoPGMIndex ;
72
+
73
+ static_assert (Epsilon > 0 );
74
+ struct Segment ;
75
+
76
+ size_t n; // /< The number of elements this index was built on.
77
+ K first_key; // /< The smallest element.
78
+ std::vector<Segment> segments; // /< The segments composing the index.
79
+ std::vector<size_t > levels_offsets; // /< The starting position of each level in segments[], in reverse order.
80
+
81
+ template <typename RandomIt>
82
+ static void build (RandomIt first, RandomIt last,
83
+ size_t epsilon, size_t epsilon_recursive,
84
+ std::vector<Segment> &segments,
85
+ std::vector<size_t > &levels_offsets) {
86
+ auto n = (size_t ) std::distance (first, last);
87
+ if (n == 0 )
88
+ return ;
89
+
90
+ levels_offsets.push_back (0 );
91
+ segments.reserve (n / (epsilon * epsilon));
92
+
93
+ auto ignore_last = *std::prev (last) == std::numeric_limits<K>::max (); // max() is the sentinel value
94
+ auto last_n = n - ignore_last;
95
+ last -= ignore_last;
96
+
97
+ auto build_level = [&](auto epsilon, auto in_fun, auto out_fun) {
98
+ auto n_segments = internal::make_segmentation_par (last_n, epsilon, in_fun, out_fun);
99
+ if (last_n > 1 && segments.back ().slope == 0 ) {
100
+ // Here we need to ensure that keys > *(last-1) are approximated to a position == prev_level_size
101
+ segments.emplace_back (*std::prev (last) + 1 , 0 , last_n);
102
+ ++n_segments;
103
+ }
104
+ segments.emplace_back (last_n); // Add the sentinel segment
105
+ return n_segments;
106
+ };
107
+
108
+ // Build first level
109
+ auto in_fun = [&](auto i) {
110
+ auto x = first[i];
111
+ // Here there is an adjustment for inputs with duplicate keys: at the end of a run of duplicate keys equal
112
+ // to x=first[i] such that x+1!=first[i+1], we map the values x+1,...,first[i+1]-1 to their correct rank i
113
+ auto flag = i > 0 && i + 1u < n && x == first[i - 1 ] && x != first[i + 1 ] && x + 1 != first[i + 1 ];
114
+ return std::pair<K, size_t >(x + flag, i);
115
+ };
116
+ auto out_fun = [&](auto cs) { segments.emplace_back (cs); };
117
+ last_n = build_level (epsilon, in_fun, out_fun);
118
+ levels_offsets.push_back (levels_offsets.back () + last_n + 1 );
119
+
120
+ // Build upper levels
121
+ while (epsilon_recursive && last_n > 1 ) {
122
+ auto offset = levels_offsets[levels_offsets.size () - 2 ];
123
+ auto in_fun_rec = [&](auto i) { return std::pair<K, size_t >(segments[offset + i].key , i); };
124
+ last_n = build_level (epsilon_recursive, in_fun_rec, out_fun);
125
+ levels_offsets.push_back (levels_offsets.back () + last_n + 1 );
126
+ }
127
+ }
128
+
129
+ /* *
130
+ * Returns the segment responsible for a given key, that is, the rightmost segment having key <= the sought key.
131
+ * @param key the value of the element to search for
132
+ * @return an iterator to the segment responsible for the given key
133
+ */
134
+ auto segment_for_key (const K &key) const {
135
+ if constexpr (EpsilonRecursive == 0 ) {
136
+ return std::prev (std::upper_bound (segments.begin (), segments.begin () + segments_count (), key));
137
+ }
138
+
139
+ auto it = segments.begin () + *(levels_offsets.end () - 2 );
140
+ for (auto l = int (height ()) - 2 ; l >= 0 ; --l) {
141
+ auto level_begin = segments.begin () + levels_offsets[l];
142
+ auto pos = std::min<size_t >((*it)(key), std::next (it)->intercept );
143
+ auto lo = level_begin + PGM_SUB_EPS (pos, EpsilonRecursive + 1 );
144
+
145
+ static constexpr size_t linear_search_threshold = 8 * 64 / sizeof (Segment);
146
+ if constexpr (EpsilonRecursive <= linear_search_threshold) {
147
+ for (; std::next (lo)->key <= key; ++lo)
148
+ continue ;
149
+ it = lo;
150
+ } else {
151
+ auto level_size = levels_offsets[l + 1 ] - levels_offsets[l] - 1 ;
152
+ auto hi = level_begin + PGM_ADD_EPS (pos, EpsilonRecursive, level_size);
153
+ it = std::prev (std::upper_bound (lo, hi, key));
154
+ }
155
+ }
156
+ return it;
157
+ }
158
+
159
+ public:
160
+
161
+ static constexpr size_t epsilon_value = Epsilon;
162
+
163
+ /* *
164
+ * Constructs an empty index.
165
+ */
166
+ PGMIndex () = default ;
167
+
168
+ /* *
169
+ * Constructs the index on the given sorted vector.
170
+ * @param data the vector of keys to be indexed, must be sorted
171
+ */
172
+ explicit PGMIndex (const std::vector<K> &data) : PGMIndex(data.begin(), data.end()) {}
173
+
174
+ /* *
175
+ * Constructs the index on the sorted keys in the range [first, last).
176
+ * @param first, last the range containing the sorted keys to be indexed
177
+ */
178
+ template <typename RandomIt>
179
+ PGMIndex (RandomIt first, RandomIt last)
180
+ : n(std::distance(first, last)),
181
+ first_key (n ? *first : K(0 )),
182
+ segments(),
183
+ levels_offsets() {
184
+ build (first, last, Epsilon, EpsilonRecursive, segments, levels_offsets);
185
+ }
186
+
187
+ /* *
188
+ * Returns the approximate position and the range where @p key can be found.
189
+ * @param key the value of the element to search for
190
+ * @return a struct with the approximate position and bounds of the range
191
+ */
192
+ ApproxPos search (const K &key) const {
193
+ auto k = std::max (first_key, key);
194
+ auto it = segment_for_key (k);
195
+ auto pos = std::min<size_t >((*it)(k), std::next (it)->intercept );
196
+ auto lo = PGM_SUB_EPS (pos, Epsilon);
197
+ auto hi = PGM_ADD_EPS (pos, Epsilon, n);
198
+ return {pos, lo, hi};
199
+ }
200
+
201
+ /* *
202
+ * Returns the number of segments in the last level of the index.
203
+ * @return the number of segments
204
+ */
205
+ size_t segments_count () const { return segments.empty () ? 0 : levels_offsets[1 ] - 1 ; }
206
+
207
+ /* *
208
+ * Returns the number of levels of the index.
209
+ * @return the number of levels of the index
210
+ */
211
+ size_t height () const { return levels_offsets.size () - 1 ; }
212
+
213
+ /* *
214
+ * Returns the size of the index in bytes.
215
+ * @return the size of the index in bytes
216
+ */
217
+ size_t size_in_bytes () const { return segments.size () * sizeof (Segment) + levels_offsets.size () * sizeof (size_t ); }
218
+ };
219
+
220
+ #pragma pack(push, 1)
221
+
222
+ template <typename K, size_t Epsilon, size_t EpsilonRecursive, typename Floating>
223
+ struct PGMIndex <K, Epsilon, EpsilonRecursive, Floating>::Segment {
224
+ K key; // /< The first key that the segment indexes.
225
+ Floating slope; // /< The slope of the segment.
226
+ int32_t intercept; // /< The intercept of the segment.
227
+
228
+ Segment () = default ;
229
+
230
+ Segment (K key, Floating slope, int32_t intercept) : key(key), slope(slope), intercept(intercept) {};
231
+
232
+ explicit Segment (size_t n) : key(std::numeric_limits<K>::max()), slope(), intercept(n) {};
233
+
234
+ explicit Segment (const typename internal::OptimalPiecewiseLinearModel<K, size_t >::CanonicalSegment &cs)
235
+ : key(cs.get_first_x()) {
236
+ auto [cs_slope, cs_intercept] = cs.get_floating_point_segment (key);
237
+ if (cs_intercept > std::numeric_limits<decltype (intercept)>::max ())
238
+ throw std::overflow_error (" Change the type of Segment::intercept to int64" );
239
+ slope = cs_slope;
240
+ intercept = cs_intercept;
241
+ }
242
+
243
+ friend inline bool operator <(const Segment &s, const K &k) { return s.key < k; }
244
+ friend inline bool operator <(const K &k, const Segment &s) { return k < s.key ; }
245
+ friend inline bool operator <(const Segment &s, const Segment &t) { return s.key < t.key ; }
246
+
247
+ operator K () { return key; };
248
+
249
+ /* *
250
+ * Returns the approximate position of the specified key.
251
+ * @param k the key whose position must be approximated
252
+ * @return the approximate position of the specified key
253
+ */
254
+ inline size_t operator ()(const K &k) const {
255
+ auto pos = int64_t (slope * (k - key)) + intercept;
256
+ return pos > 0 ? size_t (pos) : 0ull ;
257
+ }
258
+ };
259
+
260
+ #pragma pack(pop)
261
+
262
+ }
0 commit comments