Skip to content

Commit 0a47174

Browse files
authored
Allow REGEX expressions with a dynamically computed regular expression (#1889)
Add support for the `REGEX` function in the case where the regular expression is dynamically computed by the query, e.g. `REGEX(?input, ?regex)` or `REGEX(?input, CONCAT("bla", "*"))`. Previously, QLever only supported hardcoded regular expressions like `REGEX(?input, "bla*")`. The implementation uses a simple LRU cache in the `RegexValueGetter`, which optimizes for the case that the regex is stored in a variable with a low multiplicity, for example `REGEX(?input, ?regex)` where `?regex` binds to only a few distinct values. Also clean up the code for the `Regex`, `Replace`, and `Prefix` expressions. Fixes #1821
1 parent 707f5e7 commit 0a47174

14 files changed

+751
-437
lines changed

src/engine/sparqlExpressions/RegexExpression.cpp

+158-188
Large diffs are not rendered by default.

src/engine/sparqlExpressions/RegexExpression.h

+48-49
Original file line numberDiff line numberDiff line change
@@ -4,78 +4,77 @@
44

55
#pragma once
66

7+
#include <gtest/gtest_prod.h>
8+
#include <re2/re2.h>
9+
710
#include <string>
811

9-
#include "engine/sparqlExpressions/LiteralExpression.h"
1012
#include "engine/sparqlExpressions/SparqlExpression.h"
11-
#include "re2/re2.h"
1213

1314
namespace sparqlExpression {
14-
// Class implementing the REGEX function, which takes two mandatory arguments
15-
// (an expression and a regex) and one optional argument (a string of flags).
16-
class RegexExpression : public SparqlExpression {
15+
// Class implementing a specialization of the REGEX function. This optimization
16+
// is possible if the regex is known in advance and is a simple prefix regex.
17+
class PrefixRegexExpression : public SparqlExpression {
1718
private:
18-
SparqlExpression::Ptr child_;
19-
// The regular expression. It needs to be a `std::optional` because `RE2`
20-
// objects do not have a default constructor.
21-
std::optional<RE2> regex_;
22-
// If this `std::optional` holds a string, we have a simple prefix regex
23-
// (which translates to a range search) and this string holds the prefix.
24-
std::optional<std::string> prefixRegex_;
25-
// The regex as a string, used for the cache key.
26-
std::string regexAsString_;
27-
28-
// True iff the expression is enclosed in `STR()`.
19+
Ptr child_;
20+
// The prefix of a simple prefix regex (a prefix regex translates to a range
21+
// search).
22+
std::string prefixRegex_;
23+
// Holds the variable over which the regex is evaluated.
24+
Variable variable_;
25+
// If the variable is wrapped inside a `STR()` function, this is set to true.
2926
bool childIsStrExpression_ = false;
3027

31-
public:
3228
// The `child` must be a `VariableExpression` and `regex` must be a
3329
// `LiteralExpression` that stores a string, otherwise an exception will be
3430
// thrown.
35-
RegexExpression(SparqlExpression::Ptr child, SparqlExpression::Ptr regex,
36-
std::optional<SparqlExpression::Ptr> optionalFlags);
31+
PrefixRegexExpression(Ptr child, std::string prefixRegex, Variable variable);
32+
33+
public:
34+
PrefixRegexExpression(PrefixRegexExpression&&) noexcept = default;
35+
PrefixRegexExpression& operator=(PrefixRegexExpression&&) noexcept = default;
36+
PrefixRegexExpression(const PrefixRegexExpression&) noexcept = delete;
37+
PrefixRegexExpression& operator=(const PrefixRegexExpression&) noexcept =
38+
delete;
39+
40+
// Check if the children of this expression allow for the prefix regex
41+
// optimization. If this is the case, a `PrefixRegexExpression` is returned,
42+
// otherwise `std::nullopt`.
43+
static std::optional<PrefixRegexExpression>
44+
makePrefixRegexExpressionIfPossible(Ptr& string,
45+
const SparqlExpression& regex);
3746

47+
// ___________________________________________________________________________
3848
ExpressionResult evaluate(EvaluationContext* context) const override;
3949

40-
// _________________________________________________________________________
41-
[[nodiscard]] string getCacheKey(
50+
// ___________________________________________________________________________
51+
[[nodiscard]] std::string getCacheKey(
4252
const VariableToColumnMap& varColMap) const override;
4353

44-
// _________________________________________________________________________
45-
[[nodiscard]] bool isPrefixExpression() const;
46-
47-
// _________________________________________________________________________
54+
// ___________________________________________________________________________
4855
Estimates getEstimatesForFilterExpression(
4956
uint64_t inputSize,
5057
const std::optional<Variable>& firstSortedVariable) const override;
5158

5259
private:
53-
std::span<SparqlExpression::Ptr> childrenImpl() override;
54-
55-
// Evaluate for the special case, where the expression is a variable and we
56-
// have a simple prefix regex (in which case the regex match translates to a
57-
// simple range check).
58-
ExpressionResult evaluatePrefixRegex(
59-
const Variable& variable,
60-
sparqlExpression::EvaluationContext* context) const;
61-
62-
// Evaluate for the general case.
63-
CPP_template(typename T)(requires SingleExpressionResult<T>) ExpressionResult
64-
evaluateGeneralCase(T&& input,
65-
sparqlExpression::EvaluationContext* context) const;
60+
std::span<Ptr> childrenImpl() override;
6661

6762
// Check if the `CancellationHandle` of `context` has been cancelled and throw
6863
// an exception if this is the case.
69-
static void checkCancellation(
70-
const sparqlExpression::EvaluationContext* context,
71-
ad_utility::source_location location =
72-
ad_utility::source_location::current());
64+
static void checkCancellation(const EvaluationContext* context,
65+
ad_utility::source_location location =
66+
ad_utility::source_location::current());
67+
68+
// Check if `regex` is a prefix regex which means that it starts with `^` and
69+
// contains no other "special" regex characters like `*` or `.`. If this check
70+
// succeeds, the prefix is returned without the leading `^` and with all
71+
// escaping undone. Else, `std::nullopt` is returned.
72+
static std::optional<std::string> getPrefixRegex(std::string regex);
73+
74+
FRIEND_TEST(RegexExpression, getPrefixRegex);
7375
};
74-
namespace detail {
75-
// Check if `regex` is a prefix regex which means that it starts with `^` and
76-
// contains no other "special" regex characters like `*` or `.`. If this check
77-
// succeeds, the prefix is returned without the leading `^` and with all
78-
// escaping undone. Else, `std::nullopt` is returned.
79-
std::optional<std::string> getPrefixRegex(std::string regex);
80-
} // namespace detail
76+
77+
SparqlExpression::Ptr makeRegexExpression(SparqlExpression::Ptr string,
78+
SparqlExpression::Ptr regex,
79+
SparqlExpression::Ptr flags);
8180
} // namespace sparqlExpression

src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ LiteralValueGetterWithoutStrFunction::operator()(
124124
std::optional<std::string> ReplacementStringGetter::operator()(
125125
Id id, const EvaluationContext* context) const {
126126
std::optional<std::string> originalString =
127-
StringValueGetter::operator()(id, context);
127+
LiteralFromIdGetter{}(id, context);
128128
if (!originalString.has_value()) {
129129
return originalString;
130130
}

src/engine/sparqlExpressions/SparqlExpressionValueGetters.h

+28-24
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@
99
#include <re2/re2.h>
1010

1111
#include "engine/ExportQueryExecutionTrees.h"
12-
#include "engine/Result.h"
1312
#include "engine/sparqlExpressions/SparqlExpressionTypes.h"
1413
#include "global/Id.h"
1514
#include "parser/GeoPoint.h"
1615
#include "util/ConstexprSmallString.h"
16+
#include "util/LruCache.h"
1717
#include "util/TypeTraits.h"
1818

1919
/// Several classes that can be used as the `ValueGetter` template
@@ -144,22 +144,6 @@ struct StringValueGetter : Mixin<StringValueGetter> {
144144
return std::string(asStringViewUnsafe(s.getContent()));
145145
}
146146
};
147-
// Similar to `StringValueGetter`, but correctly preprocesses strings so that
148-
// they can be used by re2 as replacement strings. So '$1 \abc \$' becomes
149-
// '\1 \\abc $', where the former variant is valid in the SPARQL standard and
150-
// the latter represents the format that re2 expects.
151-
struct ReplacementStringGetter : StringValueGetter,
152-
Mixin<ReplacementStringGetter> {
153-
using Mixin<ReplacementStringGetter>::operator();
154-
std::optional<std::string> operator()(ValueId,
155-
const EvaluationContext*) const;
156-
157-
std::optional<std::string> operator()(const LiteralOrIri& s,
158-
const EvaluationContext*) const;
159-
160-
private:
161-
static std::string convertToReplacementString(std::string_view view);
162-
};
163147

164148
// This class can be used as the `ValueGetter` argument of Expression
165149
// templates. It implicitly applies the STR() function. In particular,
@@ -295,19 +279,39 @@ struct LiteralFromIdGetter : Mixin<LiteralFromIdGetter> {
295279
}
296280
};
297281

298-
// Convert the input into a `unique_ptr<RE2>`. Return nullptr if the input is
282+
// Similar to `LiteralFromIdGetter`, but correctly preprocesses strings so that
283+
// they can be used by re2 as replacement strings. So '$1 \abc \$' becomes
284+
// '\1 \\abc $', where the former variant is valid in the SPARQL standard and
285+
// the latter represents the format that re2 expects.
286+
struct ReplacementStringGetter : Mixin<ReplacementStringGetter> {
287+
using Mixin<ReplacementStringGetter>::operator();
288+
std::optional<std::string> operator()(ValueId,
289+
const EvaluationContext*) const;
290+
291+
std::optional<std::string> operator()(const LiteralOrIri& s,
292+
const EvaluationContext*) const;
293+
294+
private:
295+
static std::string convertToReplacementString(std::string_view view);
296+
};
297+
298+
// Convert the input into a `shared_ptr<RE2>`. Return nullptr if the input is
299299
// not convertible to a string.
300300
struct RegexValueGetter {
301+
mutable ad_utility::util::LRUCache<std::string, std::shared_ptr<RE2>> cache_{
302+
100};
301303
template <typename S>
302-
auto operator()(S&& input, const EvaluationContext* context) const -> CPP_ret(
303-
std::unique_ptr<re2::RE2>)(
304-
requires SingleExpressionResult<S>&&
305-
ranges::invocable<StringValueGetter, S&&, const EvaluationContext*>) {
306-
auto str = StringValueGetter{}(AD_FWD(input), context);
304+
auto operator()(S&& input, const EvaluationContext* context) const
305+
-> CPP_ret(std::shared_ptr<RE2>)(
306+
requires SingleExpressionResult<S>&& ranges::invocable<
307+
LiteralFromIdGetter, S&&, const EvaluationContext*>) {
308+
auto str = LiteralFromIdGetter{}(AD_FWD(input), context);
307309
if (!str.has_value()) {
308310
return nullptr;
309311
}
310-
return std::make_unique<re2::RE2>(str.value(), re2::RE2::Quiet);
312+
return cache_.getOrCompute(str.value(), [](const std::string& value) {
313+
return std::make_shared<RE2>(value, RE2::Quiet);
314+
});
311315
}
312316
};
313317

src/engine/sparqlExpressions/StringExpressions.cpp

+5-69
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "engine/sparqlExpressions/LiteralExpression.h"
88
#include "engine/sparqlExpressions/NaryExpressionImpl.h"
9+
#include "engine/sparqlExpressions/StringExpressionsHelper.h"
910
#include "engine/sparqlExpressions/VariadicExpression.h"
1011

1112
namespace sparqlExpression {
@@ -66,70 +67,6 @@ class StrExpression : public StrExpressionImpl {
6667
bool isStrExpression() const override { return true; }
6768
};
6869

69-
// Template for an expression that works on string literals. The arguments are
70-
// the same as those to `NaryExpression` with the difference that the value
71-
// getter is deduced automatically. If the child of the expression is the
72-
// `STR()` expression, then the `StringValueGetter` will be used (which also
73-
// returns string values for IRIs, numeric literals, etc.), otherwise the
74-
// `LiteralFromIdGetter` is used (which returns `std::nullopt` for these cases).
75-
template <typename ValueGetterWithStr, typename ValueGetterWithoutStr, size_t N,
76-
typename Function, typename... AdditionalNonStringValueGetters>
77-
class StringExpressionImplImpl : public SparqlExpression {
78-
private:
79-
using ExpressionWithStr = NARY<
80-
N, FV<Function, ValueGetterWithStr, AdditionalNonStringValueGetters...>>;
81-
using ExpressionWithoutStr = NARY<N, FV<Function, ValueGetterWithoutStr,
82-
AdditionalNonStringValueGetters...>>;
83-
84-
SparqlExpression::Ptr impl_;
85-
86-
public:
87-
CPP_template(typename... C)(
88-
requires(concepts::same_as<C, SparqlExpression::Ptr>&&...) CPP_and(
89-
sizeof...(C) + 1 ==
90-
N)) explicit StringExpressionImplImpl(SparqlExpression::Ptr child,
91-
C... children) {
92-
AD_CORRECTNESS_CHECK(child != nullptr);
93-
if (child->isStrExpression()) {
94-
auto childrenOfStr = std::move(*child).moveChildrenOut();
95-
AD_CORRECTNESS_CHECK(childrenOfStr.size() == 1);
96-
impl_ = std::make_unique<ExpressionWithStr>(
97-
std::move(childrenOfStr.at(0)), std::move(children)...);
98-
} else {
99-
impl_ = std::make_unique<ExpressionWithoutStr>(std::move(child),
100-
std::move(children)...);
101-
}
102-
}
103-
104-
ExpressionResult evaluate(EvaluationContext* context) const override {
105-
return impl_->evaluate(context);
106-
}
107-
std::string getCacheKey(const VariableToColumnMap& varColMap) const override {
108-
return impl_->getCacheKey(varColMap);
109-
}
110-
111-
private:
112-
std::span<SparqlExpression::Ptr> childrenImpl() override {
113-
return impl_->children();
114-
}
115-
};
116-
117-
// Impl class for expressions that work on plain strings.
118-
template <size_t N, typename Function,
119-
typename... AdditionalNonStringValueGetters>
120-
using StringExpressionImpl =
121-
StringExpressionImplImpl<StringValueGetter, LiteralFromIdGetter, N,
122-
Function, AdditionalNonStringValueGetters...>;
123-
124-
// Impl class for expressions that work on literals with datatypes and language
125-
// tags.
126-
template <size_t N, typename Function,
127-
typename... AdditionalNonStringValueGetters>
128-
using LiteralExpressionImpl =
129-
StringExpressionImplImpl<LiteralValueGetterWithStrFunction,
130-
LiteralValueGetterWithoutStrFunction, N, Function,
131-
AdditionalNonStringValueGetters...>;
132-
13370
// Lift a `Function` that takes one or multiple `std::string`s (possibly via
13471
// references) and returns an `Id` or `std::string` to a function that takes the
13572
// same number of `std::optional<std::string>` and returns `Id` or
@@ -409,7 +346,7 @@ using StrBeforeExpression =
409346
if (!flags.has_value() || !regex.has_value()) {
410347
return Id::makeUndefined();
411348
}
412-
auto firstInvalidFlag = flags.value().find_first_not_of("imsu");
349+
auto firstInvalidFlag = flags.value().find_first_not_of("imsU");
413350
if (firstInvalidFlag != std::string::npos) {
414351
return Id::makeUndefined();
415352
}
@@ -423,11 +360,10 @@ using StrBeforeExpression =
423360
};
424361

425362
using MergeRegexPatternAndFlagsExpression =
426-
StringExpressionImpl<2, decltype(mergeFlagsIntoRegex), StringValueGetter>;
363+
StringExpressionImpl<2, decltype(mergeFlagsIntoRegex), LiteralFromIdGetter>;
427364

428365
[[maybe_unused]] auto replaceImpl =
429-
[](std::optional<std::string> input,
430-
const std::unique_ptr<re2::RE2>& pattern,
366+
[](std::optional<std::string> input, const std::shared_ptr<RE2>& pattern,
431367
const std::optional<std::string>& replacement) -> IdOrLiteralOrIri {
432368
if (!input.has_value() || !pattern || !replacement.has_value()) {
433369
return Id::makeUndefined();
@@ -439,7 +375,7 @@ using MergeRegexPatternAndFlagsExpression =
439375
return Id::makeUndefined();
440376
}
441377
const auto& repl = replacement.value();
442-
re2::RE2::GlobalReplace(&in, pat, repl);
378+
RE2::GlobalReplace(&in, pat, repl);
443379
return toLiteral(in);
444380
};
445381

0 commit comments

Comments
 (0)