Skip to content

Commit

Permalink
Add IntervalsSource for range and regexp queries (#13569)
Browse files Browse the repository at this point in the history
We already have convenient functions for constructing an IntervalsSource
for wildcard and fuzzy queries. This adds functions for
regexp and range queries as well.

Backport for PR #13562
  • Loading branch information
mayya-sharipova authored Jul 15, 2024
1 parent 26f2ad7 commit c2327f5
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 0 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ Improvements

* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)

* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
for regexp and range queries. (Mayya Sharipova)

Optimizations
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,15 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

/**
* Factory functions for creating {@link IntervalsSource interval sources}.
Expand Down Expand Up @@ -208,6 +212,90 @@ public static IntervalsSource wildcard(BytesRef wildcard, int maxExpansions) {
return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
}

/**
 * Builds an {@link IntervalsSource} over the disjunction of every term that matches the given
 * regular expression, using {@link #DEFAULT_MAX_EXPANSIONS} as the expansion limit.
 *
 * @param regexp the regular expression to match terms against
 * @throws IllegalStateException if the regular expression expands to more than {@link
 *     #DEFAULT_MAX_EXPANSIONS} terms
 * @see RegexpQuery for the regular expression format
 */
public static IntervalsSource regexp(BytesRef regexp) {
  // Delegate to the expert overload with the default expansion limit.
  final int expansionLimit = DEFAULT_MAX_EXPANSIONS;
  return regexp(regexp, expansionLimit);
}

/**
 * Expert: builds an {@link IntervalsSource} over the disjunction of every term that matches the
 * given regular expression, with a caller-supplied expansion limit.
 *
 * <p>WARNING: Setting {@code maxExpansions} higher than the default value of {@link
 * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive.
 *
 * @param regexp the regular expression to match terms against
 * @param maxExpansions the maximum number of terms to expand to
 * @throws IllegalStateException if the regular expression expands to more than {@code
 *     maxExpansions} terms
 * @see RegexpQuery for the regular expression format
 */
public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) {
  // Term#text() supplies the pattern string that is handed to the RegExp parser.
  String pattern = new Term("", regexp).text();
  Automaton regexpAutomaton = new RegExp(pattern).toAutomaton();
  CompiledAutomaton compiled = new CompiledAutomaton(regexpAutomaton, false, true);

  // The pattern itself serves as the label of the resulting source.
  String label = regexp.utf8ToString();
  return new MultiTermIntervalsSource(compiled, maxExpansions, label);
}

/**
 * Builds an {@link IntervalsSource} over the disjunction of every term that falls within the
 * given range, using {@link #DEFAULT_MAX_EXPANSIONS} as the expansion limit.
 *
 * @param lowerTerm the term text at the lower end of the range
 * @param upperTerm the term text at the upper end of the range
 * @param includeLower if true, {@code lowerTerm} is included in the range
 * @param includeUpper if true, {@code upperTerm} is included in the range
 * @throws IllegalStateException if the range expands to more than {@link
 *     #DEFAULT_MAX_EXPANSIONS} terms
 */
public static IntervalsSource range(
    BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
  // Delegate to the expert overload with the default expansion limit.
  final int expansionLimit = DEFAULT_MAX_EXPANSIONS;
  return range(lowerTerm, upperTerm, includeLower, includeUpper, expansionLimit);
}

/**
 * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that fall within
 * the given range
 *
 * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
 * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
 *
 * @param lowerTerm The term text at the lower end of the range, or {@code null} for a range with
 *     no lower bound (as accepted by {@link TermRangeQuery#toAutomaton})
 * @param upperTerm The term text at the upper end of the range, or {@code null} for a range with
 *     no upper bound (as accepted by {@link TermRangeQuery#toAutomaton})
 * @param includeLower If true, the <code>lowerTerm</code> is included in the range
 * @param includeUpper If true, the <code>upperTerm</code> is included in the range
 * @param maxExpansions the maximum number of terms to expand to
 * @throws IllegalStateException if the range expands to more than {@code maxExpansions} terms
 */
public static IntervalsSource range(
    BytesRef lowerTerm,
    BytesRef upperTerm,
    boolean includeLower,
    boolean includeUpper,
    int maxExpansions) {
  Automaton automaton =
      TermRangeQuery.toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper);
  CompiledAutomaton ca =
      new CompiledAutomaton(
          automaton, false, true, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, true);

  // The automaton builder tolerates null endpoints, so the label must not dereference them
  // blindly: an open end is rendered as "*". Non-null endpoints keep the "{lower,upper}" form.
  String lowerLabel = lowerTerm == null ? "*" : lowerTerm.utf8ToString();
  String upperLabel = upperTerm == null ? "*" : upperTerm.utf8ToString();
  String label = "{" + lowerLabel + "," + upperLabel + "}";
  return new MultiTermIntervalsSource(ca, maxExpansions, label);
}

/**
* A fuzzy term {@link IntervalsSource} matches the disjunction of intervals of terms that are
* within the specified {@code maxEdits} from the provided term.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,40 @@ public void testWildcard() throws IOException {
checkVisits(Intervals.wildcard(new BytesRef("p??")), 1);
}

public void testRegexp() throws IOException {
  // Expected intervals for the pattern ".ot" on field1, one row per entry checked by
  // checkIntervals (presumably one row per document — see that helper).
  int[][] expectedIntervals = {
    {},
    {2, 2, 10, 10, 17, 17, 27, 27},
    {5, 5, 10, 10, 21, 21},
    {3, 3},
    {2, 2, 10, 10, 17, 17},
    {}
  };
  IntervalsSource source = Intervals.regexp(new BytesRef(".ot"));
  checkIntervals(source, "field1", 4, expectedIntervals);

  // Walk the individual matches reported for entry 4 of field1.
  MatchesIterator matches = getMatches(source, 4, "field1");
  assertNotNull(matches);
  assertMatch(matches, 2, 2, 15, 18);
  assertMatch(matches, 10, 10, 63, 66);
  assertMatch(matches, 17, 17, 97, 100);

  // With an expansion limit of 1 the same pattern must be rejected.
  IllegalStateException tooManyTerms =
      expectThrows(
          IllegalStateException.class,
          () -> {
            IntervalsSource limited = Intervals.regexp(new BytesRef(".ot"), 1);
            for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
              limited.intervals("field1", leaf);
            }
          });
  assertEquals(
      "Automaton [.ot] expanded to too many terms (limit 1)", tooManyTerms.getMessage());

  IntervalsSource visited = Intervals.regexp(new BytesRef("p.*"));
  checkVisits(visited, 1);
}

public void testFuzzyTerm() throws IOException {
IntervalsSource source = Intervals.fuzzyTerm("kot", 1); // matches 'pot'
checkIntervals(
Expand Down Expand Up @@ -1070,6 +1104,41 @@ public void testFuzzyTerm() throws IOException {
checkVisits(Intervals.fuzzyTerm("kot", FuzzyQuery.defaultMaxEdits), 1);
}

public void testRange() throws IOException {
  // Expected intervals for the inclusive range [cold, hot] on field1, one row per entry
  // checked by checkIntervals (presumably one row per document — see that helper).
  int[][] expectedIntervals = {
    {5, 5},
    {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 29, 29},
    {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 27, 27},
    {1, 1, 3, 3, 4, 4},
    {2, 2, 5, 5, 17, 17},
    {2, 2}
  };
  IntervalsSource source =
      Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true);
  checkIntervals(source, "field1", 6, expectedIntervals);

  // Walk the individual matches reported for entry 3 of field1.
  MatchesIterator matches = getMatches(source, 3, "field1");
  assertNotNull(matches);
  assertMatch(matches, 1, 1, 4, 8);
  assertMatch(matches, 3, 3, 15, 18);
  assertMatch(matches, 4, 4, 19, 24);

  // With an expansion limit of 1 the same range must be rejected.
  IllegalStateException tooManyTerms =
      expectThrows(
          IllegalStateException.class,
          () -> {
            IntervalsSource limited =
                Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true, 1);
            for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
              limited.intervals("field1", leaf);
            }
          });
  assertEquals(
      "Automaton [{cold,hot}] expanded to too many terms (limit 1)", tooManyTerms.getMessage());

  checkVisits(source, 1);
}

public void testWrappedFilters() throws IOException {
IntervalsSource source =
Intervals.or(
Expand Down

0 comments on commit c2327f5

Please sign in to comment.