Add IntervalsSource for range and regexp queries (#13569)

We already have convenient functions for constructing an IntervalsSource for wildcard and fuzzy queries. This adds functions for regexp and range queries as well. Backport of PR #13562
apache · Jul 15, 2024 · c2327f5 · c2327f5
1 parent 26f2ad7
commit c2327f5
Show file tree

Hide file tree

Showing 3 changed files with 160 additions and 0 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -21,6 +21,9 @@ Improvements
 
 * GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
 
+* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
+  for regexp and range queries. (Mayya Sharipova)
+
 Optimizations
 ---------------------
 

diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
@@ -27,11 +27,15 @@
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
 
 /**
  * Factory functions for creating {@link IntervalsSource interval sources}.
@@ -208,6 +212,90 @@ public static IntervalsSource wildcard(BytesRef wildcard, int maxExpansions) {
     return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
   }
 
+  /**
+   * Return an {@link IntervalsSource} over the disjunction of all terms that match a regular
+   * expression, using the default expansion limit.
+   *
+   * @param regexp the regular expression, UTF-8 encoded
+   * @throws IllegalStateException if the regex expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
+   *     terms
+   * @see RegexpQuery for regexp format
+   */
+  public static IntervalsSource regexp(BytesRef regexp) {
+    return regexp(regexp, DEFAULT_MAX_EXPANSIONS);
+  }
+
+  /**
+   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a
+   * regular expression, expanding to at most {@code maxExpansions} terms.
+   *
+   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
+   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
+   *
+   * @param regexp the regular expression, UTF-8 encoded
+   * @param maxExpansions the maximum number of terms to expand to
+   * @throws IllegalStateException if the regex expands to more than {@code maxExpansions}
+   *     terms
+   * @see RegexpQuery for regexp format
+   */
+  public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) {
+    Automaton automaton = new RegExp(Term.toString(regexp)).toAutomaton();
+    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true);
+    return new MultiTermIntervalsSource(ca, maxExpansions, regexp.utf8ToString());
+  }
+
+  /**
+   * Return an {@link IntervalsSource} over the disjunction of all terms that fall within the given
+   * range, using the default expansion limit.
+   *
+   * @param lowerTerm The term text at the lower end of the range
+   * @param upperTerm The term text at the upper end of the range
+   * @param includeLower If true, the <code>lowerTerm</code> is included in the range
+   * @param includeUpper If true, the <code>upperTerm</code> is included in the range
+   * @throws IllegalStateException if the range expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
+   *     terms
+   */
+  public static IntervalsSource range(
+      BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
+    return range(lowerTerm, upperTerm, includeLower, includeUpper, DEFAULT_MAX_EXPANSIONS);
+  }
+
+  /**
+   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that fall within
+   * the given range, expanding to at most {@code maxExpansions} terms.
+   *
+   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
+   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
+   *
+   * @param lowerTerm The term text at the lower end of the range; {@code null} means unbounded
+   * @param upperTerm The term text at the upper end of the range; {@code null} means unbounded
+   * @param includeLower If true, the <code>lowerTerm</code> is included in the range
+   * @param includeUpper If true, the <code>upperTerm</code> is included in the range
+   * @param maxExpansions the maximum number of terms to expand to
+   * @throws IllegalStateException if the range expands to more than {@code maxExpansions}
+   *     terms
+   */
+  public static IntervalsSource range(
+      BytesRef lowerTerm,
+      BytesRef upperTerm,
+      boolean includeLower,
+      boolean includeUpper,
+      int maxExpansions) {
+    Automaton automaton =
+        TermRangeQuery.toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper);
+    CompiledAutomaton ca =
+        new CompiledAutomaton(
+            automaton, false, true, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, true);
+    // Label used in error messages; an open (null) bound is rendered as "*".
+    StringBuilder buffer = new StringBuilder();
+    buffer.append("{");
+    buffer.append(lowerTerm == null ? "*" : lowerTerm.utf8ToString());
+    buffer.append(",");
+    buffer.append(upperTerm == null ? "*" : upperTerm.utf8ToString());
+    buffer.append("}");
+    return new MultiTermIntervalsSource(ca, maxExpansions, buffer.toString());
+  }
+
   /**
    * A fuzzy term {@link IntervalsSource} matches the disjunction of intervals of terms that are
    * within the specified {@code maxEdits} from the provided term.

diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
@@ -1029,6 +1029,40 @@ public void testWildcard() throws IOException {
     checkVisits(Intervals.wildcard(new BytesRef("p??")), 1);
   }
 
+  public void testRegexp() throws IOException {
+    IntervalsSource source = Intervals.regexp(new BytesRef(".ot"));
+    checkIntervals(
+        source,
+        "field1",
+        4, // NOTE(review): presumably the number of documents expected to match - confirm
+        new int[][] { // rows appear to be per-document (start, end) position pairs - confirm
+          {},
+          {2, 2, 10, 10, 17, 17, 27, 27},
+          {5, 5, 10, 10, 21, 21},
+          {3, 3},
+          {2, 2, 10, 10, 17, 17},
+          {}
+        });
+    MatchesIterator mi = getMatches(source, 4, "field1"); // row 4 above: 2-2, 10-10, 17-17
+    assertNotNull(mi);
+    assertMatch(mi, 2, 2, 15, 18); // NOTE(review): last two args look like char offsets - confirm
+    assertMatch(mi, 10, 10, 63, 66);
+    assertMatch(mi, 17, 17, 97, 100);
+    // A limit of one expansion is too small for this regexp, so building intervals must fail.
+    IllegalStateException e =
+        expectThrows(
+            IllegalStateException.class,
+            () -> {
+              IntervalsSource s = Intervals.regexp(new BytesRef(".ot"), 1);
+              for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
+                s.intervals("field1", ctx);
+              }
+            });
+    assertEquals("Automaton [.ot] expanded to too many terms (limit 1)", e.getMessage());
+
+    checkVisits(Intervals.regexp(new BytesRef("p.*")), 1);
+  }
+
   public void testFuzzyTerm() throws IOException {
     IntervalsSource source = Intervals.fuzzyTerm("kot", 1); // matches 'pot'
     checkIntervals(
@@ -1070,6 +1104,41 @@ public void testFuzzyTerm() throws IOException {
     checkVisits(Intervals.fuzzyTerm("kot", FuzzyQuery.defaultMaxEdits), 1);
   }
 
+  public void testRange() throws IOException {
+    IntervalsSource source = Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true);
+    checkIntervals(
+        source,
+        "field1",
+        6, // NOTE(review): presumably the number of documents expected to match - confirm
+        new int[][] { // rows appear to be per-document (start, end) position pairs - confirm
+          {5, 5},
+          {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 29, 29},
+          {2, 2, 5, 5, 12, 12, 17, 17, 21, 21, 27, 27},
+          {1, 1, 3, 3, 4, 4},
+          {2, 2, 5, 5, 17, 17},
+          {2, 2}
+        });
+    MatchesIterator mi = getMatches(source, 3, "field1"); // row 3 above: 1-1, 3-3, 4-4
+    assertNotNull(mi);
+    assertMatch(mi, 1, 1, 4, 8); // NOTE(review): last two args look like char offsets - confirm
+    assertMatch(mi, 3, 3, 15, 18);
+    assertMatch(mi, 4, 4, 19, 24);
+    // A limit of one expansion is too small for this range, so building intervals must fail.
+    IllegalStateException e =
+        expectThrows(
+            IllegalStateException.class,
+            () -> {
+              IntervalsSource s =
+                  Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true, 1);
+              for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
+                s.intervals("field1", ctx);
+              }
+            });
+    assertEquals("Automaton [{cold,hot}] expanded to too many terms (limit 1)", e.getMessage());
+
+    checkVisits(source, 1);
+  }
+
   public void testWrappedFilters() throws IOException {
     IntervalsSource source =
         Intervals.or(