UnifiedHighlighter highlight on multiple fields

Add the ability for UnifiedHighlighter to highlight a field based on matches from multiple fields. FastVectorHighlighter has long had an option to highlight a single field based on matches from several fields, but UnifiedHighlighter was missing this option. This change adds it.
apache · Apr 4, 2024 · 147b57e · 147b57e
1 parent 75e1ebc
commit 147b57e
Show file tree

Hide file tree

Showing 10 changed files with 329 additions and 99 deletions.
diff --git a/...nchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java b/...nchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
@@ -310,7 +310,8 @@ protected OffsetSource getOffsetSource(String field) {
     @Override
     public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
       reset(searcher);
-      Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
+      Map<String, String[]> result =
+          highlighter.highlightFields(fields, q, hits, maxPassages, null);
       preventOptimizeAway = result.size();
     }
   }

diff --git a/...e/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiFieldsOffsetStrategy.java b/...e/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiFieldsOffsetStrategy.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.index.LeafReader;
+
+/**
+ * FieldOffsetStrategy that combines offsets from multiple fields. Used to highlight a single field
+ * based on matches from multiple fields.
+ *
+ * @lucene.internal
+ */
+public class MultiFieldsOffsetStrategy extends FieldOffsetStrategy {
+  private final List<FieldOffsetStrategy> fieldsOffsetStrategies;
+
+  public MultiFieldsOffsetStrategy(List<FieldOffsetStrategy> fieldsOffsetStrategies) {
+    super(null);
+    this.fieldsOffsetStrategies = fieldsOffsetStrategies;
+  }
+
+  @Override
+  public String getField() {
+    throw new IllegalStateException("MultiFieldsOffsetStrategy does not have a single field.");
+  }
+
+  @Override
+  public UnifiedHighlighter.OffsetSource getOffsetSource() {
+    // TODO: what should be returned here as offset source?
+    return fieldsOffsetStrategies.getFirst().getOffsetSource();
+  }
+
+  @Override
+  public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
+      throws IOException {
+    List<OffsetsEnum> fieldsOffsetsEnums = new ArrayList<>(fieldsOffsetStrategies.size());
+    for (FieldOffsetStrategy fieldOffsetStrategy : fieldsOffsetStrategies) {
+      OffsetsEnum offsetsEnum = fieldOffsetStrategy.getOffsetsEnum(reader, docId, content);
+      if (offsetsEnum != OffsetsEnum.EMPTY) {
+        fieldsOffsetsEnums.add(offsetsEnum);
+      }
+    }
+    return new OffsetsEnum.MultiOffsetsEnum(fieldsOffsetsEnums);
+  }
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@@ -701,7 +701,7 @@ protected FieldInfo getFieldInfo(String field) {
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public String[] highlight(String field, Query query, TopDocs topDocs) throws IOException {
-    return highlight(field, query, topDocs, 1);
+    return highlight(field, query, topDocs, 1, null);
   }
 
   /**
@@ -712,17 +712,21 @@ public String[] highlight(String field, Query query, TopDocs topDocs) throws IOE
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
    *     snippets.
+   * @param matchedFields fields whose matches are combined to highlight the given field
    * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. If
    *     no highlights were found for a document, the first {@code maxPassages} sentences from the
    *     field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
-  public String[] highlight(String field, Query query, TopDocs topDocs, int maxPassages)
+  public String[] highlight(
+      String field, Query query, TopDocs topDocs, int maxPassages, Set<String> matchedFields)
       throws IOException {
+    List<Set<String>> matchedFieldsIn = matchedFields == null ? null : List.of(matchedFields);
     Map<String, String[]> res =
-        highlightFields(new String[] {field}, query, topDocs, new int[] {maxPassages});
+        highlightFields(
+            new String[] {field}, query, topDocs, new int[] {maxPassages}, matchedFieldsIn);
     return res.get(field);
   }
 
@@ -753,7 +757,7 @@ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDo
       throws IOException {
     int[] maxPassages = new int[fields.length];
     Arrays.fill(maxPassages, 1);
-    return highlightFields(fields, query, topDocs, maxPassages);
+    return highlightFields(fields, query, topDocs, maxPassages, null);
   }
 
   /**
@@ -774,6 +778,7 @@ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDo
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @param maxPassages The maximum number of top-N ranked passages per-field used to form the
    *     highlighted snippets.
+   * @param matchedFields fields whose matches are combined to highlight, per-field
    * @return Map keyed on field name, containing the array of formatted snippets corresponding to
    *     the documents in <code>topDocs</code>. If no highlights were found for a document, the
    *     first {@code maxPassages} sentences from the field will be returned.
@@ -782,14 +787,19 @@ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDo
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public Map<String, String[]> highlightFields(
-      String[] fields, Query query, TopDocs topDocs, int[] maxPassages) throws IOException {
+      String[] fields,
+      Query query,
+      TopDocs topDocs,
+      int[] maxPassages,
+      List<Set<String>> matchedFields)
+      throws IOException {
     final ScoreDoc[] scoreDocs = topDocs.scoreDocs;
     int[] docids = new int[scoreDocs.length];
     for (int i = 0; i < docids.length; i++) {
       docids[i] = scoreDocs[i].doc;
     }
 
-    return highlightFields(fields, query, docids, maxPassages);
+    return highlightFields(fields, query, docids, maxPassages, matchedFields);
   }
 
   /**
@@ -800,6 +810,7 @@ public Map<String, String[]> highlightFields(
    * @param docidsIn containing the document IDs to highlight.
    * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
    *     highlighted snippets.
+   * @param matchedFieldsIn fields whose matches are combined to highlight, per-field
    * @return Map keyed on field name, containing the array of formatted snippets corresponding to
    *     the documents in <code>docidsIn</code>. If no highlights were found for a document, the
    *     first {@code maxPassages} from the field will be returned.
@@ -808,10 +819,16 @@ public Map<String, String[]> highlightFields(
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public Map<String, String[]> highlightFields(
-      String[] fieldsIn, Query query, int[] docidsIn, int[] maxPassagesIn) throws IOException {
+      String[] fieldsIn,
+      Query query,
+      int[] docidsIn,
+      int[] maxPassagesIn,
+      List<Set<String>> matchedFieldsIn)
+      throws IOException {
     Map<String, String[]> snippets = new HashMap<>();
     for (Map.Entry<String, Object[]> ent :
-        highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn).entrySet()) {
+        highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn, matchedFieldsIn)
+            .entrySet()) {
       Object[] snippetObjects = ent.getValue();
       String[] snippetStrings = new String[snippetObjects.length];
       snippets.put(ent.getKey(), snippetStrings);
@@ -836,6 +853,7 @@ public Map<String, String[]> highlightFields(
    * @param docIdsIn containing the document IDs to highlight.
    * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
    *     highlighted snippets.
+   * @param matchedFieldsIn fields whose matches are combined to highlight, per-field
    * @return Map keyed on field name, containing the array of formatted snippets corresponding to
    *     the documents in <code>docIdsIn</code>. If no highlights were found for a document, the
    *     first {@code maxPassages} from the field will be returned.
@@ -844,7 +862,12 @@ public Map<String, String[]> highlightFields(
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   protected Map<String, Object[]> highlightFieldsAsObjects(
-      String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
+      String[] fieldsIn,
+      Query query,
+      int[] docIdsIn,
+      int[] maxPassagesIn,
+      List<Set<String>> matchedFieldsIn)
+      throws IOException {
     if (fieldsIn.length < 1) {
       throw new IllegalArgumentException("fieldsIn must not be empty");
     }
@@ -877,7 +900,12 @@ protected Map<String, Object[]> highlightFieldsAsObjects(
     int numPostings = 0;
     for (int f = 0; f < fields.length; f++) {
       FieldHighlighter fieldHighlighter =
-          getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
+          getFieldHighlighter(
+              fields[f],
+              query,
+              queryTerms,
+              maxPassages[f],
+              matchedFieldsIn == null ? null : matchedFieldsIn.get(f));
       fieldHighlighters[f] = fieldHighlighter;
 
       switch (fieldHighlighter.getOffsetSource()) {
@@ -1059,17 +1087,30 @@ public Object highlightWithoutSearcher(String field, Query query, String content
     }
     Objects.requireNonNull(content, "content is required");
     Set<Term> queryTerms = extractTerms(query);
-    return getFieldHighlighter(field, query, queryTerms, maxPassages)
+    return getFieldHighlighter(field, query, queryTerms, maxPassages, null)
         .highlightFieldForDoc(null, -1, content);
   }
 
   protected FieldHighlighter getFieldHighlighter(
-      String field, Query query, Set<Term> allTerms, int maxPassages) {
-    UHComponents components = getHighlightComponents(field, query, allTerms);
-    OffsetSource offsetSource = getOptimizedOffsetSource(components);
+      String field, Query query, Set<Term> allTerms, int maxPassages, Set<String> matchedFields) {
+
+    FieldOffsetStrategy fieldOffsetStrategy;
+    if (matchedFields == null) {
+      UHComponents components = getHighlightComponents(field, query, allTerms);
+      OffsetSource offsetSource = getOptimizedOffsetSource(components);
+      fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
+    } else {
+      List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(matchedFields.size());
+      for (String matchedField : matchedFields) {
+        UHComponents components = getHighlightComponents(matchedField, query, allTerms);
+        OffsetSource offsetSource = getOptimizedOffsetSource(components);
+        fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
+      }
+      fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
+    }
     return newFieldHighlighter(
         field,
-        getOffsetStrategy(offsetSource, components),
+        fieldOffsetStrategy,
         new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
         getScorer(field),
         maxPassages,