Skip to content

Commit

Permalink
UnifiedHighlighter highlight on multiple fields
Browse files Browse the repository at this point in the history
Add ability to highlight on multiple matched fields for UnifiedHighlighter.

FastVectorHighlighter for a long time has had an option to highlight
 a single field based on matches from several fields.
 But UnifiedHighlighter was missing this option. This adds this ability.
  • Loading branch information
mayya-sharipova committed Apr 4, 2024
1 parent 75e1ebc commit 147b57e
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 99 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,8 @@ protected OffsetSource getOffsetSource(String field) {
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
reset(searcher);
Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
Map<String, String[]> result =
highlighter.highlightFields(fields, q, hits, maxPassages, null);
preventOptimizeAway = result.size();
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReader;

/**
 * FieldOffsetStrategy that combines offsets from multiple fields. Used to highlight a single field
 * based on matches from multiple fields.
 *
 * <p>Each call to {@link #getOffsetsEnum} asks every wrapped strategy for its offsets and merges
 * the non-empty results into a single {@link OffsetsEnum.MultiOffsetsEnum}.
 *
 * @lucene.internal
 */
public class MultiFieldsOffsetStrategy extends FieldOffsetStrategy {
  /** Immutable snapshot of the per-field strategies whose offsets are combined; never empty. */
  private final List<FieldOffsetStrategy> fieldsOffsetStrategies;

  /**
   * Creates a strategy that delegates to the given per-field strategies.
   *
   * @param fieldsOffsetStrategies the strategies of the fields whose matches are combined; must
   *     contain at least one element and no {@code null} elements
   * @throws IllegalArgumentException if {@code fieldsOffsetStrategies} is {@code null} or empty
   */
  public MultiFieldsOffsetStrategy(List<FieldOffsetStrategy> fieldsOffsetStrategies) {
    // Nothing field-specific is handed to the superclass: this strategy spans several fields and
    // answers every query by delegating to the wrapped strategies.
    super(null);
    // Fail fast: an empty list would otherwise only surface later as a bare
    // NoSuchElementException from getOffsetSource().
    if (fieldsOffsetStrategies == null || fieldsOffsetStrategies.isEmpty()) {
      throw new IllegalArgumentException("fieldsOffsetStrategies must not be null or empty");
    }
    // Snapshot the list so later changes by the caller cannot alter this strategy.
    this.fieldsOffsetStrategies = List.copyOf(fieldsOffsetStrategies);
  }

  /**
   * Always fails, because this strategy is not tied to one field.
   *
   * @throws IllegalStateException always
   */
  @Override
  public String getField() {
    throw new IllegalStateException("MultiFieldsOffsetStrategy does not have a single field.");
  }

  /**
   * Returns the offset source reported by the first wrapped strategy.
   *
   * @return the offset source of the first per-field strategy
   */
  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    // TODO: what should be returned here as offset source?
    // The wrapped strategies may report different sources; only the first one is consulted.
    return fieldsOffsetStrategies.getFirst().getOffsetSource();
  }

  /**
   * Collects the offsets of every wrapped strategy for the given document and merges them.
   *
   * @param reader the leaf reader, passed through unchanged to each wrapped strategy
   * @param docId the document id, passed through unchanged to each wrapped strategy
   * @param content the field content, passed through unchanged to each wrapped strategy
   * @return a {@link OffsetsEnum.MultiOffsetsEnum} over the non-empty per-field enums
   * @throws IOException if a wrapped strategy or the merged enum fails with an I/O error
   */
  @Override
  public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
      throws IOException {
    List<OffsetsEnum> fieldsOffsetsEnums = new ArrayList<>(fieldsOffsetStrategies.size());
    for (FieldOffsetStrategy fieldOffsetStrategy : fieldsOffsetStrategies) {
      OffsetsEnum offsetsEnum = fieldOffsetStrategy.getOffsetsEnum(reader, docId, content);
      // Identity check against the shared EMPTY instance: fields without matches are left out.
      if (offsetsEnum != OffsetsEnum.EMPTY) {
        fieldsOffsetsEnums.add(offsetsEnum);
      }
    }
    return new OffsetsEnum.MultiOffsetsEnum(fieldsOffsetsEnums);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ protected FieldInfo getFieldInfo(String field) {
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public String[] highlight(String field, Query query, TopDocs topDocs) throws IOException {
return highlight(field, query, topDocs, 1);
return highlight(field, query, topDocs, 1, null);
}

/**
Expand All @@ -712,17 +712,21 @@ public String[] highlight(String field, Query query, TopDocs topDocs) throws IOE
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
* snippets.
* @param matchedFields fields whose matches are combined to highlight the given field, or
*     {@code null} to highlight using only matches from the field itself
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. If
* no highlights were found for a document, the first {@code maxPassages} sentences from the
* field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without {@link
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public String[] highlight(String field, Query query, TopDocs topDocs, int maxPassages)
public String[] highlight(
String field, Query query, TopDocs topDocs, int maxPassages, Set<String> matchedFields)
throws IOException {
List<Set<String>> matchedFieldsIn = matchedFields == null ? null : List.of(matchedFields);
Map<String, String[]> res =
highlightFields(new String[] {field}, query, topDocs, new int[] {maxPassages});
highlightFields(
new String[] {field}, query, topDocs, new int[] {maxPassages}, matchedFieldsIn);
return res.get(field);
}

Expand Down Expand Up @@ -753,7 +757,7 @@ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDo
throws IOException {
int[] maxPassages = new int[fields.length];
Arrays.fill(maxPassages, 1);
return highlightFields(fields, query, topDocs, maxPassages);
return highlightFields(fields, query, topDocs, maxPassages, null);
}

/**
Expand All @@ -774,6 +778,7 @@ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDo
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages per-field used to form the
* highlighted snippets.
* @param matchedFields per-field sets of fields whose matches are combined for highlighting, or
*     {@code null} to highlight each field using only its own matches
* @return Map keyed on field name, containing the array of formatted snippets corresponding to
* the documents in <code>topDocs</code>. If no highlights were found for a document, the
* first {@code maxPassages} sentences from the field will be returned.
Expand All @@ -782,14 +787,19 @@ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDo
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String, String[]> highlightFields(
String[] fields, Query query, TopDocs topDocs, int[] maxPassages) throws IOException {
String[] fields,
Query query,
TopDocs topDocs,
int[] maxPassages,
List<Set<String>> matchedFields)
throws IOException {
final ScoreDoc[] scoreDocs = topDocs.scoreDocs;
int[] docids = new int[scoreDocs.length];
for (int i = 0; i < docids.length; i++) {
docids[i] = scoreDocs[i].doc;
}

return highlightFields(fields, query, docids, maxPassages);
return highlightFields(fields, query, docids, maxPassages, matchedFields);
}

/**
Expand All @@ -800,6 +810,7 @@ public Map<String, String[]> highlightFields(
* @param docidsIn containing the document IDs to highlight.
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
* highlighted snippets.
* @param matchedFieldsIn per-field sets of fields whose matches are combined for highlighting,
*     or {@code null} to highlight each field using only its own matches
* @return Map keyed on field name, containing the array of formatted snippets corresponding to
* the documents in <code>docidsIn</code>. If no highlights were found for a document, the
* first {@code maxPassages} from the field will be returned.
Expand All @@ -808,10 +819,16 @@ public Map<String, String[]> highlightFields(
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String, String[]> highlightFields(
String[] fieldsIn, Query query, int[] docidsIn, int[] maxPassagesIn) throws IOException {
String[] fieldsIn,
Query query,
int[] docidsIn,
int[] maxPassagesIn,
List<Set<String>> matchedFieldsIn)
throws IOException {
Map<String, String[]> snippets = new HashMap<>();
for (Map.Entry<String, Object[]> ent :
highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn).entrySet()) {
highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn, matchedFieldsIn)
.entrySet()) {
Object[] snippetObjects = ent.getValue();
String[] snippetStrings = new String[snippetObjects.length];
snippets.put(ent.getKey(), snippetStrings);
Expand All @@ -836,6 +853,7 @@ public Map<String, String[]> highlightFields(
* @param docIdsIn containing the document IDs to highlight.
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
* highlighted snippets.
* @param matchedFieldsIn per-field sets of fields whose matches are combined for highlighting,
*     or {@code null} to highlight each field using only its own matches
* @return Map keyed on field name, containing the array of formatted snippets corresponding to
* the documents in <code>docIdsIn</code>. If no highlights were found for a document, the
* first {@code maxPassages} from the field will be returned.
Expand All @@ -844,7 +862,12 @@ public Map<String, String[]> highlightFields(
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
protected Map<String, Object[]> highlightFieldsAsObjects(
String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
String[] fieldsIn,
Query query,
int[] docIdsIn,
int[] maxPassagesIn,
List<Set<String>> matchedFieldsIn)
throws IOException {
if (fieldsIn.length < 1) {
throw new IllegalArgumentException("fieldsIn must not be empty");
}
Expand Down Expand Up @@ -877,7 +900,12 @@ protected Map<String, Object[]> highlightFieldsAsObjects(
int numPostings = 0;
for (int f = 0; f < fields.length; f++) {
FieldHighlighter fieldHighlighter =
getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
getFieldHighlighter(
fields[f],
query,
queryTerms,
maxPassages[f],
matchedFieldsIn == null ? null : matchedFieldsIn.get(f));
fieldHighlighters[f] = fieldHighlighter;

switch (fieldHighlighter.getOffsetSource()) {
Expand Down Expand Up @@ -1059,17 +1087,30 @@ public Object highlightWithoutSearcher(String field, Query query, String content
}
Objects.requireNonNull(content, "content is required");
Set<Term> queryTerms = extractTerms(query);
return getFieldHighlighter(field, query, queryTerms, maxPassages)
return getFieldHighlighter(field, query, queryTerms, maxPassages, null)
.highlightFieldForDoc(null, -1, content);
}

protected FieldHighlighter getFieldHighlighter(
String field, Query query, Set<Term> allTerms, int maxPassages) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
String field, Query query, Set<Term> allTerms, int maxPassages, Set<String> matchedFields) {

FieldOffsetStrategy fieldOffsetStrategy;
if (matchedFields == null) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
} else {
List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(matchedFields.size());
for (String matchedField : matchedFields) {
UHComponents components = getHighlightComponents(matchedField, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
}
fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
}
return newFieldHighlighter(
field,
getOffsetStrategy(offsetSource, components),
fieldOffsetStrategy,
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,
Expand Down
Loading

0 comments on commit 147b57e

Please sign in to comment.