Skip to content

Commit

Permalink
UnifiedHighlighter highlight on multiple fields
Browse files Browse the repository at this point in the history
Add ability to UnifiedHighlighter to combine matches from multiple fields
to highlight a single field.

FastVectorHighlighter has long had an option to highlight a single field
based on matches from several fields, but UnifiedHighlighter was missing this option.
This commit adds that ability.
  • Loading branch information
mayya-sharipova committed Apr 8, 2024
1 parent 75e1ebc commit 30a63b0
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReader;

/**
 * FieldOffsetStrategy that combines offsets from multiple fields. Used to highlight a single field
 * based on matches from multiple fields.
 *
 * <p>The strategy holds one delegate {@link FieldOffsetStrategy} per matched field. For every
 * document it asks each delegate for its {@link OffsetsEnum} and wraps the non-empty ones in a
 * single {@link OffsetsEnum.MultiOffsetsEnum}.
 *
 * @lucene.internal
 */
public class MultiFieldsOffsetStrategy extends FieldOffsetStrategy {
  /** Immutable snapshot of the per-field delegate strategies, in the order they were supplied. */
  private final List<FieldOffsetStrategy> fieldsOffsetStrategies;

  /**
   * Creates a strategy that combines the offsets produced by the given per-field strategies.
   *
   * @param fieldsOffsetStrategies the delegate strategies, one per matched field; the list is
   *     copied, so later changes to it by the caller do not affect this strategy
   * @throws NullPointerException if the list or any of its elements is {@code null}
   */
  public MultiFieldsOffsetStrategy(List<FieldOffsetStrategy> fieldsOffsetStrategies) {
    // No single set of highlight components applies to this strategy, so null is handed to the
    // superclass; each delegate was built with its own components.
    super(null);
    // Defensive copy: keeps the delegates stable even if the caller reuses or mutates its list,
    // and fails fast on null input instead of at highlight time.
    this.fieldsOffsetStrategies = List.copyOf(fieldsOffsetStrategies);
  }

  /**
   * Always fails: this strategy spans several fields and therefore has no single field name.
   *
   * @throws IllegalStateException always
   */
  @Override
  public String getField() {
    throw new IllegalStateException("MultiFieldsOffsetStrategy does not have a single field.");
  }

  /**
   * Returns the offset source reported by the first delegate strategy only; the sources of the
   * remaining delegates are not consulted.
   *
   * @throws java.util.NoSuchElementException if this strategy was created with an empty list
   */
  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    // TODO: what should be returned here as offset source?
    return fieldsOffsetStrategies.getFirst().getOffsetSource();
  }

  /**
   * Collects the offsets of every delegate strategy for the given document.
   *
   * @param reader the leaf reader passed through to each delegate
   * @param docId the document id passed through to each delegate
   * @param content the field content passed through to each delegate
   * @return a {@link OffsetsEnum.MultiOffsetsEnum} over the enums of all delegates that did not
   *     return {@link OffsetsEnum#EMPTY}
   * @throws IOException if a delegate fails while producing its offsets
   */
  @Override
  public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
      throws IOException {
    List<OffsetsEnum> fieldsOffsetsEnums = new ArrayList<>(fieldsOffsetStrategies.size());
    for (FieldOffsetStrategy fieldOffsetStrategy : fieldsOffsetStrategies) {
      OffsetsEnum offsetsEnum = fieldOffsetStrategy.getOffsetsEnum(reader, docId, content);
      // Identity check on purpose: only the shared EMPTY sentinel is skipped.
      if (offsetsEnum != OffsetsEnum.EMPTY) {
        fieldsOffsetsEnums.add(offsetsEnum);
      }
    }
    return new OffsetsEnum.MultiOffsetsEnum(fieldsOffsetsEnums);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
Expand Down Expand Up @@ -122,6 +123,8 @@ public class UnifiedHighlighter {

private Predicate<String> fieldMatcher;

private final Function<String, Set<String>> matchedFieldsFunc;

private Set<HighlightFlag> flags;

// e.g. wildcards
Expand Down Expand Up @@ -162,6 +165,7 @@ public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
Objects.requireNonNull(
indexAnalyzer,
"indexAnalyzer is required" + " (even if in some circumstances it isn't used)");
this.matchedFieldsFunc = null;
}

@Deprecated
Expand Down Expand Up @@ -256,6 +260,8 @@ public static class Builder {

private final Analyzer indexAnalyzer;
private Predicate<String> fieldMatcher;

private Function<String, Set<String>> matchedFieldsFunc;
private Set<HighlightFlag> flags;
private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
Expand Down Expand Up @@ -360,6 +366,22 @@ public Builder withFieldMatcher(Predicate<String> value) {
return this;
}

/**
* Set up a function that, given a field, returns a set of fields whose matches are combined to
* highlight the given field. This is useful when you want to highlight a field based on matches
* from several fields.
*
* <p>If the function returns {@code null} or an empty set for a field, that field is highlighted
* from its own matches only.
*
* <p>Note: All matched fields must share the same source as the field being highlighted,
* otherwise their offsets will not correspond to the highlighted field.
*
* <p>Note: Only the field being highlighted must provide an original source value (e.g. through
* stored field), other matched fields don't need it.
*
* @param matchedFieldsFunc maps the name of the field being highlighted to the names of the
*     fields whose matches should be combined for it
* @return this builder, to allow call chaining
*/
public Builder withMatchedFieldsFunc(Function<String, Set<String>> matchedFieldsFunc) {
this.matchedFieldsFunc = matchedFieldsFunc;
return this;
}

public Builder withScorer(PassageScorer value) {
this.scorer = value;
return this;
Expand Down Expand Up @@ -436,6 +458,7 @@ public UnifiedHighlighter(Builder builder) {
this.maxLength = builder.maxLength;
this.breakIterator = builder.breakIterator;
this.fieldMatcher = builder.fieldMatcher;
this.matchedFieldsFunc = builder.matchedFieldsFunc;
this.scorer = builder.scorer;
this.formatter = builder.formatter;
this.maxNoHighlightPassages = builder.maxNoHighlightPassages;
Expand Down Expand Up @@ -543,6 +566,10 @@ protected Predicate<String> getFieldMatcher(String field) {
}
}

/**
 * Returns the fields whose matches should be combined to highlight {@code field}.
 *
 * @param field the field being highlighted
 * @return the result of applying the configured matched-fields function to {@code field}, or
 *     {@code null} when no such function was configured
 */
protected Set<String> getMatchedFields(String field) {
  if (matchedFieldsFunc != null) {
    return matchedFieldsFunc.apply(field);
  }
  // No function configured: callers treat null like "no matched fields".
  return null;
}

/** Returns the {@link HighlightFlag}s applicable for the current UH instance. */
protected Set<HighlightFlag> getFlags(String field) {
// If a builder is used for initializing a UH object, then flags will never be null.
Expand Down Expand Up @@ -1065,11 +1092,24 @@ public Object highlightWithoutSearcher(String field, Query query, String content

protected FieldHighlighter getFieldHighlighter(
String field, Query query, Set<Term> allTerms, int maxPassages) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
Set<String> matchedFields = getMatchedFields(field);
FieldOffsetStrategy fieldOffsetStrategy;
if (matchedFields == null || matchedFields.isEmpty()) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
} else {
List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(matchedFields.size());
for (String matchedField : matchedFields) {
UHComponents components = getHighlightComponents(matchedField, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
}
fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
}
return newFieldHighlighter(
field,
getOffsetStrategy(offsetSource, components),
fieldOffsetStrategy,
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,20 @@
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
Expand All @@ -54,8 +61,13 @@
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
@ParametersFactory
Expand Down Expand Up @@ -1337,6 +1349,114 @@ public void testFieldMatcherMultiTermQuery() throws Exception {
ir.close();
}

/**
 * Indexes a single document whose text is analyzed differently in four fields, then checks that
 * "field" is highlighted using the combined matches of the configured matched fields.
 */
public void testMatchedFields() throws IOException {
  final String text = "dance with star";
  final CharacterRunAutomaton oneCharTokens = new CharacterRunAutomaton(new RegExp(".").toAutomaton());
  final CharacterRunAutomaton threeCharTokens =
      new CharacterRunAutomaton(new RegExp("...").toAutomaton());

  // One analyzer per field; the MockAnalyzers are created in the same order as before so the
  // test's random source is consumed identically.
  final Map<String, Analyzer> analyzersByField = new TreeMap<>();
  analyzersByField.put("field", new WhitespaceAnalyzer());
  // English stemming and stopwords
  analyzersByField.put("field_english", new EnglishAnalyzer());
  // Each letter is a token
  analyzersByField.put("field_characters", new MockAnalyzer(random(), oneCharTokens, true));
  // Every three letters is a token
  analyzersByField.put("field_tripples", new MockAnalyzer(random(), threeCharTokens, true));

  final Analyzer perFieldAnalyzer =
      new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        public Analyzer getWrappedAnalyzer(String fieldName) {
          return analyzersByField.get(fieldName);
        }
      };

  // matched fields don't need to be stored
  final FieldType unstoredType = new FieldType(fieldType);
  unstoredType.setStored(false);
  unstoredType.freeze();

  try (Directory directory = newDirectory()) {
    // The writer is closed before the reader is opened so the document is visible.
    try (IndexWriter indexWriter =
        new IndexWriter(directory, newIndexWriterConfig(perFieldAnalyzer))) {
      Document document = new Document();
      document.add(new Field("field", text, fieldType));
      document.add(new Field("field_english", text, unstoredType));
      document.add(new Field("field_characters", text, unstoredType));
      document.add(new Field("field_tripples", text, unstoredType));
      indexWriter.addDocument(document);
    }

    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      final IndexSearcher indexSearcher = newSearcher(indexReader);

      // field is highlighted based on the matches from the "field_english"
      matchedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          Set.of("field", "field_english"),
          "dancing with the stars",
          "<b>dance with star</b>",
          "<b>dance</b> with <b>star</b>");

      // field is highlighted based on the matches from the "field_characters"
      matchedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          Set.of("field", "field_characters"),
          "danc",
          "<b>danc</b>e with star",
          "<b>d</b><b>a</b><b>n</b><b>c</b>e with star");

      // field is highlighted based on the matches from the "field_tripples"
      matchedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          Set.of("field", "field_tripples"),
          "danc",
          "<b>dan</b>ce with star",
          "<b>dan</b>ce with star");

      // field is highlighted based on the matches from the "field_characters" and
      // "field_tripples"
      matchedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          Set.of("field", "field_tripples", "field_characters"),
          "danc",
          "<b>danc</b>e with star",
          "<b>da</b><b>n</b><b>c</b>e with star");
    }
  }
}

/**
 * Searches with a disjunction of phrase queries (slop 2), one per matched field, and asserts that
 * highlighting "field" yields exactly one snippet equal to the expected text.
 *
 * @param analyzer analyzer used both to build the queries and to configure the highlighter
 * @param searcher searcher over the index under test; the query must hit exactly one document
 * @param matchedFields fields queried, and returned by the matched-fields function for "field"
 * @param queryText text turned into one phrase query per matched field
 * @param expectedSnippetWithWeightMatches expected snippet when the highlighter's flags for
 *     "field" contain {@link HighlightFlag#WEIGHT_MATCHES}
 * @param expectedSnippetWithoutWeightMatches expected snippet otherwise
 */
private static void matchedFieldsTestCase(
    Analyzer analyzer,
    IndexSearcher searcher,
    Set<String> matchedFields,
    String queryText,
    String expectedSnippetWithWeightMatches,
    String expectedSnippetWithoutWeightMatches)
    throws IOException {
  final QueryBuilder phraseBuilder = new QueryBuilder(analyzer);
  final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
  for (String name : matchedFields) {
    disjunction.add(
        phraseBuilder.createPhraseQuery(name, queryText, 2), BooleanClause.Occur.SHOULD);
  }
  final Query query = disjunction.build();

  final TopDocs hits = searcher.search(query, 10);
  assertEquals(1, hits.totalHits.value);

  // Only "field" gets matched fields; every other field falls back to an empty set.
  final Function<String, Set<String>> matchedFieldsFunc =
      fieldName -> {
        if (fieldName.equals("field")) {
          return matchedFields;
        }
        return Collections.emptySet();
      };

  final UnifiedHighlighter.Builder uhBuilder = new UnifiedHighlighter.Builder(searcher, analyzer);
  uhBuilder.withMatchedFieldsFunc(matchedFieldsFunc);
  final boolean randomFlag = random().nextBoolean();
  final UnifiedHighlighter highlighter =
      randomUnifiedHighlighter(uhBuilder, EnumSet.of(HighlightFlag.PHRASES), randomFlag);

  final String[] snippets = highlighter.highlight("field", query, hits, 10);

  // The expected markup depends on which flags the randomized highlighter ended up with.
  final String expectedSnippet;
  if (highlighter.getFlags("field").contains(HighlightFlag.WEIGHT_MATCHES)) {
    expectedSnippet = expectedSnippetWithWeightMatches;
  } else {
    expectedSnippet = expectedSnippetWithoutWeightMatches;
  }
  assertEquals(1, snippets.length);
  assertEquals(expectedSnippet, snippets[0]);
}

public void testMatchesSlopBug() throws IOException {
IndexReader ir = indexSomeFields();
IndexSearcher searcher = newSearcher(ir);
Expand Down

0 comments on commit 30a63b0

Please sign in to comment.