Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UnifiedHighlighter highlight on multiple fields #13268

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,10 @@ New Features
* GITHUB#13197: Expand support for new scalar bit levels for HNSW vectors. This includes 4-bit vectors and an option
to compress them to gain a 50% reduction in memory usage. (Ben Trent)

* GITHUB#13268: Add ability for UnifiedHighlighter to highlight a field based on combined matches from multiple fields.
(Mayya Sharipova, Jim Ferenczi)


Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReader;

/**
 * FieldOffsetStrategy that combines offsets from multiple fields. Used to highlight a single field
 * based on matches from multiple fields.
 *
 * <p>The delegate strategies are snapshotted at construction time, so later changes to the list
 * passed by the caller do not affect this instance.
 *
 * @lucene.internal
 */
public class MultiFieldsOffsetStrategy extends FieldOffsetStrategy {
  private final List<FieldOffsetStrategy> fieldsOffsetStrategies;

  /**
   * Creates a strategy that merges the offsets produced by the given per-field strategies.
   *
   * @param fieldsOffsetStrategies the per-field strategies to combine; must be non-null, non-empty
   *     and must not contain null elements
   * @throws NullPointerException if the list or any of its elements is null
   * @throws IllegalArgumentException if the list is empty
   */
  public MultiFieldsOffsetStrategy(List<FieldOffsetStrategy> fieldsOffsetStrategies) {
    // This strategy is not bound to the components of one field, hence null.
    // NOTE(review): relies on the superclass constructor tolerating null -- confirm.
    super(null);
    // Immutable snapshot: protects against the caller mutating the list afterwards.
    this.fieldsOffsetStrategies = List.copyOf(fieldsOffsetStrategies);
    // Fail fast here instead of with NoSuchElementException later in getOffsetSource().
    if (this.fieldsOffsetStrategies.isEmpty()) {
      throw new IllegalArgumentException("fieldsOffsetStrategies must not be empty");
    }
  }

  /**
   * Always throws, because this strategy spans several fields.
   *
   * @throws IllegalStateException always
   */
  @Override
  public String getField() {
    throw new IllegalStateException("MultiFieldsOffsetStrategy does not have a single field.");
  }

  /**
   * Returns the offset source of the first delegate strategy.
   *
   * @return the offset source reported by the first strategy in the list
   */
  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    // TODO: what should be returned here as offset source?
    return fieldsOffsetStrategies.getFirst().getOffsetSource();
  }

  /**
   * Asks every delegate strategy for its offsets in the given document and merges all non-empty
   * results into a single {@link OffsetsEnum.MultiOffsetsEnum}.
   *
   * @param reader the leaf reader passed through to each delegate
   * @param docId the document id passed through to each delegate
   * @param content the field content passed through to each delegate
   * @return a combined enum over the non-empty per-field enums (possibly wrapping none)
   * @throws IOException if a delegate fails; enums obtained before the failure are closed first
   */
  @Override
  public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
      throws IOException {
    List<OffsetsEnum> fieldsOffsetsEnums = new ArrayList<>(fieldsOffsetStrategies.size());
    try {
      for (FieldOffsetStrategy fieldOffsetStrategy : fieldsOffsetStrategies) {
        OffsetsEnum offsetsEnum = fieldOffsetStrategy.getOffsetsEnum(reader, docId, content);
        // Skip the shared EMPTY sentinel; it contributes nothing to the merge.
        if (offsetsEnum != OffsetsEnum.EMPTY) {
          fieldsOffsetsEnums.add(offsetsEnum);
        }
      }
      return new OffsetsEnum.MultiOffsetsEnum(fieldsOffsetsEnums);
    } catch (IOException | RuntimeException e) {
      // Ownership was never handed to a MultiOffsetsEnum, so release what was opened so far.
      closeAfterFailure(fieldsOffsetsEnums, e);
      throw e;
    }
  }

  /** Closes the given enums, attaching any close failure to {@code failure} as suppressed. */
  private static void closeAfterFailure(List<OffsetsEnum> offsetsEnums, Exception failure) {
    for (OffsetsEnum offsetsEnum : offsetsEnums) {
      try {
        offsetsEnum.close();
      } catch (IOException | RuntimeException suppressed) {
        failure.addSuppressed(suppressed);
      }
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
Expand Down Expand Up @@ -122,6 +123,8 @@ public class UnifiedHighlighter {

private Predicate<String> fieldMatcher;

private final Function<String, Set<String>> maskedFieldsFunc;

private Set<HighlightFlag> flags;

// e.g. wildcards
Expand Down Expand Up @@ -162,6 +165,7 @@ public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
Objects.requireNonNull(
indexAnalyzer,
"indexAnalyzer is required" + " (even if in some circumstances it isn't used)");
this.maskedFieldsFunc = null;
}

@Deprecated
Expand Down Expand Up @@ -256,6 +260,8 @@ public static class Builder {

private final Analyzer indexAnalyzer;
private Predicate<String> fieldMatcher;

private Function<String, Set<String>> maskedFieldsFunc;
private Set<HighlightFlag> flags;
private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
Expand Down Expand Up @@ -360,6 +366,22 @@ public Builder withFieldMatcher(Predicate<String> value) {
return this;
}

/**
* Set up a function that, given a field, returns a set of masked fields whose matches are
* combined to highlight the given field. Masked fields should not include the original field.
* This is useful when you want to highlight a field based on matches from several fields.
*
* <p>Note: All masked fields must share the same source as the field being highlighted,
* otherwise their offsets will not correspond to the highlighted field.
*
* <p>Note: Only the field being highlighted must provide an original source value (e.g. through
* stored field), other masked fields don't need it.
*
* @param maskedFieldsFunc function mapping a highlighted field name to its masked fields; the
*     value is stored as given, without validation
* @return this builder, to allow call chaining
*/
public Builder withMaskedFieldsFunc(Function<String, Set<String>> maskedFieldsFunc) {
this.maskedFieldsFunc = maskedFieldsFunc;
return this;
}

public Builder withScorer(PassageScorer value) {
this.scorer = value;
return this;
Expand Down Expand Up @@ -436,6 +458,7 @@ public UnifiedHighlighter(Builder builder) {
this.maxLength = builder.maxLength;
this.breakIterator = builder.breakIterator;
this.fieldMatcher = builder.fieldMatcher;
this.maskedFieldsFunc = builder.maskedFieldsFunc;
this.scorer = builder.scorer;
this.formatter = builder.formatter;
this.maxNoHighlightPassages = builder.maxNoHighlightPassages;
Expand Down Expand Up @@ -543,6 +566,10 @@ protected Predicate<String> getFieldMatcher(String field) {
}
}

/**
 * Returns the masked fields configured for the given field.
 *
 * @param field the name of the field being highlighted
 * @return the result of applying the configured masked-fields function to {@code field}, or
 *     {@code null} when no such function has been configured
 */
protected Set<String> getMaskedFields(String field) {
  if (maskedFieldsFunc != null) {
    return maskedFieldsFunc.apply(field);
  }
  return null;
}

/** Returns the {@link HighlightFlag}s applicable for the current UH instance. */
protected Set<HighlightFlag> getFlags(String field) {
// If a builder is used for initializing a UH object, then flags will never be null.
Expand Down Expand Up @@ -1065,11 +1092,29 @@ public Object highlightWithoutSearcher(String field, Query query, String content

protected FieldHighlighter getFieldHighlighter(
String field, Query query, Set<Term> allTerms, int maxPassages) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
Set<String> maskedFields = getMaskedFields(field);
FieldOffsetStrategy fieldOffsetStrategy;
if (maskedFields == null || maskedFields.isEmpty()) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
} else {
List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(maskedFields.size() + 1);
for (String maskedField : maskedFields) {
UHComponents components = getHighlightComponents(maskedField, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
}
// adding original field as well
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));

fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
}
return newFieldHighlighter(
field,
getOffsetStrategy(offsetSource, components),
fieldOffsetStrategy,
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,20 @@
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
Expand All @@ -54,8 +61,13 @@
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
@ParametersFactory
Expand Down Expand Up @@ -1337,6 +1349,121 @@ public void testFieldMatcherMultiTermQuery() throws Exception {
ir.close();
}

/**
 * Indexes the same text into one stored field and three unstored fields, each analyzed
 * differently, then checks that the stored field is highlighted from the matches of the masked
 * field(s) selected for each case.
 */
public void testMaskedFields() throws IOException {
  final String highlightedField = "field";
  final String text = "dance with star";

  // One analyzer per field name; the wrapper below dispatches on the field name.
  final Map<String, Analyzer> analyzersByField = new TreeMap<>();
  analyzersByField.put(highlightedField, new WhitespaceAnalyzer());
  // English stemming and stopwords
  analyzersByField.put("field_english", new EnglishAnalyzer());
  // Each letter is a token
  analyzersByField.put(
      "field_characters",
      new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
  // Every three letters is a token
  analyzersByField.put(
      "field_tripples",
      new MockAnalyzer(
          random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));

  final Analyzer perFieldAnalyzer =
      new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        public Analyzer getWrappedAnalyzer(String fieldName) {
          return analyzersByField.get(fieldName);
        }
      };

  // Matched (masked) fields don't need to be stored.
  final FieldType unstoredType = new FieldType(fieldType);
  unstoredType.setStored(false);
  unstoredType.freeze();

  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter =
        new IndexWriter(directory, newIndexWriterConfig(perFieldAnalyzer))) {
      Document document = new Document();
      document.add(new Field(highlightedField, text, fieldType));
      for (String maskedName :
          new String[] {"field_english", "field_characters", "field_tripples"}) {
        document.add(new Field(maskedName, text, unstoredType));
      }
      indexWriter.addDocument(document);
    }

    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final String shortQuery = "danc";

      // field is highlighted based on the matches from the "field_english"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          highlightedField,
          Set.of("field_english"),
          "dancing with the stars",
          "<b>dance with star</b>",
          "<b>dance</b> with <b>star</b>");

      // field is highlighted based on the matches from the "field_characters"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          highlightedField,
          Set.of("field_characters"),
          shortQuery,
          "<b>danc</b>e with star",
          "<b>d</b><b>a</b><b>n</b><b>c</b>e with star");

      // field is highlighted based on the matches from the "field_tripples"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          highlightedField,
          Set.of("field_tripples"),
          shortQuery,
          "<b>dan</b>ce with star",
          "<b>dan</b>ce with star");

      // field is highlighted based on the matches from the "field_characters" and
      // "field_tripples"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          indexSearcher,
          highlightedField,
          Set.of("field_tripples", "field_characters"),
          shortQuery,
          "<b>danc</b>e with star",
          "<b>da</b><b>n</b><b>c</b>e with star");
    }
  }
}

/**
 * Builds a disjunction of sloppy phrase queries over {@code field} and every masked field,
 * checks that it matches exactly one document, then highlights {@code field} with a highlighter
 * configured to use {@code maskedFields} and compares the single snippet to the expectation.
 *
 * @param analyzer analyzer used both to build the queries and by the highlighter
 * @param searcher searcher over the test index
 * @param field the field to highlight
 * @param maskedFields the fields whose matches are combined when highlighting {@code field}
 * @param queryText text turned into a phrase query (slop 2) for each field
 * @param expectedSnippetWithWeightMatches expected snippet when the highlighter's flags contain
 *     {@link HighlightFlag#WEIGHT_MATCHES}
 * @param expectedSnippetWithoutWeightMatches expected snippet otherwise
 */
private static void maskedFieldsTestCase(
    Analyzer analyzer,
    IndexSearcher searcher,
    String field,
    Set<String> maskedFields,
    String queryText,
    String expectedSnippetWithWeightMatches,
    String expectedSnippetWithoutWeightMatches)
    throws IOException {
  final QueryBuilder phraseFactory = new QueryBuilder(analyzer);
  final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
  // The highlighted field goes first, followed by each masked field.
  disjunction.add(
      phraseFactory.createPhraseQuery(field, queryText, 2), BooleanClause.Occur.SHOULD);
  for (String maskedField : maskedFields) {
    disjunction.add(
        phraseFactory.createPhraseQuery(maskedField, queryText, 2), BooleanClause.Occur.SHOULD);
  }
  final Query query = disjunction.build();

  final TopDocs hits = searcher.search(query, 10);
  assertEquals(1, hits.totalHits.value);

  // Only the highlighted field has masked fields; every other field gets none.
  final Function<String, Set<String>> maskedFieldsFunc =
      fieldName -> {
        if (fieldName.equals(field)) {
          return maskedFields;
        }
        return Collections.emptySet();
      };
  final UnifiedHighlighter highlighter =
      randomUnifiedHighlighter(
          new UnifiedHighlighter.Builder(searcher, analyzer)
              .withMaskedFieldsFunc(maskedFieldsFunc),
          EnumSet.of(HighlightFlag.PHRASES),
          random().nextBoolean());

  final String[] snippets = highlighter.highlight(field, query, hits, 10);

  // The expectation depends on whether WEIGHT_MATCHES ended up among the highlighter's flags.
  final String expectedSnippet;
  if (highlighter.getFlags(field).contains(HighlightFlag.WEIGHT_MATCHES)) {
    expectedSnippet = expectedSnippetWithWeightMatches;
  } else {
    expectedSnippet = expectedSnippetWithoutWeightMatches;
  }
  assertEquals(1, snippets.length);
  assertEquals(expectedSnippet, snippets[0]);
}

public void testMatchesSlopBug() throws IOException {
IndexReader ir = indexSomeFields();
IndexSearcher searcher = newSearcher(ir);
Expand Down