Skip to content

Commit

Permalink
Unified Highlighter to support matched_fields
Browse files Browse the repository at this point in the history
Add support to the unified highlighter to combine matches on multiple fields
to highlight a single field: "matched_fields".

Based on Lucene PR: apache/lucene#13268

The Lucene PR is based on the concept of masked fields, where masked fields
are different from the original highlighted field. This PR in
Elasticsearch uses the already existing highlighter parameter
"matched_fields".

Closes elastic#5172
  • Loading branch information
mayya-sharipova committed Apr 18, 2024
1 parent daa63d2 commit 7c79a3e
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 27 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
setup:
  # The version gate must agree with the release that introduced the feature:
  # matched_fields for the unified highlighter arrives in 8.14.0, so gating on
  # 8.13.0 would run these tests against nodes that do not support it.
  - requires:
      cluster_features: ["gte_v8.14.0"]
      reason: matched_fields for unified highlighter were added in 8.14.0
  # Create an index whose "title" field is also indexed through two sub-fields
  # (english analyzer and a whitespace + edge_ngram analyzer) so that matches on
  # the sub-fields can be combined when highlighting "title".
  - do:
      indices.create:
        index: index1
        body:
          settings:
            analysis:
              filter:
                my_edge_ngram:
                  type: edge_ngram
                  min_gram: 2
                  max_gram: 20
              analyzer:
                my_analyzer:
                  tokenizer: whitespace
                  filter: [ my_edge_ngram ]
          mappings:
            properties:
              title:
                type: text
                fields:
                  english:
                    type: text
                    analyzer: english
                  ngram:
                    type: text
                    analyzer: my_analyzer
              body:
                type: text

  # Index two documents that differ only in their title.
  - do:
      bulk:
        refresh: true
        index: index1
        body:
          - '{"index": {"_id": 1 }}'
          - '{"title": "dancing with the stars", "body": "Dancing with the Stars is a popular TV show"}'
          - '{"index": {"_id": 2 }}'
          - '{"title": "dance with star", "body": "Dancing with the Stars is a popular TV show"}'

---
# Runs a quoted phrase query (phrase_slop 2) over title and title.english and
# highlights "title" with matched_fields: ["title.english"]. Both documents are
# expected to match, each with its whole title wrapped in a single <em> tag.
"Highlight based on single masked field":
- do:
search:
index: index1
body:
query:
query_string:
query: "\"dancing with the stars\""
fields: ["title^5", "title.english"]
phrase_slop: 2
highlight:
fields:
title:
matched_fields: ["title.english"]

- length: {hits.hits: 2}
- match: {hits.hits.0.highlight.title.0: "<em>dancing with the stars</em>"}
- match: {hits.hits.1.highlight.title.0: "<em>dance with star</em>"}

---
# Runs the term query "dan with star" over title, title.ngram and title.english
# and highlights "title" with matched_fields: ["title.ngram", "title.english"].
# Both documents are expected to match with each matching term highlighted
# individually; "the" in the second hit stays unhighlighted.
"Highlight based on multiple masked fields":
- do:
search:
index: index1
body:
query:
query_string:
query: "dan with star"
fields: ["title^5", "title.ngram", "title.english"]
highlight:
fields:
title:
matched_fields: ["title.ngram", "title.english"]

- length: {hits.hits: 2}
- match: {hits.hits.0.highlight.title.0: "<em>dance</em> <em>with</em> <em>star</em>" }
- match: {hits.hits.1.highlight.title.0: "<em>dancing</em> <em>with</em> the <em>stars</em>"}
Original file line number Diff line number Diff line change
Expand Up @@ -1037,14 +1037,19 @@ public void testFVHManyMatches() throws Exception {
}

public void testMatchedFieldsFvhRequireFieldMatch() throws Exception {
checkMatchedFieldsCase(true);
checkMatchedFieldsCase(true, "fvh");
}

public void testMatchedFieldsFvhNoRequireFieldMatch() throws Exception {
checkMatchedFieldsCase(false);
checkMatchedFieldsCase(false, "fvh");
}

private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception {
/**
 * Runs the shared matched_fields scenario with the highlighter type set to "unified".
 * The require_field_match flag is passed as a random boolean.
 */
public void testMatchedFieldsUnified() throws Exception {
// unified highlighter doesn't support require_field_match when matched fields are used
checkMatchedFieldsCase(randomBoolean(), "unified");
}

private void checkMatchedFieldsCase(boolean requireFieldMatch, String type) throws Exception {
Settings.Builder settings = Settings.builder();
settings.put(indexSettings());
settings.put("index.analysis.analyzer.mock_english.tokenizer", "standard");
Expand Down Expand Up @@ -1104,7 +1109,7 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
Field fooField = new Field("foo").numOfFragments(1)
.order("score")
.fragmentSize(25)
.highlighterType("fvh")
.highlighterType(type)
.requireFieldMatch(requireFieldMatch);
SearchRequestBuilder req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));

Expand All @@ -1125,7 +1130,7 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
fooField = new Field("foo").numOfFragments(1)
.order("score")
.fragmentSize(25)
.highlighterType("fvh")
.highlighterType(type)
.requireFieldMatch(requireFieldMatch);
fooField.matchedFields("foo", "foo.plain");
req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
Expand All @@ -1144,20 +1149,22 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
fooField = new Field("foo").numOfFragments(1)
.order("score")
.fragmentSize(25)
.highlighterType("fvh")
.highlighterType(type)
.requireFieldMatch(requireFieldMatch);
fooField.matchedFields("foo.plain");
req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
// unified highlighter always keeps the original field in the list of matched fields
String expectedHighlight0 = type.equals("unified") ? "<em>running</em> with <em>scissors</em>" : "<em>running</em> with scissors";
assertResponse(
req.setQuery(queryStringQuery("foo.plain:running scissors").field("foo")),
response -> assertHighlight(response, 0, "foo", 0, equalTo("<em>running</em> with scissors"))
response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight0))
);

// Now make sure boosted fields don't blow up when matched fields is both the subfield and stored field.
fooField = new Field("foo").numOfFragments(1)
.order("score")
.fragmentSize(25)
.highlighterType("fvh")
.highlighterType(type)
.requireFieldMatch(requireFieldMatch);
fooField.matchedFields("foo", "foo.plain");
req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
Expand All @@ -1184,40 +1191,43 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
response -> assertHighlight(response, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"))
);

// Unified and FVH highlighters break text into fragments differently
String expectedHighlight1 = type.equals("unified") ? "junk junk junk <em>cats</em> junk" : "junk junk <em>cats</em> junk junk";

// But we use the best found score when sorting fragments
assertResponse(
req.setQuery(queryStringQuery("cats foo.plain:cats^5").field("foo")),
response -> assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"))
response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1))
);

// which can also be written by searching on the subfield
assertResponse(
req.setQuery(queryStringQuery("cats").field("foo").field("foo.plain", 5)),
response -> assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"))
response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1))
);

// Speaking of two fields, you can have two fields, only one of which has matchedFields enabled
QueryBuilder twoFieldsQuery = queryStringQuery("cats").field("foo").field("foo.plain", 5).field("bar").field("bar.plain", 5);
Field barField = new Field("bar").numOfFragments(1)
.order("score")
.fragmentSize(25)
.highlighterType("fvh")
.highlighterType(type)
.requireFieldMatch(requireFieldMatch);
assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> {
assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"));
assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1));
assertHighlight(response, 0, "bar", 0, equalTo("<em>cat</em> <em>cat</em> junk junk junk junk"));
});
// And you can enable matchedField highlighting on both
barField.matchedFields("bar", "bar.plain");
assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> {
assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"));
assertHighlight(response, 0, "bar", 0, equalTo("junk junk <em>cats</em> junk junk"));
assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1));
assertHighlight(response, 0, "bar", 0, equalTo(expectedHighlight1));
});

// Setting a matchedField that isn't searched/doesn't exist is simply ignored.
barField.matchedFields("bar", "candy");
assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> {
assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"));
assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1));
assertHighlight(response, 0, "bar", 0, equalTo("<em>cat</em> <em>cat</em> junk junk junk junk"));
});

Expand All @@ -1233,12 +1243,15 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
);

// If the stored field is found but the matched field isn't then you don't get a result either.
fooField.matchedFields("bar.plain");
assertResponse(
req.setQuery(queryStringQuery("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain"))
.highlighter(new HighlightBuilder().field(fooField).field(barField)),
response -> assertThat(response.getHits().getAt(0).getHighlightFields(), not(hasKey("foo")))
);
// only applicable to fvh highlighter, as unified highlighter always keeps the original field in the list of matched fields
if (type.equals("fvh")) {
fooField.matchedFields("bar.plain");
assertResponse(
req.setQuery(queryStringQuery("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain"))
.highlighter(new HighlightBuilder().field(fooField).field(barField)),
response -> assertThat(response.getHits().getAt(0).getHighlightFields(), not(hasKey("foo")))
);
}

// But if you add the stored field to the list of matched fields then you'll get a result again
fooField.matchedFields("foo", "bar.plain");
Expand All @@ -1261,11 +1274,22 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
}
);

assertFailures(
req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain")),
RestStatus.INTERNAL_SERVER_ERROR,
containsString("IndexOutOfBoundsException")
);
if (type.equals("unified")) {
assertResponse(
req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain"))
.highlighter(new HighlightBuilder().field(fooField).field(barField)),
response -> {
assertHighlight(response, 0, "bar", 0, equalTo("<em>result</em>"));
}
);
} else {
assertFailures(
req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain"))
.highlighter(new HighlightBuilder().field(fooField).field(barField)),
RestStatus.INTERNAL_SERVER_ERROR,
containsString("IndexOutOfBoundsException")
);
}
}

public void testFastVectorHighlighterManyDocs() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;

import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
Expand Down Expand Up @@ -142,8 +144,15 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) {
}
Builder builder = UnifiedHighlighter.builder(searcher, analyzer);
builder.withBreakIterator(() -> breakIterator);
builder.withFieldMatcher(fieldMatcher(fieldContext));
builder.withFormatter(passageFormatter);

Set<String> matchedFields = fieldContext.field.fieldOptions().matchedFields();
// Masked fields require that the default field matcher is used
if (matchedFields != null && matchedFields.isEmpty() == false) {
builder.withMaskedFieldsFunc((fieldName) -> fieldName.equals(fieldContext.fieldName) ? matchedFields : Collections.emptySet());
} else {
builder.withFieldMatcher(fieldMatcher(fieldContext));
}
return new CustomUnifiedHighlighter(
builder,
offsetSource,
Expand Down

0 comments on commit 7c79a3e

Please sign in to comment.