From 2337eb05a03ea6943e59079ff7c95313a5522239 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Thu, 9 May 2024 10:35:29 -0400 Subject: [PATCH] Unified Highlighter to support matched_fields (#107640) Add support to the Unified highlighter to combine matches on multiple fields to highlight a single field: "matched_fields". Based on Lucene PR: https://github.com/apache/lucene/pull/13268 Lucene PR is based on the concept of masked fields where masked fields are different from the original highlighted field. This PR in Elasticsearch uses the already existing highlighter parameter "matched_fields". --- docs/changelog/107640.yaml | 6 + .../search-your-data/highlighting.asciidoc | 150 ++---- .../highlighting-multi-fields-widget.asciidoc | 40 ++ .../highlighting-multi-fields.asciidoc | 465 ++++++++++++++++++ .../rest/yaml/CcsCommonYamlTestSuiteIT.java | 1 + .../60_unified_matched_fields.yml | 108 ++++ .../highlight/HighlighterSearchIT.java | 76 ++- .../org/elasticsearch/rest/RestFeatures.java | 8 + .../highlight/DefaultHighlighter.java | 18 +- 9 files changed, 723 insertions(+), 149 deletions(-) create mode 100644 docs/changelog/107640.yaml create mode 100644 docs/reference/tab-widgets/highlighting-multi-fields-widget.asciidoc create mode 100644 docs/reference/tab-widgets/highlighting-multi-fields.asciidoc create mode 100644 rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/60_unified_matched_fields.yml diff --git a/docs/changelog/107640.yaml b/docs/changelog/107640.yaml new file mode 100644 index 0000000000000..9871943481f20 --- /dev/null +++ b/docs/changelog/107640.yaml @@ -0,0 +1,6 @@ +pr: 107640 +summary: "Unified Highlighter to support matched_fields " +area: Highlighting +type: enhancement +issues: + - 5172 diff --git a/docs/reference/search/search-your-data/highlighting.asciidoc b/docs/reference/search/search-your-data/highlighting.asciidoc index 55e737eb00197..7ee13d971b035 100644 --- 
a/docs/reference/search/search-your-data/highlighting.asciidoc +++ b/docs/reference/search/search-your-data/highlighting.asciidoc @@ -46,8 +46,9 @@ for each field. The `unified` highlighter uses the Lucene Unified Highlighter. This highlighter breaks the text into sentences and uses the BM25 algorithm to score individual sentences as if they were documents in the corpus. It also supports -accurate phrase and multi-term (fuzzy, prefix, regex) highlighting. This is the -default highlighter. +accurate phrase and multi-term (fuzzy, prefix, regex) highlighting. The `unified` +highlighter can combine matches from multiple fields into one result (see +`matched_fields`). This is the default highlighter. [discrete] [[plain-highlighter]] @@ -199,10 +200,27 @@ include the search query as part of the `highlight_query`. matched_fields:: Combine matches on multiple fields to highlight a single field. This is most intuitive for multifields that analyze the same string in different -ways. All `matched_fields` must have `term_vector` set to -`with_positions_offsets`, but only the field to which -the matches are combined is loaded so only that field benefits from having -`store` set to `yes`. Only valid for the `fvh` highlighter. +ways. Valid for the `unified` and `fvh` highlighters, but the behavior of this +option is different for each highlighter. + +For the `unified` highlighter: + +- `matched_fields` array should **not** contain the original field that you want to highlight. The +original field will be automatically added to the `matched_fields`, and there is no +way to exclude its matches when highlighting. +- `matched_fields` and the original field can be indexed with different strategies (with or +without `offsets`, with or without `term_vectors`). 
+- only the original field to which the matches are combined is loaded so only that field +benefits from having `store` set to `yes`. + +For the `fvh` highlighter: + +- `matched_fields` array may or may not contain the original field +depending on your needs. If you want to include the original field's matches in +highlighting, add it to the `matched_fields` array. +- all `matched_fields` must have `term_vector` set to `with_positions_offsets` +- only the original field to which the matches are combined is loaded so only that field +benefits from having `store` set to `yes`. no_match_size:: The amount of text you want to return from the beginning of the field if there are no matching fragments to highlight. Defaults @@ -498,100 +516,14 @@ GET /_search [discrete] === Combine matches on multiple fields -WARNING: This is only supported by the `fvh` highlighter +WARNING: Supported by the `unified` and `fvh` highlighters. -The Fast Vector Highlighter can combine matches on multiple fields to +The Unified and Fast Vector Highlighters can combine matches on multiple fields to highlight a single field. This is most intuitive for multifields that -analyze the same string in different ways. All `matched_fields` must have -`term_vector` set to `with_positions_offsets` but only the field to which -the matches are combined is loaded so only that field would benefit from having -`store` set to `yes`. - -In the following examples, `comment` is analyzed by the `english` -analyzer and `comment.plain` is analyzed by the `standard` analyzer. 
- -[source,console] --------------------------------------------------- -GET /_search -{ - "query": { - "query_string": { - "query": "comment.plain:running scissors", - "fields": [ "comment" ] - } - }, - "highlight": { - "order": "score", - "fields": { - "comment": { - "matched_fields": [ "comment", "comment.plain" ], - "type": "fvh" - } - } - } -} --------------------------------------------------- -// TEST[setup:my_index] - -The above matches both "run with scissors" and "running with scissors" -and would highlight "running" and "scissors" but not "run". If both -phrases appear in a large document then "running with scissors" is -sorted above "run with scissors" in the fragments list because there -are more matches in that fragment. - -[source,console] --------------------------------------------------- -GET /_search -{ - "query": { - "query_string": { - "query": "running scissors", - "fields": ["comment", "comment.plain^10"] - } - }, - "highlight": { - "order": "score", - "fields": { - "comment": { - "matched_fields": ["comment", "comment.plain"], - "type" : "fvh" - } - } - } -} --------------------------------------------------- -// TEST[setup:my_index] +analyze the same string in different ways. -The above highlights "run" as well as "running" and "scissors" but -still sorts "running with scissors" above "run with scissors" because -the plain match ("running") is boosted. 
+include::{es-ref-dir}/tab-widgets/highlighting-multi-fields-widget.asciidoc[] -[source,console] --------------------------------------------------- -GET /_search -{ - "query": { - "query_string": { - "query": "running scissors", - "fields": [ "comment", "comment.plain^10" ] - } - }, - "highlight": { - "order": "score", - "fields": { - "comment": { - "matched_fields": [ "comment.plain" ], - "type": "fvh" - } - } - } -} --------------------------------------------------- -// TEST[setup:my_index] - -The above query wouldn't highlight "run" or "scissor" but shows that -it is just fine not to list the field to which the matches are combined -(`comment`) in the matched fields. [NOTE] Technically it is also fine to add fields to `matched_fields` that @@ -599,32 +531,6 @@ don't share the same underlying string as the field to which the matches are combined. The results might not make much sense and if one of the matches is off the end of the text then the whole query will fail. -[NOTE] -=================================================================== -There is a small amount of overhead involved with setting -`matched_fields` to a non-empty array so always prefer -[source,js] --------------------------------------------------- - "highlight": { - "fields": { - "comment": {} - } - } --------------------------------------------------- -// NOTCONSOLE -to -[source,js] --------------------------------------------------- - "highlight": { - "fields": { - "comment": { - "matched_fields": ["comment"], - "type" : "fvh" - } - } - } --------------------------------------------------- -// NOTCONSOLE =================================================================== diff --git a/docs/reference/tab-widgets/highlighting-multi-fields-widget.asciidoc b/docs/reference/tab-widgets/highlighting-multi-fields-widget.asciidoc new file mode 100644 index 0000000000000..e307ea002f542 --- /dev/null +++ b/docs/reference/tab-widgets/highlighting-multi-fields-widget.asciidoc @@ -0,0 +1,40 @@ +++++ +
+
+ + +
+
+++++ + +include::highlighting-multi-fields.asciidoc[tag=unified] + +++++ +
+ +
+++++ diff --git a/docs/reference/tab-widgets/highlighting-multi-fields.asciidoc b/docs/reference/tab-widgets/highlighting-multi-fields.asciidoc new file mode 100644 index 0000000000000..5af85f33c99fa --- /dev/null +++ b/docs/reference/tab-widgets/highlighting-multi-fields.asciidoc @@ -0,0 +1,465 @@ +// tag::unified[] + +In the following examples, `comment` is analyzed by the `standard` +analyzer and `comment.english` is analyzed by the `english` analyzer. + +[source,console] +-------------------------------------------------- +PUT index1 +{ + "mappings": { + "properties": { + "comment": { + "type": "text", + "analyzer": "standard", + "fields": { + "english": { + "type": "text", + "analyzer": "english" + } + } + } + } + } +} +-------------------------------------------------- + + +[source,console] +-------------------------------------------------- +PUT index1/_bulk?refresh=true +{"index": {"_id": "doc1" }} +{"comment": "run with scissors"} +{ "index" : {"_id": "doc2"} } +{"comment": "running with scissors"} + +-------------------------------------------------- +// TEST[continued] + + +[source,console] +-------------------------------------------------- +GET index1/_search +{ + "query": { + "query_string": { + "query": "running with scissors", + "fields": ["comment", "comment.english"] + } + }, + "highlight": { + "order": "score", + "fields": { + "comment": {} + } + } +} +-------------------------------------------------- +// TEST[continued] + +The above request matches both "run with scissors" and "running with scissors" +and would highlight "running" and "scissors" but not "run". If both +phrases appear in a large document then "running with scissors" is +sorted above "run with scissors" in the fragments list because there +are more matches in that fragment. + +[source,console-result] +---- +{ + ... 
+ "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score": 1.0577903, + "hits" : [ + { + "_index" : "index1", + "_id" : "doc2", + "_score" : 1.0577903, + "_source" : { + "comment" : "running with scissors" + }, + "highlight" : { + "comment" : [ + "running with scissors" + ] + } + }, + { + "_index" : "index1", + "_id" : "doc1", + "_score" : 0.36464313, + "_source" : { + "comment" : "run with scissors" + }, + "highlight" : { + "comment" : [ + "run with scissors" + ] + } + } + ] + } +} +---- +// TESTRESPONSE[s/\.\.\./"took" : $body.took,"timed_out" : $body.timed_out,"_shards" : $body._shards,/] + +The below request highlights "run" as well as "running" and "scissors", +because the `matched_fields` parameter instructs that for highlighting +we need to combine matches from the `comment.english` field with +the matches from the original `comment` field. + +[source,console] +-------------------------------------------------- +GET index1/_search +{ + "query": { + "query_string": { + "query": "running with scissors", + "fields": ["comment", "comment.english"] + } + }, + "highlight": { + "order": "score", + "fields": { + "comment": { + "matched_fields": ["comment.english"] + } + } + } +} +-------------------------------------------------- +// TEST[continued] + +[source,console-result] +---- +{ + ... 
+ "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score": 1.0577903, + "hits" : [ + { + "_index" : "index1", + "_id" : "doc2", + "_score" : 1.0577903, + "_source" : { + "comment" : "running with scissors" + }, + "highlight" : { + "comment" : [ + "running with scissors" + ] + } + }, + { + "_index" : "index1", + "_id" : "doc1", + "_score" : 0.36464313, + "_source" : { + "comment" : "run with scissors" + }, + "highlight" : { + "comment" : [ + "run with scissors" + ] + } + } + ] + } +} +---- +// TESTRESPONSE[s/\.\.\./"took" : $body.took,"timed_out" : $body.timed_out,"_shards" : $body._shards,/] + +// end::unified[] + + + + + +// tag::fvh[] + +In the following examples, `comment` is analyzed by the `standard` +analyzer and `comment.english` is analyzed by the `english` analyzer. + +[source,console] +-------------------------------------------------- +PUT index2 +{ + "mappings": { + "properties": { + "comment": { + "type": "text", + "analyzer": "standard", + "term_vector": "with_positions_offsets", + "fields": { + "english": { + "type": "text", + "analyzer": "english", + "term_vector": "with_positions_offsets" + } + } + } + } + } +} +-------------------------------------------------- + + +[source,console] +-------------------------------------------------- +PUT index2/_bulk?refresh=true +{"index": {"_id": "doc1" }} +{"comment": "run with scissors"} +{ "index" : {"_id": "doc2"} } +{"comment": "running with scissors"} + +-------------------------------------------------- +// TEST[continued] + + +[source,console] +-------------------------------------------------- +GET index2/_search +{ + "query": { + "query_string": { + "query": "running with scissors", + "fields": ["comment", "comment.english"] + } + }, + "highlight": { + "order": "score", + "fields": { + "comment": { + "type" : "fvh" + } + } + } +} +-------------------------------------------------- +// TEST[continued] + +The above request matches both "run with scissors" and "running with 
scissors" +and would highlight "running" and "scissors" but not "run". If both +phrases appear in a large document then "running with scissors" is +sorted above "run with scissors" in the fragments list because there +are more matches in that fragment. + +[source,console-result] +---- +{ + ... + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score": 1.0577903, + "hits" : [ + { + "_index" : "index2", + "_id" : "doc2", + "_score" : 1.0577903, + "_source" : { + "comment" : "running with scissors" + }, + "highlight" : { + "comment" : [ + "running with scissors" + ] + } + }, + { + "_index" : "index2", + "_id" : "doc1", + "_score" : 0.36464313, + "_source" : { + "comment" : "run with scissors" + }, + "highlight" : { + "comment" : [ + "run with scissors" + ] + } + } + ] + } +} +---- +// TESTRESPONSE[s/\.\.\./"took" : $body.took,"timed_out" : $body.timed_out,"_shards" : $body._shards,/] + +The below request highlights "run" as well as "running" and "scissors", +because the `matched_fields` parameter instructs that for highlighting +we need to combine matches from the `comment` and `comment.english` fields. + +[source,console] +-------------------------------------------------- +GET index2/_search +{ + "query": { + "query_string": { + "query": "running with scissors", + "fields": ["comment", "comment.english"] + } + }, + "highlight": { + "order": "score", + "fields": { + "comment": { + "type" : "fvh", + "matched_fields": ["comment", "comment.english"] + } + } + } +} +-------------------------------------------------- +// TEST[continued] + +[source,console-result] +---- +{ + ... 
+ "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score": 1.0577903, + "hits" : [ + { + "_index" : "index2", + "_id" : "doc2", + "_score" : 1.0577903, + "_source" : { + "comment" : "running with scissors" + }, + "highlight" : { + "comment" : [ + "running with scissors" + ] + } + }, + { + "_index" : "index2", + "_id" : "doc1", + "_score" : 0.36464313, + "_source" : { + "comment" : "run with scissors" + }, + "highlight" : { + "comment" : [ + "run with scissors" + ] + } + } + ] + } +} +---- +// TESTRESPONSE[s/\.\.\./"took" : $body.took,"timed_out" : $body.timed_out,"_shards" : $body._shards,/] + +The below request wouldn't highlight "run" or "scissor" but shows that +it is just fine not to list the field to which the matches are combined +(`comment.english`) in the matched fields. + +[source,console] +-------------------------------------------------- +GET index2/_search +{ + "query": { + "query_string": { + "query": "running with scissors", + "fields": ["comment", "comment.english"] + } + }, + "highlight": { + "order": "score", + "fields": { + "comment.english": { + "type" : "fvh", + "matched_fields": ["comment"] + } + } + } +} +-------------------------------------------------- +// TEST[continued] + + +[source,console-result] +---- +{ + ... 
+ "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score": 1.0577903, + "hits" : [ + { + "_index" : "index2", + "_id" : "doc2", + "_score" : 1.0577903, + "_source" : { + "comment" : "running with scissors" + }, + "highlight" : { + "comment.english" : [ + "running with scissors" + ] + } + }, + { + "_index" : "index2", + "_id" : "doc1", + "_score" : 0.36464313, + "_source" : { + "comment" : "run with scissors" + }, + "highlight" : { + "comment.english" : [ + "run with scissors" + ] + } + } + ] + } +} +---- +// TESTRESPONSE[s/\.\.\./"took" : $body.took,"timed_out" : $body.timed_out,"_shards" : $body._shards,/] + +[NOTE] +=================================================================== +There is a small amount of overhead involved with setting +`matched_fields` to a non-empty array so always prefer +[source,js] +-------------------------------------------------- + "highlight": { + "fields": { + "comment": {} + } + } +-------------------------------------------------- +// NOTCONSOLE +to +[source,js] +-------------------------------------------------- + "highlight": { + "fields": { + "comment": { + "matched_fields": ["comment"], + "type" : "fvh" + } + } + } +-------------------------------------------------- +// NOTCONSOLE + +// end::fvh[] diff --git a/qa/ccs-common-rest/src/yamlRestTest/java/org/elasticsearch/test/rest/yaml/CcsCommonYamlTestSuiteIT.java b/qa/ccs-common-rest/src/yamlRestTest/java/org/elasticsearch/test/rest/yaml/CcsCommonYamlTestSuiteIT.java index a8cff14ff6220..49db5e3a1cd99 100644 --- a/qa/ccs-common-rest/src/yamlRestTest/java/org/elasticsearch/test/rest/yaml/CcsCommonYamlTestSuiteIT.java +++ b/qa/ccs-common-rest/src/yamlRestTest/java/org/elasticsearch/test/rest/yaml/CcsCommonYamlTestSuiteIT.java @@ -79,6 +79,7 @@ public class CcsCommonYamlTestSuiteIT extends ESClientYamlSuiteTestCase { private static LocalClusterConfigProvider commonClusterConfig = cluster -> cluster.module("x-pack-async-search") .module("aggregations") + 
.module("analysis-common") .module("mapper-extras") .module("vector-tile") .module("x-pack-analytics") diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/60_unified_matched_fields.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/60_unified_matched_fields.yml new file mode 100644 index 0000000000000..a0abff2d6726f --- /dev/null +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/60_unified_matched_fields.yml @@ -0,0 +1,108 @@ +setup: + - requires: + cluster_features: 'unified_highlighter_matched_fields' + reason: 'test requires unified highlighter to support matched_fields' + + - do: + indices.create: + index: index1 + body: + settings: + index: + number_of_shards: 1 + number_of_replicas: 0 + analysis: + filter: + my_edge_ngram: + type: edge_ngram + min_gram: 2 + max_gram: 20 + analyzer: + my_analyzer: + tokenizer: whitespace + filter: [ my_edge_ngram ] + mappings: + properties: + title: + type: text + fields: + english: + type: text + analyzer: english + ngram: + type: text + analyzer: my_analyzer + body : + type: text + + + - do: + bulk: + refresh: true + index: index1 + body: + - '{"index": {"_id": 1 }}' + - '{"title": "dancing with the stars", "body": "Dancing with the Stars is a popular TV show"}' + - '{"index": {"_id": 2 }}' + - '{"title": "dance with star", "body": "Dancing with the Stars is a popular TV show"}' + +--- +"Highlight based on single masked field": + - do: + search: + index: index1 + body: + query: + query_string: + query: "\"dancing with the stars\"" + fields: ["title^5", "title.english"] + phrase_slop: 2 + highlight: + fields: + title: + matched_fields: ["title.english"] + + - length: {hits.hits: 2} + - match: {hits.hits.0.highlight.title.0: "dancing with the stars"} + - match: {hits.hits.1.highlight.title.0: "dance with star"} + +--- +"Highlight based on multiple masked fields": + - do: + search: + index: index1 + body: + query: + 
query_string: + query: "dan with star" + fields: ["title^5", "title.ngram", "title.english"] + highlight: + fields: + title: + matched_fields: ["title.ngram", "title.english"] + + - length: {hits.hits: 2} + - match: {hits.hits.0.highlight.title.0: "dance with star" } + - match: {hits.hits.1.highlight.title.0: "dancing with the stars"} + + +--- +"Highlight using matched_fields is not allowed when require_field_match is set to false": + - do: + catch: bad_request + search: + index: index1 + body: + query: + query_string: + query: "dan with star" + fields: ["title^5", "title.ngram", "title.english"] + highlight: + require_field_match: false + fields: + title: + matched_fields: ["title.ngram", "title.english"] + + - match: { status: 400 } + - match: { error.root_cause.0.type: "illegal_argument_exception" } + - match: { error.root_cause.0.reason: "Matched fields are not supported when [require_field_match] is set to [false]" } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index ab72dbd4db707..0a6fceea9a3f1 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -1037,14 +1037,19 @@ public void testFVHManyMatches() throws Exception { } public void testMatchedFieldsFvhRequireFieldMatch() throws Exception { - checkMatchedFieldsCase(true); + checkMatchedFieldsCase(true, "fvh"); } public void testMatchedFieldsFvhNoRequireFieldMatch() throws Exception { - checkMatchedFieldsCase(false); + checkMatchedFieldsCase(false, "fvh"); } - private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception { + public void testMatchedFieldsUnified() throws Exception { + // unified highlighter requires 
that "require_field_match" is true when matched fields are used + checkMatchedFieldsCase(true, "unified"); + } + + private void checkMatchedFieldsCase(boolean requireFieldMatch, String type) throws Exception { Settings.Builder settings = Settings.builder(); settings.put(indexSettings()); settings.put("index.analysis.analyzer.mock_english.tokenizer", "standard"); @@ -1104,7 +1109,7 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception Field fooField = new Field("foo").numOfFragments(1) .order("score") .fragmentSize(25) - .highlighterType("fvh") + .highlighterType(type) .requireFieldMatch(requireFieldMatch); SearchRequestBuilder req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField)); @@ -1125,7 +1130,7 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception fooField = new Field("foo").numOfFragments(1) .order("score") .fragmentSize(25) - .highlighterType("fvh") + .highlighterType(type) .requireFieldMatch(requireFieldMatch); fooField.matchedFields("foo", "foo.plain"); req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField)); @@ -1144,20 +1149,22 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception fooField = new Field("foo").numOfFragments(1) .order("score") .fragmentSize(25) - .highlighterType("fvh") + .highlighterType(type) .requireFieldMatch(requireFieldMatch); fooField.matchedFields("foo.plain"); req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField)); + // unified highlighter always keeps the original field in the list of matched fields + String expectedHighlight0 = type.equals("unified") ? 
"running with scissors" : "running with scissors"; assertResponse( req.setQuery(queryStringQuery("foo.plain:running scissors").field("foo")), - response -> assertHighlight(response, 0, "foo", 0, equalTo("running with scissors")) + response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight0)) ); // Now make sure boosted fields don't blow up when matched fields is both the subfield and stored field. fooField = new Field("foo").numOfFragments(1) .order("score") .fragmentSize(25) - .highlighterType("fvh") + .highlighterType(type) .requireFieldMatch(requireFieldMatch); fooField.matchedFields("foo", "foo.plain"); req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField)); @@ -1184,16 +1191,19 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception response -> assertHighlight(response, 0, "foo", 0, equalTo("running with scissors")) ); + // Unified and FVH highlighters break text into fragments differently + String expectedHighlight1 = type.equals("unified") ? 
"junk junk junk cats junk" : "junk junk cats junk junk"; + // But we use the best found score when sorting fragments assertResponse( req.setQuery(queryStringQuery("cats foo.plain:cats^5").field("foo")), - response -> assertHighlight(response, 0, "foo", 0, equalTo("junk junk cats junk junk")) + response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1)) ); // which can also be written by searching on the subfield assertResponse( req.setQuery(queryStringQuery("cats").field("foo").field("foo.plain", 5)), - response -> assertHighlight(response, 0, "foo", 0, equalTo("junk junk cats junk junk")) + response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1)) ); // Speaking of two fields, you can have two fields, only one of which has matchedFields enabled @@ -1201,23 +1211,23 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception Field barField = new Field("bar").numOfFragments(1) .order("score") .fragmentSize(25) - .highlighterType("fvh") + .highlighterType(type) .requireFieldMatch(requireFieldMatch); assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> { - assertHighlight(response, 0, "foo", 0, equalTo("junk junk cats junk junk")); + assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1)); assertHighlight(response, 0, "bar", 0, equalTo("cat cat junk junk junk junk")); }); // And you can enable matchedField highlighting on both barField.matchedFields("bar", "bar.plain"); assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> { - assertHighlight(response, 0, "foo", 0, equalTo("junk junk cats junk junk")); - assertHighlight(response, 0, "bar", 0, equalTo("junk junk cats junk junk")); + assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1)); + assertHighlight(response, 0, "bar", 0, equalTo(expectedHighlight1)); }); // Setting a matchedField 
that isn't searched/doesn't exist is simply ignored. barField.matchedFields("bar", "candy"); assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> { - assertHighlight(response, 0, "foo", 0, equalTo("junk junk cats junk junk")); + assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1)); assertHighlight(response, 0, "bar", 0, equalTo("cat cat junk junk junk junk")); }); @@ -1233,12 +1243,15 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception ); // If the stored field is found but the matched field isn't then you don't get a result either. - fooField.matchedFields("bar.plain"); - assertResponse( - req.setQuery(queryStringQuery("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain")) - .highlighter(new HighlightBuilder().field(fooField).field(barField)), - response -> assertThat(response.getHits().getAt(0).getHighlightFields(), not(hasKey("foo"))) - ); + // only applicable to fvh highlighter, as unified highlighter always keeps the original field in the list of matched fields + if (type.equals("fvh")) { + fooField.matchedFields("bar.plain"); + assertResponse( + req.setQuery(queryStringQuery("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain")) + .highlighter(new HighlightBuilder().field(fooField).field(barField)), + response -> assertThat(response.getHits().getAt(0).getHighlightFields(), not(hasKey("foo"))) + ); + } // But if you add the stored field to the list of matched fields then you'll get a result again fooField.matchedFields("foo", "bar.plain"); @@ -1261,11 +1274,22 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception } ); - assertFailures( - req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain")), - RestStatus.INTERNAL_SERVER_ERROR, - containsString("IndexOutOfBoundsException") - ); + if (type.equals("unified")) 
{ + assertResponse( + req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain")) + .highlighter(new HighlightBuilder().field(fooField).field(barField)), + response -> { + assertHighlight(response, 0, "bar", 0, equalTo("result")); + } + ); + } else { + assertFailures( + req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain")) + .highlighter(new HighlightBuilder().field(fooField).field(barField)), + RestStatus.INTERNAL_SERVER_ERROR, + containsString("IndexOutOfBoundsException") + ); + } } public void testFastVectorHighlighterManyDocs() throws Exception { diff --git a/server/src/main/java/org/elasticsearch/rest/RestFeatures.java b/server/src/main/java/org/elasticsearch/rest/RestFeatures.java index 73b788d63b2ab..93cbd6376cbde 100644 --- a/server/src/main/java/org/elasticsearch/rest/RestFeatures.java +++ b/server/src/main/java/org/elasticsearch/rest/RestFeatures.java @@ -14,8 +14,16 @@ import org.elasticsearch.rest.action.admin.cluster.RestClusterGetSettingsAction; import java.util.Map; +import java.util.Set; + +import static org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter.UNIFIED_HIGHLIGHTER_MATCHED_FIELDS; public class RestFeatures implements FeatureSpecification { + @Override + public Set getFeatures() { + return Set.of(UNIFIED_HIGHLIGHTER_MATCHED_FIELDS); + } + @Override public Map getHistoricalFeatures() { return Map.of(RestClusterGetSettingsAction.SUPPORTS_GET_SETTINGS_ACTION, Version.V_8_3_0); diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java index da1be48e6b2c0..8f9bca2bbea93 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java @@ -21,6 +21,7 
@@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.text.Text; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; @@ -36,15 +37,20 @@ import java.io.IOException; import java.text.BreakIterator; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.function.Predicate; import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; public class DefaultHighlighter implements Highlighter { + + public static final NodeFeature UNIFIED_HIGHLIGHTER_MATCHED_FIELDS = new NodeFeature("unified_highlighter_matched_fields"); + @Override public boolean canHighlight(MappedFieldType fieldType) { return true; @@ -142,8 +148,18 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) { } Builder builder = UnifiedHighlighter.builder(searcher, analyzer); builder.withBreakIterator(() -> breakIterator); - builder.withFieldMatcher(fieldMatcher(fieldContext)); builder.withFormatter(passageFormatter); + + Set matchedFields = fieldContext.field.fieldOptions().matchedFields(); + if (matchedFields != null && matchedFields.isEmpty() == false) { + // Masked fields require that the default field matcher is used + if (fieldContext.field.fieldOptions().requireFieldMatch() == false) { + throw new IllegalArgumentException("Matched fields are not supported when [require_field_match] is set to [false]"); + } + builder.withMaskedFieldsFunc((fieldName) -> fieldName.equals(fieldContext.fieldName) ? matchedFields : Collections.emptySet()); + } else { + builder.withFieldMatcher(fieldMatcher(fieldContext)); + } return new CustomUnifiedHighlighter( builder, offsetSource,