Unified Highlighter to support matched_fields

Add support to the unified highlighter for combining matches on multiple fields to highlight a single field, via "matched_fields". Based on Lucene PR apache/lucene#13268. The Lucene PR is built on the concept of masked fields, where the masked fields are different from the original highlighted field. This PR in Elasticsearch exposes that through the already existing highlighter parameter "matched_fields". Closes elastic#5172
mayya-sharipova · Apr 18, 2024 · 7c79a3e · 7c79a3e
1 parent daa63d2
commit 7c79a3e
Show file tree

Hide file tree

Showing 3 changed files with 142 additions and 27 deletions.
diff --git a/.../yamlRestTest/resources/rest-api-spec/test/search.highlight/60_unified_matched_fields.yml b/.../yamlRestTest/resources/rest-api-spec/test/search.highlight/60_unified_matched_fields.yml
@@ -0,0 +1,82 @@
+setup:
+  # The version gate must match the release named in the reason: running this
+  # suite against older nodes would exercise a highlighter option they lack.
+  - requires:
+      cluster_features: ["gte_v8.14.0"]
+      reason: matched_fields for unified highlighter were added in 8.14.0
+  # Index with a "title" text field that is also indexed through two subfields
+  # (english analyzer, and a whitespace + edge_ngram analyzer), plus a "body" field.
+  - do:
+      indices.create:
+          index: index1
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_edge_ngram:
+                    type: edge_ngram
+                    min_gram: 2
+                    max_gram: 20
+                analyzer:
+                  my_analyzer:
+                    tokenizer: whitespace
+                    filter: [ my_edge_ngram ]
+            mappings:
+              properties:
+                title:
+                  type: text
+                  fields:
+                    english:
+                      type: text
+                      analyzer: english
+                    ngram:
+                      type: text
+                      analyzer: my_analyzer
+                body:
+                  type: text
+
+
+  # Two documents sharing the same body but with differently inflected titles.
+  - do:
+      bulk:
+        refresh: true
+        index: index1
+        body:
+          - '{"index": {"_id": 1 }}'
+          - '{"title": "dancing with the stars", "body": "Dancing with the Stars is a popular TV show"}'
+          - '{"index": {"_id": 2 }}'
+          - '{"title": "dance with star", "body": "Dancing with the Stars is a popular TV show"}'
+
+---
+# Phrase query over "title" and "title.english"; the "title" field is highlighted
+# with matches taken from the single matched field "title.english".
+# Expects both documents to come back with the whole title highlighted as one phrase.
+# NOTE(review): the second title ("dance with star") does not contain the literal
+# query phrase, so it presumably matches only through the english-analyzed
+# subfield - confirm against the analyzer behaviour.
+"Highlight based on single masked field":
+  - do:
+      search:
+        index: index1
+        body:
+          query:
+            query_string:
+              query: "\"dancing with the stars\""
+              fields: ["title^5", "title.english"]
+              phrase_slop: 2
+          highlight:
+            fields:
+              title:
+                matched_fields: ["title.english"]
+
+  - length: {hits.hits: 2}
+  - match: {hits.hits.0.highlight.title.0: "<em>dancing with the stars</em>"}
+  - match: {hits.hits.1.highlight.title.0: "<em>dance with star</em>"}
+
+---
+# Term query "dan with star" over "title", "title.ngram" and "title.english";
+# the "title" field is highlighted with matches combined from both subfields.
+# Expects each matching term to be wrapped in its own <em> tag, with "the"
+# left unhighlighted in the longer title.
+# NOTE(review): "dan" is only a prefix of "dance"/"dancing", so those highlights
+# presumably come from the edge_ngram subfield - confirm.
+"Highlight based on multiple masked fields":
+  - do:
+      search:
+        index: index1
+        body:
+          query:
+            query_string:
+              query: "dan with star"
+              fields: ["title^5", "title.ngram", "title.english"]
+          highlight:
+            fields:
+              title:
+                matched_fields: ["title.ngram", "title.english"]
+
+  - length: {hits.hits: 2}
+  - match: {hits.hits.0.highlight.title.0: "<em>dance</em> <em>with</em> <em>star</em>" }
+  - match: {hits.hits.1.highlight.title.0: "<em>dancing</em> <em>with</em> the <em>stars</em>"}
diff --git a/...usterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/...usterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
@@ -1037,14 +1037,19 @@ public void testFVHManyMatches() throws Exception {
     }
 
     public void testMatchedFieldsFvhRequireFieldMatch() throws Exception {
-        checkMatchedFieldsCase(true);
+        checkMatchedFieldsCase(true, "fvh");
     }
 
     public void testMatchedFieldsFvhNoRequireFieldMatch() throws Exception {
-        checkMatchedFieldsCase(false);
+        checkMatchedFieldsCase(false, "fvh");
     }
 
-    private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception {
+    /**
+     * Runs the shared matched_fields scenario with the "unified" highlighter type.
+     */
+    public void testMatchedFieldsUnified() throws Exception {
+        // unified highlighter doesn't support require_field_match when matched fields are used,
+        // so the flag is randomized here instead of having separate true/false tests as for fvh
+        checkMatchedFieldsCase(randomBoolean(), "unified");
+    }
+
+    private void checkMatchedFieldsCase(boolean requireFieldMatch, String type) throws Exception {
         Settings.Builder settings = Settings.builder();
         settings.put(indexSettings());
         settings.put("index.analysis.analyzer.mock_english.tokenizer", "standard");
@@ -1104,7 +1109,7 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
         Field fooField = new Field("foo").numOfFragments(1)
             .order("score")
             .fragmentSize(25)
-            .highlighterType("fvh")
+            .highlighterType(type)
             .requireFieldMatch(requireFieldMatch);
         SearchRequestBuilder req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
 
@@ -1125,7 +1130,7 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
         fooField = new Field("foo").numOfFragments(1)
             .order("score")
             .fragmentSize(25)
-            .highlighterType("fvh")
+            .highlighterType(type)
             .requireFieldMatch(requireFieldMatch);
         fooField.matchedFields("foo", "foo.plain");
         req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
@@ -1144,20 +1149,22 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
         fooField = new Field("foo").numOfFragments(1)
             .order("score")
             .fragmentSize(25)
-            .highlighterType("fvh")
+            .highlighterType(type)
             .requireFieldMatch(requireFieldMatch);
         fooField.matchedFields("foo.plain");
         req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
+        // unified highlighter always keeps the original field in the list of matched fields
+        String expectedHighlight0 = type.equals("unified") ? "<em>running</em> with <em>scissors</em>" : "<em>running</em> with scissors";
         assertResponse(
             req.setQuery(queryStringQuery("foo.plain:running scissors").field("foo")),
-            response -> assertHighlight(response, 0, "foo", 0, equalTo("<em>running</em> with scissors"))
+            response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight0))
         );
 
         // Now make sure boosted fields don't blow up when matched fields is both the subfield and stored field.
         fooField = new Field("foo").numOfFragments(1)
             .order("score")
             .fragmentSize(25)
-            .highlighterType("fvh")
+            .highlighterType(type)
             .requireFieldMatch(requireFieldMatch);
         fooField.matchedFields("foo", "foo.plain");
         req = prepareSearch("test").highlighter(new HighlightBuilder().field(fooField));
@@ -1184,40 +1191,43 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
             response -> assertHighlight(response, 0, "foo", 0, equalTo("<em>running</em> with <em>scissors</em>"))
         );
 
+        // Unified and FVH highlighters break text into fragments differently
+        String expectedHighlight1 = type.equals("unified") ? "junk junk junk <em>cats</em> junk" : "junk junk <em>cats</em> junk junk";
+
         // But we use the best found score when sorting fragments
         assertResponse(
             req.setQuery(queryStringQuery("cats foo.plain:cats^5").field("foo")),
-            response -> assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"))
+            response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1))
         );
 
         // which can also be written by searching on the subfield
         assertResponse(
             req.setQuery(queryStringQuery("cats").field("foo").field("foo.plain", 5)),
-            response -> assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"))
+            response -> assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1))
         );
 
         // Speaking of two fields, you can have two fields, only one of which has matchedFields enabled
         QueryBuilder twoFieldsQuery = queryStringQuery("cats").field("foo").field("foo.plain", 5).field("bar").field("bar.plain", 5);
         Field barField = new Field("bar").numOfFragments(1)
             .order("score")
             .fragmentSize(25)
-            .highlighterType("fvh")
+            .highlighterType(type)
             .requireFieldMatch(requireFieldMatch);
         assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> {
-            assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"));
+            assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1));
             assertHighlight(response, 0, "bar", 0, equalTo("<em>cat</em> <em>cat</em> junk junk junk junk"));
         });
         // And you can enable matchedField highlighting on both
         barField.matchedFields("bar", "bar.plain");
         assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> {
-            assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"));
-            assertHighlight(response, 0, "bar", 0, equalTo("junk junk <em>cats</em> junk junk"));
+            assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1));
+            assertHighlight(response, 0, "bar", 0, equalTo(expectedHighlight1));
         });
 
         // Setting a matchedField that isn't searched/doesn't exist is simply ignored.
         barField.matchedFields("bar", "candy");
         assertResponse(req.setQuery(twoFieldsQuery).highlighter(new HighlightBuilder().field(fooField).field(barField)), response -> {
-            assertHighlight(response, 0, "foo", 0, equalTo("junk junk <em>cats</em> junk junk"));
+            assertHighlight(response, 0, "foo", 0, equalTo(expectedHighlight1));
             assertHighlight(response, 0, "bar", 0, equalTo("<em>cat</em> <em>cat</em> junk junk junk junk"));
         });
 
@@ -1233,12 +1243,15 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
         );
 
         // If the stored field is found but the matched field isn't then you don't get a result either.
-        fooField.matchedFields("bar.plain");
-        assertResponse(
-            req.setQuery(queryStringQuery("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain"))
-                .highlighter(new HighlightBuilder().field(fooField).field(barField)),
-            response -> assertThat(response.getHits().getAt(0).getHighlightFields(), not(hasKey("foo")))
-        );
+        // only applicable to fvh highlighter, as unified highlighter always keeps the original field in the list of matched fields
+        if (type.equals("fvh")) {
+            fooField.matchedFields("bar.plain");
+            assertResponse(
+                req.setQuery(queryStringQuery("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain"))
+                    .highlighter(new HighlightBuilder().field(fooField).field(barField)),
+                response -> assertThat(response.getHits().getAt(0).getHighlightFields(), not(hasKey("foo")))
+            );
+        }
 
         // But if you add the stored field to the list of matched fields then you'll get a result again
         fooField.matchedFields("foo", "bar.plain");
@@ -1261,11 +1274,22 @@ private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception
             }
         );
 
-        assertFailures(
-            req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain")),
-            RestStatus.INTERNAL_SERVER_ERROR,
-            containsString("IndexOutOfBoundsException")
-        );
+        if (type.equals("unified")) {
+            assertResponse(
+                req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain"))
+                    .highlighter(new HighlightBuilder().field(fooField).field(barField)),
+                response -> {
+                    assertHighlight(response, 0, "bar", 0, equalTo("<em>result</em>"));
+                }
+            );
+        } else {
+            assertFailures(
+                req.setQuery(queryStringQuery("result").field("foo").field("foo.plain").field("bar").field("bar.plain"))
+                    .highlighter(new HighlightBuilder().field(fooField).field(barField)),
+                RestStatus.INTERNAL_SERVER_ERROR,
+                containsString("IndexOutOfBoundsException")
+            );
+        }
     }
 
     public void testFastVectorHighlighterManyDocs() throws Exception {

diff --git a/...r/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java b/...r/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java
@@ -36,10 +36,12 @@
 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.function.Predicate;
 
 import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
@@ -142,8 +144,15 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) {
         }
         Builder builder = UnifiedHighlighter.builder(searcher, analyzer);
         builder.withBreakIterator(() -> breakIterator);
-        builder.withFieldMatcher(fieldMatcher(fieldContext));
         builder.withFormatter(passageFormatter);
+
+        Set<String> matchedFields = fieldContext.field.fieldOptions().matchedFields();
+        // Masked fields require that the default field matcher is used
+        if (matchedFields != null && matchedFields.isEmpty() == false) {
+            builder.withMaskedFieldsFunc((fieldName) -> fieldName.equals(fieldContext.fieldName) ? matchedFields : Collections.emptySet());
+        } else {
+            builder.withFieldMatcher(fieldMatcher(fieldContext));
+        }
         return new CustomUnifiedHighlighter(
             builder,
             offsetSource,