Skip to content

Commit 7bf0584

Browse files
authored
Merge pull request #291 from uktrade/feature/search-custom-analyzer
Improved search results.
2 parents c754b51 + f02f4d9 commit 7bf0584

File tree

6 files changed

+38
-40
lines changed

6 files changed

+38
-40
lines changed

datahub/search/apps.py

+1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ class SearchConfig(AppConfig):
1313
def ready(self):
1414
"""Configures Elasticsearch default connection."""
1515
elasticsearch.configure_connection()
16+
elasticsearch.configure_index(index=settings.ES_INDEX)
1617

1718
# Makes sure mappings exist in Elasticsearch.
1819
# Those calls are idempotent

datahub/search/elasticsearch.py

+17-4
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,18 @@
11
from collections import defaultdict
2-
from urllib.parse import ParseResult, urlparse # noqa: F401
32

43
import dateutil.parser
54
from django.conf import settings
65
from elasticsearch.helpers import bulk as es_bulk
7-
from elasticsearch_dsl import Search
6+
from elasticsearch_dsl import analysis, Index, Search
87
from elasticsearch_dsl.connections import connections
98
from elasticsearch_dsl.query import Match, MatchPhrase, Q
109

10+
lowercase_keyword_analyzer = analysis.CustomAnalyzer(
11+
'lowercase_keyword_analyzer',
12+
tokenizer='keyword',
13+
filter=['lowercase']
14+
)
15+
1116

1217
def configure_connection():
1318
"""Configure Elasticsearch default connection."""
@@ -18,12 +23,20 @@ def configure_connection():
1823
)
1924

2025

26+
def configure_index(index_name, settings=None):
27+
"""Configures Elasticsearch index."""
28+
index = Index(index_name)
29+
index.analyzer(lowercase_keyword_analyzer)
30+
if settings:
31+
index.settings(**settings)
32+
index.create()
33+
34+
2135
def get_search_term_query(term):
2236
"""Returns search term query."""
2337
return Q('bool', should=[
24-
MatchPhrase(name={'query': term, 'slop': 200}),
38+
MatchPhrase(name_keyword={'query': term, 'boost': 2}),
2539
Match(name={'query': term}),
26-
MatchPhrase(_all={'query': term}),
2740
Match(_all={'query': term}),
2841
])
2942

datahub/search/models.py

+8-5
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from django.conf import settings
2-
from elasticsearch_dsl import Boolean, Date, DocType, Double, Integer, Nested, String
2+
from elasticsearch_dsl import (Boolean, Date, DocType, Double,
3+
Integer, Nested, String)
34

45

56
def _id_name_dict(obj):
@@ -113,7 +114,8 @@ class Company(DocType, MapDBModelToDict):
113114
headquarter_type = Nested(properties={'id': String(index='not_analyzed'), 'name': String()})
114115
id = String(index='not_analyzed')
115116
modified_on = Date()
116-
name = String()
117+
name = String(copy_to='name_keyword')
118+
name_keyword = String(analyzer='lowercase_keyword_analyzer')
117119
one_list_account_owner = Nested(properties={'id': String(index='not_analyzed'),
118120
'first_name': String(copy_to='one_list_account_owner.name'),
119121
'last_name': String(copy_to='one_list_account_owner.name'),
@@ -185,7 +187,8 @@ class Contact(DocType, MapDBModelToDict):
185187
created_on = Date()
186188
modified_on = Date()
187189
id = String(index='not_analyzed')
188-
name = String()
190+
name = String(copy_to='name_keyword')
191+
name_keyword = String(analyzer='lowercase_keyword_analyzer')
189192
title = Nested(properties={'id': String(index='not_analyzed'), 'name': String(copy_to='name')})
190193
first_name = String(copy_to='name')
191194
last_name = String(copy_to='name')
@@ -307,8 +310,8 @@ class InvestmentProject(DocType, MapDBModelToDict):
307310
'id': String(index='not_analyzed'),
308311
'name': String()
309312
}) # InvestmentType
310-
name = String()
311-
description = String()
313+
name = String(copy_to='name_keyword')
314+
name_keyword = String(analyzer='lowercase_keyword_analyzer')
312315
r_and_d_budget = Boolean()
313316
non_fdi_r_and_d_budget = Boolean()
314317
new_tech_to_uk = Boolean()

datahub/search/test/conftest.py

+5-5
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,14 @@
44
from django.core import management
55
from django.db.models.signals import post_save
66
from elasticsearch.helpers.test import get_test_client
7-
from elasticsearch_dsl import Index
87
from pytest import fixture
98

109
from datahub.company.models import Company, Contact
1110
from datahub.company.test.factories import CompanyFactory, ContactFactory
1211
from datahub.core import constants
1312
from datahub.investment.models import InvestmentProject
1413
from datahub.investment.test.factories import InvestmentProjectFactory
15-
from datahub.search import models
14+
from datahub.search import models, elasticsearch
1615
from datahub.search.management.commands import sync_es
1716

1817

@@ -74,9 +73,10 @@ def create_test_index(client, index):
7473
if client.indices.exists(index=index):
7574
client.indices.delete(index)
7675

77-
index = Index(index)
78-
index.create()
79-
76+
elasticsearch.configure_index(index, {
77+
'number_of_shards': 1,
78+
'number_of_replicas': 0,
79+
})
8080

8181
@fixture
8282
def post_save_handlers():

datahub/search/test/test_elasticsearch.py

+6-24
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,9 @@ def test_get_search_term_query():
1313
'should': [
1414
{
1515
'match_phrase': {
16-
'name': {
16+
'name_keyword': {
1717
'query': 'hello',
18-
'slop': 200
18+
'boost': 2
1919
}
2020
}
2121
}, {
@@ -24,12 +24,6 @@ def test_get_search_term_query():
2424
'query': 'hello',
2525
}
2626
}
27-
}, {
28-
'match_phrase': {
29-
'_all': {
30-
'query': 'hello',
31-
}
32-
}
3327
}, {
3428
'match': {
3529
'_all': {
@@ -52,9 +46,9 @@ def test_get_basic_search_query():
5246
'should': [
5347
{
5448
'match_phrase': {
55-
'name': {
49+
'name_keyword': {
5650
'query': 'test',
57-
'slop': 200
51+
'boost': 2
5852
}
5953
}
6054
}, {
@@ -63,12 +57,6 @@ def test_get_basic_search_query():
6357
'query': 'test',
6458
}
6559
}
66-
}, {
67-
'match_phrase': {
68-
'_all': {
69-
'query': 'test',
70-
}
71-
}
7260
}, {
7361
'match': {
7462
'_all': {
@@ -137,9 +125,9 @@ def test_search_by_entity_query():
137125
'should': [
138126
{
139127
'match_phrase': {
140-
'name': {
128+
'name_keyword': {
141129
'query': 'test',
142-
'slop': 200
130+
'boost': 2
143131
}
144132
}
145133
}, {
@@ -148,12 +136,6 @@ def test_search_by_entity_query():
148136
'query': 'test',
149137
}
150138
}
151-
}, {
152-
'match_phrase': {
153-
'_all': {
154-
'query': 'test',
155-
}
156-
}
157139
}, {
158140
'match': {
159141
'_all': {

datahub/search/test/test_views.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import datetime
2-
from unittest import mock, skip
2+
from unittest import mock
33

44
import pytest
55
from elasticsearch_dsl.connections import connections
@@ -217,7 +217,6 @@ def test_search_investment_project_no_filters(self):
217217

218218
assert response.status_code == status.HTTP_400_BAD_REQUEST
219219

220-
@skip('This test fails randomly. Skip until fixed.')
221220
@mock.patch('datahub.core.utils.executor.submit', synchronous_executor_submit)
222221
@mock.patch('django.db.transaction.on_commit', synchronous_transaction_on_commit)
223222
def test_search_results_quality(self):

0 commit comments

Comments
 (0)