-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_exact_matches.py
94 lines (77 loc) · 2.77 KB
/
test_exact_matches.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import re
import pytest
from home.forms.search import SearchForm
from home.service.search import SearchService
# Matches one "word" of a title: a maximal run of characters that are not an
# underscore, hyphen or whitespace. Note that "." is not a separator, so a
# dotted name such as "schema.table" stays a single token.
WORD_TOKEN = re.compile(r"[^_\-\s]+")
# Upper bound on the token overlap between two titles, measured as
# len(intersection) / len(union) of their token sets. assert_no_fuzzy_match
# fails when a pair of titles exceeds this ratio.
OVERLAP_THRESHOLD = 0.75
@pytest.mark.slow
@pytest.mark.datahub
@pytest.mark.parametrize(
    "query,expected_urn",
    [
        (
            "dummy_pnc_ids_to_send",
            "urn:li:dataset:(urn:li:dataPlatform:dbt,cadet.awsdatacatalog.bold_rr_pnc_ids.dummy_pnc_ids_to_send,PROD)",  # noqa E501
        ),
        (
            "Accommodation on the first night following release",
            "urn:li:chart:(justice-data,accommodation-on-release)",
        ),
        (
            "ns_postcode_lookup_latest_2011census",
            "urn:li:dataset:(urn:li:dataPlatform:dbt,cadet.awsdatacatalog.common_lookup.ns_postcode_lookup_latest_2011census,PROD)",  # noqa E501
        ),
    ],
)
def test_exact_title_match(query, expected_urn):
    """
    Searching for an entity's exact name should return that entity first.

    Each case pairs a search query with the URN expected at the top of the
    first page of results.
    """
    search_form = SearchForm({"query": query})
    assert search_form.is_valid()

    search_results = SearchService(form=search_form, page="1").results

    # There must be at least one hit, and the best hit must be the exact match.
    assert search_results.total_results >= 1
    top_hit = search_results.page_results[0]
    assert top_hit.urn == expected_urn
@pytest.mark.slow
@pytest.mark.datahub
@pytest.mark.parametrize(
    "query",
    # With a single argname pytest passes each value through unchanged, so
    # these must be plain strings. Wrapping each one in a 1-tuple would hand
    # the test a tuple instead of the query text.
    (
        "prison_population_history.chunk_assignment",
        "Accommodation on the first night following release",
        "vcms_activations",
        "ns_postcode_lookup_latest_2011census",
    ),
)
def test_no_duplicates(query):
    """
    Test that there are no entries with similar names in the first page

    Runs the search for ``query``, collects the fully qualified name of every
    result on page 1 and fails if any two of them are identical or
    near-identical (see assert_no_fuzzy_match).
    """
    form = SearchForm({"query": query})
    assert form.is_valid()
    service = SearchService(form=form, page="1")
    results = service.results
    titles = [result.fully_qualified_name for result in results.page_results]
    assert_no_fuzzy_match(titles)
def assert_no_fuzzy_match(titles):
    """
    Assert that no two titles are identical or look alike.

    Titles are tokenised with WORD_TOKEN. A pair of titles fails the check if
    the titles are equal, or if the number of tokens common to both divided by
    the number of distinct tokens across both exceeds OVERLAP_THRESHOLD.

    :param titles: iterable of title strings, in result order.
    :raises AssertionError: for the first offending pair found; the message
        reports both titles with their 1-based positions.
    """
    titles = list(titles)

    for i, title1 in enumerate(titles, 1):
        # Tokenised lazily (after the equality check, as before) and only
        # once per title instead of once per comparison.
        tokens1 = None

        # Both checks are symmetric, so each unordered pair is visited once:
        # title1 is only compared with the titles that come after it.
        for j, title2 in enumerate(titles[i:], i + 1):
            assert title1 != title2, (
                f'"{title1}" @ position {i} duplicates "{title2}" @ position {j}'
            )

            if tokens1 is None:
                tokens1 = set(WORD_TOKEN.findall(title1))
            if not tokens1:
                # A title with no tokens cannot overlap with anything; this
                # also keeps the union below from ever being empty.
                continue

            tokens2 = set(WORD_TOKEN.findall(title2))
            intersection = tokens1 & tokens2
            union = tokens1 | tokens2
            assert len(intersection) / len(union) <= OVERLAP_THRESHOLD, (
                f'"{title1}" @ position {i} is similar to "{title2}" @ position {j}'
            )