Skip to content

Commit 3b7234b

Browse files
LavMatt authored and mitchdawson1982 committed
Fmd 366 add dataset lineage link (#416)
* add upstream and downstream lineage to getDatasetDetails graphql query
* refactor parse_relations() helper to handle more relations
* add upstream and downstream lineage to RelationshipType enum
* update parse_relations() input args
* update parse_relations() input args in search
* add has_lineage and lineage_url to dataset details context
* add lineage link to details_table template
* remove redundant block in query for data product relationships
* return entity name for lineage
* have only 1 RelationshipType for lineage
* simplify `parse_relations()` helper function
* update DatasetDetails to use single lineage type
* align url to rest of table
* update tests
* add default value for url
* design suggestions for lineage label, from Alex and Jess
* spell it right
* suggestions from Mat
1 parent dc12292 commit 3b7234b

File tree

12 files changed

+123
-49
lines changed

12 files changed

+123
-49
lines changed

home/service/details.py

+21
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import os
12
from data_platform_catalogue.entities import RelationshipType
23
from data_platform_catalogue.search_types import MultiSelectFilter, ResultType
34
from django.core.exceptions import ObjectDoesNotExist
5+
from urllib.parse import urlsplit
46

57
from .base import GenericService
68

@@ -86,13 +88,32 @@ def __init__(self, urn: str):
8688
self.context = self._get_context()
8789

8890
def _get_context(self):
91+
split_datahub_url = urlsplit(
92+
os.getenv("CATALOGUE_URL", "https://test-catalogue.gov.uk")
93+
)
94+
8995
return {
9096
"table": self.table_metadata,
9197
"parent_entity": self.parent_entity,
9298
"dataset_parent_type": self.dataset_parent_type,
9399
"h1_value": "Details",
100+
"has_lineage": self.has_lineage(),
101+
"lineage_url": f"{split_datahub_url.scheme}://{split_datahub_url.netloc}/dataset/{self.table_metadata.urn}/Lineage?is_lineage_mode=true&",
94102
}
95103

104+
def has_lineage(self) -> bool:
105+
"""
106+
Inspects the relationships property of the Table model to establish if a
107+
Dataset has any lineage recorded in datahub.
108+
"""
109+
has_lineage = (
110+
len(
111+
self.table_metadata.relationships.get(RelationshipType.DATA_LINEAGE, [])
112+
)
113+
> 0
114+
)
115+
return has_lineage
116+
96117

97118
class ChartDetailsService(GenericService):
98119
def __init__(self, urn: str):

lib/datahub-client/data_platform_catalogue/client/datahub_client.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -232,21 +232,23 @@ def get_table_details(self, urn) -> Table:
232232
created, modified = parse_created_and_modified(properties)
233233
name, display_name, qualified_name = parse_names(response, properties)
234234

235-
# A dataset can't have multiple parents, but if we did
236-
# start to use in that we'd need to change this
237-
if response["container_relations"]["total"] > 0:
238-
relations = parse_relations(
239-
RelationshipType.PARENT, response["container_relations"]
240-
)
241-
else:
242-
relations = {}
235+
lineage_relations = parse_relations(
236+
RelationshipType.DATA_LINEAGE,
237+
[
238+
response.get("downstream_lineage_relations", {}),
239+
response.get("upstream_lineage_relations", {}),
240+
],
241+
)
242+
parent_relations = parse_relations(
243+
RelationshipType.PARENT, [response["parent_container_relations"]]
244+
)
243245
return Table(
244246
urn=urn,
245247
display_name=display_name,
246248
name=name,
247249
fully_qualified_name=qualified_name,
248250
description=properties.get("description", ""),
249-
relationships=relations,
251+
relationships={**lineage_relations, **parent_relations},
250252
domain=domain,
251253
governance=Governance(
252254
data_owner=owner,

lib/datahub-client/data_platform_catalogue/client/graphql/getDatasetDetails.graphql

+41-1
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,50 @@ query getDatasetDetails($urn: String!) {
1414
}
1515
}
1616
}
17+
downstream_lineage_relations: lineage (
18+
input: {direction: DOWNSTREAM
19+
start:0,
20+
count:10}
21+
) {
22+
total
23+
relationships{
24+
type
25+
entity{
26+
urn
27+
... on Dataset {
28+
name
29+
properties{
30+
name
31+
}
32+
}
33+
type
34+
}
35+
}
36+
}
37+
upstream_lineage_relations: lineage (
38+
input: {direction: UPSTREAM
39+
start:0,
40+
count:10}
41+
) {
42+
total
43+
relationships{
44+
type
45+
entity{
46+
urn
47+
... on Dataset {
48+
name
49+
properties{
50+
name
51+
}
52+
}
53+
type
54+
}
55+
}
56+
}
1757
subTypes {
1858
typeNames
1959
}
20-
container_relations: relationships(
60+
parent_container_relations: relationships(
2161
input: { types: ["IsPartOf"], direction: OUTGOING, count: 10 }
2262
) {
2363
total

lib/datahub-client/data_platform_catalogue/client/graphql_helpers.py

+15-10
Original file line numberDiff line numberDiff line change
@@ -256,21 +256,26 @@ def parse_columns(entity: dict[str, Any]) -> list[Column]:
256256

257257

258258
def parse_relations(
259-
relationship_type: RelationshipType, relations_dict: dict
259+
relationship_type: RelationshipType, relations_list: list[dict]
260260
) -> dict[RelationshipType, list[EntityRef]]:
261261
"""
262262
parse the relationships results returned from a graphql querys
263263
"""
264-
# we may want to do soemthing with total realtion if we are returning child
264+
265+
# we may want to do something with total relations if we are returning child
265266
# relations and need to paginate through relations - 10 relations returned as is
266-
# total_relations = relations_dict.get("total", 0)
267-
parent_entities = relations_dict.get("relationships", [])
268-
related_entities = [
269-
EntityRef(
270-
urn=i["entity"]["urn"], display_name=i["entity"]["properties"]["name"]
271-
)
272-
for i in parent_entities
273-
]
267+
# There may be more than 10 lineage entities but since we currently only care
268+
# if lineage exists for a dataset we don't need to capture everything
269+
related_entities = []
270+
for j in relations_list:
271+
for i in j["relationships"]:
272+
urn = i.get("entity").get("urn")
273+
display_name = (
274+
i.get("entity").get("properties").get("name")
275+
if i.get("entity", {}).get("properties") is not None
276+
else i.get("entity").get("name")
277+
)
278+
related_entities.append(EntityRef(urn=urn, display_name=display_name))
274279

275280
relations_return = {relationship_type: related_entities}
276281
return relations_return

lib/datahub-client/data_platform_catalogue/client/search.py

-5
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
parse_names,
1313
parse_owner,
1414
parse_properties,
15-
parse_relations,
1615
parse_tags,
1716
)
1817
from data_platform_catalogue.entities import RelationshipType
@@ -259,16 +258,12 @@ def _parse_result(
259258
last_modified = parse_last_modified(entity)
260259
name, display_name, qualified_name = parse_names(entity, properties)
261260

262-
relations = parse_relations(
263-
RelationshipType.PARENT, entity.get("relationships", {})
264-
)
265261
domain = parse_domain(entity)
266262

267263
metadata = {
268264
"owner": owner.display_name,
269265
"owner_email": owner.email,
270266
"total_parents": entity.get("relationships", {}).get("total", 0),
271-
"parents": relations[RelationshipType.PARENT],
272267
"domain_name": domain.display_name,
273268
"domain_id": domain.urn,
274269
"entity_types": self._parse_types_and_sub_types(entity, "Dataset"),

lib/datahub-client/data_platform_catalogue/entities.py

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
class RelationshipType(Enum):
1111
PARENT = "PARENT"
1212
PLATFORM = "PLATFORM"
13+
DATA_LINEAGE = "DATA_LINEAGE"
1314

1415

1516
class EntityRef(BaseModel):

lib/datahub-client/tests/client/datahub/test_datahub_client.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -243,17 +243,22 @@ def test_get_dataset(
243243
urn = "abc"
244244
datahub_response = {
245245
"dataset": {
246+
"type": "DATASET",
246247
"platform": {"name": "datahub"},
247248
"ownership": None,
248249
"subTypes": None,
249-
"container_relations": {
250+
"downstream_lineage_relations": {"total": 0, "relationships": []},
251+
"upstream_lineage_relations": {"total": 0, "relationships": []},
252+
"parent_container_relations": {
250253
"total": 1,
251254
"relationships": [
252255
{
256+
"type": "IsPartOf",
257+
"direction": "OUTGOING",
253258
"entity": {
254259
"urn": "urn:li:container:database",
255260
"properties": {"name": "database"},
256-
}
261+
},
257262
}
258263
],
259264
},
@@ -325,7 +330,8 @@ def test_get_dataset(
325330
relationships={
326331
RelationshipType.PARENT: [
327332
EntityRef(urn="urn:li:container:database", display_name="database")
328-
]
333+
],
334+
RelationshipType.DATA_LINEAGE: [],
329335
},
330336
domain=DomainRef(display_name="", urn=""),
331337
governance=Governance(
@@ -369,9 +375,10 @@ def test_get_dataset_minimal_properties(
369375
"platform": {"name": "datahub"},
370376
"name": "notinproperties",
371377
"properties": {},
372-
"container_relations": {
373-
"total": 0,
374-
},
378+
"downstream_lineage_relations": {"total": 0, "relationships": []},
379+
"upstream_lineage_relations": {"total": 0, "relationships": []},
380+
"parent_container_relations": {"total": 0, "relationships": []},
381+
"data_product_relations": {"total": 0, "relationships": []},
375382
"schemaMetadata": {"fields": []},
376383
}
377384
}
@@ -389,7 +396,10 @@ def test_get_dataset_minimal_properties(
389396
name="notinproperties",
390397
fully_qualified_name="notinproperties",
391398
description="",
392-
relationships={},
399+
relationships={
400+
RelationshipType.PARENT: [],
401+
RelationshipType.DATA_LINEAGE: [],
402+
},
393403
domain=DomainRef(display_name="", urn=""),
394404
governance=Governance(
395405
data_owner=OwnerRef(display_name="", email="", urn=""),

lib/datahub-client/tests/client/datahub/test_graphql_helpers.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def test_parse_relations():
191191
],
192192
}
193193
}
194-
result = parse_relations(RelationshipType.PARENT, relations["relationships"])
194+
result = parse_relations(RelationshipType.PARENT, [relations["relationships"]])
195195
assert result == {
196196
RelationshipType.PARENT: [
197197
EntityRef(urn="urn:li:dataProduct:test", display_name="test")
@@ -201,7 +201,7 @@ def test_parse_relations():
201201

202202
def test_parse_relations_blank():
203203
relations = {"relationships": {"total": 0, "relationships": []}}
204-
result = parse_relations(RelationshipType.PARENT, relations["relationships"])
204+
result = parse_relations(RelationshipType.PARENT, [relations["relationships"]])
205205
assert result == {RelationshipType.PARENT: []}
206206

207207

lib/datahub-client/tests/client/datahub/test_search.py

-9
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,6 @@ def test_one_search_result(mock_graph, searcher):
123123
"owner": "",
124124
"owner_email": "",
125125
"total_parents": 0,
126-
"parents": [],
127126
"domain_name": "HMPPS",
128127
"domain_id": "urn:li:domain:3dc18e48-c062-4407-84a9-73e23f768023",
129128
"entity_types": {
@@ -210,7 +209,6 @@ def test_dataset_result(mock_graph, searcher):
210209
"owner": "",
211210
"owner_email": "",
212211
"total_parents": 0,
213-
"parents": [],
214212
"domain_name": "HMPPS",
215213
"domain_id": "urn:li:domain:3dc18e48-c062-4407-84a9-73e23f768023",
216214
"entity_types": {
@@ -302,7 +300,6 @@ def test_full_page(mock_graph, searcher):
302300
"owner": "",
303301
"owner_email": "",
304302
"total_parents": 0,
305-
"parents": [],
306303
"domain_name": "",
307304
"domain_id": "",
308305
"entity_types": {
@@ -334,7 +331,6 @@ def test_full_page(mock_graph, searcher):
334331
"owner": "",
335332
"owner_email": "",
336333
"total_parents": 0,
337-
"parents": [],
338334
"domain_name": "",
339335
"domain_id": "",
340336
"entity_types": {
@@ -364,7 +360,6 @@ def test_full_page(mock_graph, searcher):
364360
"owner": "",
365361
"owner_email": "",
366362
"total_parents": 0,
367-
"parents": [],
368363
"domain_name": "",
369364
"domain_id": "",
370365
"entity_types": {
@@ -444,7 +439,6 @@ def test_query_match(mock_graph, searcher):
444439
"owner": "",
445440
"owner_email": "",
446441
"total_parents": 0,
447-
"parents": [],
448442
"domain_name": "",
449443
"domain_id": "",
450444
"entity_types": {
@@ -520,7 +514,6 @@ def test_result_with_owner(mock_graph, searcher):
520514
"owner": "Shannon Lovett",
521515
"owner_email": "shannon@longtail.com",
522516
"total_parents": 0,
523-
"parents": [],
524517
"domain_name": "",
525518
"domain_id": "",
526519
"entity_types": {
@@ -891,7 +884,6 @@ def test_search_for_charts(mock_graph, searcher):
891884
"owner": "",
892885
"owner_email": "",
893886
"total_parents": 0,
894-
"parents": [],
895887
"domain_name": "",
896888
"domain_id": "",
897889
"entity_types": {
@@ -1091,7 +1083,6 @@ def test_list_database_tables(mock_graph, searcher):
10911083
"owner": "",
10921084
"owner_email": "",
10931085
"total_parents": 0,
1094-
"parents": [],
10951086
"domain_name": "",
10961087
"domain_id": "",
10971088
"entity_types": {

templates/details_table.html

+11-3
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,17 @@ <h3 class="govuk-heading-s govuk-!-margin-top-3">
7878
<h2 class="govuk-heading-m">Table schema</h2>
7979
<p class="govuk-body">The schema for this table is not available.</p>
8080
{% endif %}
81-
82-
83-
</div>
81+
{% if has_lineage %}
82+
<h2 class="govuk-heading-m">Lineage</h2>
83+
<div class="govuk-body-m" >
84+
If you are interested to find out what data were used to create this table or if this table is used to create any further tables, you can see that information via the lineage.
85+
</div class="govuk-body-m">
86+
<div class="govuk-body">
87+
<a href="{{lineage_url}}" class="govuk-link">
88+
View lineage in DataHub
89+
</a>
90+
</div>
91+
{% endif %}
8492
</div>
8593

8694
{% endblock content %}

tests/benchmark/test_exact_matches.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
"query,expected_urn",
1616
[
1717
(
18-
"prison_population_history.chunk_assignment",
19-
"urn:li:dataset:(urn:li:dataPlatform:dbt,cadet.awsdatacatalog.prison_population_history.chunk_assignment,PROD)",
18+
"bold_common_platform_linked_tables.all_offence",
19+
"urn:li:dataset:(urn:li:dataPlatform:dbt,cadet.awsdatacatalog.bold_common_platform_linked_tables.all_offence,PROD)",
2020
),
2121
(
2222
"Accommodation on the first night following release",

tests/conftest.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ def generate_table_metadata(
9898
name=name or fake.unique.name(),
9999
fully_qualified_name="Foo.Dataset",
100100
description=description or fake.paragraph(),
101-
relationships=relations or {RelationshipType.PARENT: []},
101+
relationships=relations
102+
or {RelationshipType.PARENT: [], RelationshipType.DATA_LINEAGE: []},
102103
domain=DomainRef(display_name="LAA", urn="LAA"),
103104
governance=Governance(
104105
data_owner=OwnerRef(

0 commit comments

Comments
 (0)