From 228ae2b593dd33d46bcf4dbe50955a01f0f21d2d Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Mon, 30 Jan 2023 16:54:14 -0600 Subject: [PATCH 01/17] wip: vlmd aggmds support --- .gitignore | 1 + src/mds/agg_mds/adapters.py | 4 ++++ src/mds/agg_mds/commons.py | 2 ++ .../agg_mds/datastore/elasticsearch_dao.py | 24 +++++++++++++------ src/mds/config.py | 3 +++ src/mds/populate.py | 8 ++++++- tests/test_agg_mds_commons.py | 17 +++++++++++-- tests/test_agg_mds_elasticsearch_dao.py | 18 ++++++++++---- tests/test_populate.py | 2 +- 9 files changed, 64 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 2645ee69..e587c450 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ postgres-data/ # VSCode .vscode/ +.dccache diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py index 8a1c18ae..351f9de0 100644 --- a/src/mds/agg_mds/adapters.py +++ b/src/mds/agg_mds/adapters.py @@ -1001,6 +1001,7 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: mappings = kwargs.get("mappings", None) config = kwargs.get("config", {}) study_field = config.get("study_field", "gen3_discovery") + data_dict_field = config.get("data_dict_field", None) keepOriginalFields = kwargs.get("keepOriginalFields", True) globalFieldFilters = kwargs.get("globalFieldFilters", []) schema = kwargs.get("schema", {}) @@ -1021,6 +1022,9 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]: "_guid_type": "discovery_metadata", "gen3_discovery": item, } + # for VLMD, bring it into AggMDS records + if data_dict_field is not None and data_dict_field in record: + results[guid][data_dict_field] = record[data_dict_field] perItemValues = kwargs.get("perItemValues", None) if perItemValues is not None: diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py index 69677a1e..6dddbb00 100644 --- a/src/mds/agg_mds/commons.py +++ b/src/mds/agg_mds/commons.py @@ -197,6 +197,7 @@ class MDSInstance: ] = None study_data_field: str = "gen3_discovery" guid_type: str = "discovery_metadata" + data_dict_field: Optional[str] = None select_field: Optional[Dict[str, str]] = None def __post_init__(self): @@ -219,6 +220,7 @@ class AdapterMDSInstance: field_mappings: Optional[Dict[str, Any]] = None per_item_values: Optional[Dict[str, Any]] = None study_data_field: str = "gen3_discovery" + data_dict_field: Optional[str] = None keep_original_fields: bool = True global_field_filters: List[str] = field(default_factory=list) commons_name: Optional[str] = None diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index cc2a8319..7d145810 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -2,7 +2,12 @@ from typing import Any, List, Dict, Union, Optional, Tuple from math import ceil from mds import logger -from mds.config import AGG_MDS_NAMESPACE, ES_RETRY_LIMIT, ES_RETRY_INTERVAL +from mds.config import ( + AGG_MDS_NAMESPACE, + ES_RETRY_LIMIT, + ES_RETRY_INTERVAL, + AGG_MDS_DEFAULT_STUDY_DATA_FIELD, +) AGG_MDS_INDEX = f"{AGG_MDS_NAMESPACE}-commons-index" AGG_MDS_TYPE = "commons" @@ -189,7 +194,7 @@ async def update_metadata( guid_arr: List[str], tags: Dict[str, List[str]], info: Dict[str, str], - study_data_field: str, + data_dict_field: str = None, use_temp_index: bool = False, ): index_to_update = AGG_MDS_INFO_INDEX_TEMP if use_temp_index else AGG_MDS_INFO_INDEX @@ -201,10 +206,15 @@ async def update_metadata( ) index_to_update = AGG_MDS_INDEX_TEMP if use_temp_index else AGG_MDS_INDEX - for doc in data: - key = list(doc.keys())[0] + for d in data: + key = list(d.keys())[0] # Flatten out this structure - doc = doc[key][study_data_field] + doc = { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: d[key][AGG_MDS_DEFAULT_STUDY_DATA_FIELD] + } + if data_dict_field in d[key]: + doc[data_dict_field] = d[key][data_dict_field] + print(doc) try: elastic_search_client.index( @@ -295,11 +305,11 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= counts: converts the count of the entry[count] if it is a dict or array returns: - flattend == true + flattened == true results : MDS results as a dict paging info - flattend == false + flattened == false results : { commonsA: metadata commonsB: metadata diff --git a/src/mds/config.py b/src/mds/config.py index 83eebdbe..367de8aa 100644 --- a/src/mds/config.py +++ b/src/mds/config.py @@ -19,6 +19,9 @@ def __init__(self, value): URL_PREFIX = config("URL_PREFIX", default="/" if DEBUG else "/mds") USE_AGG_MDS = config("USE_AGG_MDS", cast=bool, default=False) AGG_MDS_NAMESPACE = config("AGG_MDS_NAMESPACE", default="default_namespace") +AGG_MDS_DEFAULT_STUDY_DATA_FIELD = config( + "AGG_MDS_DEFAULT_STUDY_DATA_FIELD", cast=str, default="gen3_discovery" +) ES_ENDPOINT = config("GEN3_ES_ENDPOINT", default="http://localhost:9200") # Database diff --git a/src/mds/populate.py b/src/mds/populate.py index ea8a367c..dc4fb8b1 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -44,6 +44,12 @@ async def populate_metadata(name: str, common, results, use_temp_index=False): entry = next(iter(x.values())) def normalize(entry: dict) -> Any: + # normalize study level metadata field names + if common.study_data_field != config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD: + entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD] = entry.pop( + common.study_data_field + ) + if ( not hasattr(common, "columns_to_fields") or common.columns_to_fields is None @@ -91,7 +97,7 @@ def normalize(entry: dict) -> Any: info = {"commons_url": common.commons_url} await datastore.update_metadata( - name, mds_arr, keys, tags, info, common.study_data_field, use_temp_index + name, mds_arr, keys, tags, info, common.data_dict_field, use_temp_index ) diff --git a/tests/test_agg_mds_commons.py b/tests/test_agg_mds_commons.py index 807d2d7d..6aceb7fc 100644 --- a/tests/test_agg_mds_commons.py +++ b/tests/test_agg_mds_commons.py @@ -227,7 +227,6 @@ def test_parse_config(): } } }, - "gen3_commons": { "my_gen3_commons": { "mds_url": "http://mds", @@ -247,6 +246,13 @@ def test_parse_config(): "mds_url": "http://non-gen3", "commons_url": "non-gen3", "adapter": "icpsr" + }, + "another_gen3_commons": { + "mds_url": "http://another-gen3", + "commons_url": "another-gen3", + "adapter": "gen3", + "study_data_field" : "my_metadata", + "data_dict_field" : "my_data_dict" } } } @@ -295,7 +301,14 @@ def test_parse_config(): "http://non-gen3", "non-gen3", "icpsr", - ) + ), + "another_gen3_commons": AdapterMDSInstance( + "http://another-gen3", + "another-gen3", + "gen3", + study_data_field="my_metadata", + data_dict_field="my_data_dict", + ), }, ) diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index e5637f21..e26638bb 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ -221,7 +221,6 @@ async def test_update_metadata(): [], {}, {}, - "gen3_discovery", ) mock_index.assert_has_calls( [ @@ -232,7 +231,13 @@ async def test_update_metadata(): index=AGG_MDS_INFO_INDEX, ), call( - body={"some_field": "some_value", "__manifest": {}, "sites": ""}, + body={ + "gen3_discovery": { + "some_field": "some_value", + "__manifest": {}, + "sites": "", + } + }, doc_type="commons", id="my_id", index=AGG_MDS_INDEX, @@ -264,7 +269,6 @@ async def test_update_metadata_to_temp_index(): [], {}, {}, - "gen3_discovery", use_temp_index=True, ) mock_index.assert_has_calls( @@ -276,7 +280,13 @@ async def test_update_metadata_to_temp_index(): index=AGG_MDS_INFO_INDEX_TEMP, ), call( - body={"some_field": "some_value", "__manifest": {}, "sites": ""}, + body={ + "gen3_discovery": { + "some_field": "some_value", + "__manifest": {}, + "sites": "", + } + }, doc_type="commons", id="my_id", index=AGG_MDS_INDEX_TEMP, diff --git a/tests/test_populate.py b/tests/test_populate.py index cf666570..7c50b0ea 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -81,7 +81,7 @@ async def test_populate_metadata(): ["id1"], {"my_category": ["my_name"]}, {"commons_url": "http://commons"}, - "gen3_discovery", + None, False, ) From 61d141b31449a096d051ad82f5e85c2f1f92f8a4 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Mon, 30 Jan 2023 21:05:53 -0600 Subject: [PATCH 02/17] wip: add config field --- .../agg_mds/datastore/elasticsearch_dao.py | 9 +++--- src/mds/config.py | 3 ++ src/mds/populate.py | 28 +++++++++++-------- tests/test_populate.py | 1 - 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 7d145810..d912665a 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -7,6 +7,7 @@ ES_RETRY_LIMIT, ES_RETRY_INTERVAL, AGG_MDS_DEFAULT_STUDY_DATA_FIELD, + AGG_MDS_DEFAULT_DATA_DICT_FIELD, ) AGG_MDS_INDEX = f"{AGG_MDS_NAMESPACE}-commons-index" @@ -194,7 +195,6 @@ async def update_metadata( guid_arr: List[str], tags: Dict[str, List[str]], info: Dict[str, str], - data_dict_field: str = None, use_temp_index: bool = False, ): index_to_update = AGG_MDS_INFO_INDEX_TEMP if use_temp_index else AGG_MDS_INFO_INDEX @@ -212,9 +212,10 @@ async def update_metadata( doc = { AGG_MDS_DEFAULT_STUDY_DATA_FIELD: d[key][AGG_MDS_DEFAULT_STUDY_DATA_FIELD] } - if data_dict_field in d[key]: - doc[data_dict_field] = d[key][data_dict_field] - print(doc) + if AGG_MDS_DEFAULT_DATA_DICT_FIELD in d[key]: + doc[AGG_MDS_DEFAULT_DATA_DICT_FIELD] = d[key][ + AGG_MDS_DEFAULT_DATA_DICT_FIELD + ] try: elastic_search_client.index( diff --git a/src/mds/config.py b/src/mds/config.py index 367de8aa..977a034e 100644 --- a/src/mds/config.py +++ b/src/mds/config.py @@ -22,6 +22,9 @@ def __init__(self, value): AGG_MDS_DEFAULT_STUDY_DATA_FIELD = config( "AGG_MDS_DEFAULT_STUDY_DATA_FIELD", cast=str, default="gen3_discovery" ) +AGG_MDS_DEFAULT_DATA_DICT_FIELD = config( + "AGG_MDS_DEFAULT_DATA_DICT_FIELD", cast=str, default="data_dictionaries" +) ES_ENDPOINT = config("GEN3_ES_ENDPOINT", default="http://localhost:9200") # Database diff --git a/src/mds/populate.py b/src/mds/populate.py index dc4fb8b1..f2d279b7 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -49,6 +49,14 @@ def normalize(entry: dict) -> Any: entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD] = entry.pop( common.study_data_field ) + # normalize variable level metadata field names, if available + if ( + common.data_dict_field is not None + and common.data_dict_field != config.AGG_MDS_DEFAULT_DATA_DICT_FIELD + ): + entry[config.AGG_MDS_DEFAULT_DATA_DICT_FIELD] = entry.pop( + common.data_dict_field + ) if ( not hasattr(common, "columns_to_fields") @@ -60,13 +68,13 @@ def normalize(entry: dict) -> Any: if field == column: continue if isinstance(field, ColumnsToFields): - entry[common.study_data_field][column] = field.get_value( - entry[common.study_data_field] - ) + entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD][ + column + ] = field.get_value(entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD]) else: - if field in entry[common.study_data_field]: - entry[common.study_data_field][column] = entry[ - common.study_data_field + if field in entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD]: + entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD][column] = entry[ + config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD ][field] return entry @@ -74,14 +82,14 @@ def normalize(entry: dict) -> Any: # add the common field, selecting the name or an override (i.e. commons_name) and url to the entry - entry[common.study_data_field]["commons_name"] = ( + entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD]["commons_name"] = ( common.commons_name if hasattr(common, "commons_name") and common.commons_name is not None else name ) # add to tags - for t in entry[common.study_data_field].get("tags") or {}: + for t in entry[config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD].get("tags") or {}: if "category" not in t: continue if t["category"] not in tags: @@ -96,9 +104,7 @@ def normalize(entry: dict) -> Any: keys = list(results.keys()) info = {"commons_url": common.commons_url} - await datastore.update_metadata( - name, mds_arr, keys, tags, info, common.data_dict_field, use_temp_index - ) + await datastore.update_metadata(name, mds_arr, keys, tags, info, use_temp_index) async def populate_info(commons_config: Commons, use_temp_index=False) -> None: diff --git a/tests/test_populate.py b/tests/test_populate.py index 7c50b0ea..11e8c3f9 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -81,7 +81,6 @@ async def test_populate_metadata(): ["id1"], {"my_category": ["my_name"]}, {"commons_url": "http://commons"}, - None, False, ) From dd7490d1c9fbca47e398213db8e4564ef9d6d096 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Mon, 30 Jan 2023 22:44:39 -0600 Subject: [PATCH 03/17] wip: query --- .../agg_mds/datastore/elasticsearch_dao.py | 21 +-- tests/test_agg_mds_query.py | 134 +++++++++--------- 2 files changed, 82 insertions(+), 73 deletions(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index d912665a..c55c69ff 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -293,9 +293,12 @@ def process_record(record: dict, counts: Optional[List[str]]) -> Tuple[str, dict """ _id = record["_id"] normalized = record["_source"] - for c in counts: - if c in normalized: - normalized[c] = count(normalized[c]) + if AGG_MDS_DEFAULT_STUDY_DATA_FIELD in normalized: + for c in counts: + if c in normalized[AGG_MDS_DEFAULT_STUDY_DATA_FIELD]: + normalized[AGG_MDS_DEFAULT_STUDY_DATA_FIELD][c] = count( + normalized[AGG_MDS_DEFAULT_STUDY_DATA_FIELD][c] + ) return _id, normalized @@ -356,7 +359,7 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= flat = [] for record in res["hits"]["hits"]: rid, normalized = process_record(record, toReduce) - flat.append({rid: {"gen3_discovery": normalized}}) + flat.append({rid: normalized}) return { "results": flat, "pagination": { @@ -378,12 +381,12 @@ async def get_all_metadata(limit, offset, counts: Optional[str] = None, flatten= } for record in res["hits"]["hits"]: rid, normalized = process_record(record, toReduce) - commons_name = normalized["commons_name"] + commons_name = normalized[AGG_MDS_DEFAULT_STUDY_DATA_FIELD][ + "commons_name" + ] if commons_name not in byCommons["results"]: byCommons["results"][commons_name] = [] - byCommons["results"][commons_name].append( - {rid: {"gen3_discovery": normalized}} - ) + byCommons["results"][commons_name].append({rid: normalized}) return byCommons except Exception as error: @@ -455,7 +458,7 @@ async def get_commons_attribute(name): } }, ) - return data["hits"]["hits"][0]["_source"] + return data["hits"]["hits"][0][AGG_MDS_DEFAULT_STUDY_DATA_FIELD]["_source"] except Exception as error: logger.error(error) return None diff --git a/tests/test_agg_mds_query.py b/tests/test_agg_mds_query.py index 1e34d75d..240cf173 100644 --- a/tests/test_agg_mds_query.py +++ b/tests/test_agg_mds_query.py @@ -107,28 +107,30 @@ async def test_aggregate_metadata_paged_flat(client): "_id": "815616c0-dfsdfjjj", "_score": 1.0, "_source": { - "link": "", - "tags": [ - {"name": "restricted", "category": "Access"}, - {"name": "genomic", "category": "category"}, - ], - "commons": "LI", - "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", - "dataset_code": "LI", - "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", - "dataset_title": "Lorem ipsum dolor sit amet", - "samples_count": "", - "subjects_count": "", - "data_files_count": 11062, - "_subjects_count": "", - "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ultricies tristique nulla aliquet enim tortor at auctor.", - "short_name": "Lorem ipsum dolor sit amet", - "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", - "commons_name": "Lorem ipsum", - "__manifest": [ - {"filename": "foo2.txt"}, - {"filename": "foo3.txt"}, - ], + "gen3_discovery": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": [ + {"filename": "foo2.txt"}, + {"filename": "foo3.txt"}, + ], + }, }, } ], @@ -194,28 +196,30 @@ async def test_aggregate_metadata_counts(client): "_id": "815616c0-dfsdfjjj", "_score": 1.0, "_source": { - "link": "", - "tags": [ - {"name": "restricted", "category": "Access"}, - {"name": "genomic", "category": "category"}, - ], - "commons": "LI", - "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", - "dataset_code": "LI", - "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", - "dataset_title": "Lorem ipsum dolor sit amet", - "samples_count": "", - "subjects_count": "", - "data_files_count": 11062, - "_subjects_count": "", - "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ultricies tristique nulla aliquet enim tortor at auctor.", - "short_name": "Lorem ipsum dolor sit amet", - "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", - "commons_name": "Lorem ipsum", - "__manifest": [ - {"filename": "foo2.txt"}, - {"filename": "foo3.txt"}, - ], + "gen3_discovery": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": [ + {"filename": "foo2.txt"}, + {"filename": "foo3.txt"}, + ], + }, }, } ], @@ -261,7 +265,7 @@ async def test_aggregate_metadata_counts(client): assert resp.json() == results # test multiple counts field - mock_data["hits"]["hits"][0]["_source"]["__manifest"] = [ + mock_data["hits"]["hits"][0]["_source"]["gen3_discovery"]["__manifest"] = [ {"filename": "foo2.txt"}, {"filename": "foo3.txt"}, ] @@ -291,25 +295,27 @@ async def test_aggregate_metadata_counts_null(client): "_id": "815616c0-dfsdfjjj", "_score": 1.0, "_source": { - "link": "", - "tags": [ - {"name": "restricted", "category": "Access"}, - {"name": "genomic", "category": "category"}, - ], - "commons": "LI", - "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", - "dataset_code": "LI", - "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", - "dataset_title": "Lorem ipsum dolor sit amet", - "samples_count": "", - "subjects_count": "", - "data_files_count": 11062, - "_subjects_count": "", - "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ultricies tristique nulla aliquet enim tortor at auctor.", - "short_name": "Lorem ipsum dolor sit amet", - "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", - "commons_name": "Lorem ipsum", - "__manifest": None, + "gen3_discovery": { + "link": "", + "tags": [ + {"name": "restricted", "category": "Access"}, + {"name": "genomic", "category": "category"}, + ], + "commons": "LI", + "_unique_id": "815616c0-c4a4-4883-9107-a05694499a36", + "dataset_code": "LI", + "brief_summary": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", + "dataset_title": "Lorem ipsum dolor sit amet", + "samples_count": "", + "subjects_count": "", + "data_files_count": 11062, + "_subjects_count": "", + "study_description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ultricies tristique nulla aliquet enim tortor at auctor.", + "short_name": "Lorem ipsum dolor sit amet", + "full_name": "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "commons_name": "Lorem ipsum", + "__manifest": None, + }, }, } ], From 546757f061cda63776304f2c00f3917c4e0c7397 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Mon, 30 Jan 2023 22:52:07 -0600 Subject: [PATCH 04/17] wip: fix test --- tests/test_agg_mds_elasticsearch_dao.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index e26638bb..4ad90a0a 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ -410,11 +410,11 @@ def test_count_value_none(): def test_process_records(): _id = "123" - _source = {"count": [1, 2, 3, 4], "name": "my_name"} + _source = {"gen3_discovery": {"count": [1, 2, 3, 4], "name": "my_name"}} record = {"_id": _id, "_source": _source} rid, normalized = process_record(record, ["count"]) assert rid == _id - assert normalized == {"count": 4, "name": "my_name"} + assert normalized == {"gen3_discovery": {"count": 4, "name": "my_name"}} # test if passed dict field is not array rid, normalized = process_record(record, ["name"]) From 3592b832459d976e2efcc57060b33bfb5e102af5 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Tue, 31 Jan 2023 14:26:59 -0600 Subject: [PATCH 05/17] wip: es update --- .../agg_mds/datastore/elasticsearch_dao.py | 22 +++++++++++++++---- src/mds/populate.py | 9 ++++++-- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index c55c69ff..5266ef45 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -260,7 +260,13 @@ async def get_commons(): index=AGG_MDS_INDEX, body={ "size": 0, - "aggs": {"commons_names": {"terms": {"field": "commons_name.keyword"}}}, + "aggs": { + "commons_names": { + "terms": { + "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword" + } + } + }, }, ) return { @@ -398,7 +404,13 @@ async def get_all_named_commons_metadata(name): try: res = elastic_search_client.search( index=AGG_MDS_INDEX, - body={"query": {"match": {"commons_name.keyword": name}}}, + body={ + "query": { + "match": { + f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword": name + } + } + }, ) return [x["_source"] for x in res["hits"]["hits"]] except Exception as error: @@ -458,7 +470,7 @@ async def get_commons_attribute(name): } }, ) - return data["hits"]["hits"][0][AGG_MDS_DEFAULT_STUDY_DATA_FIELD]["_source"] + return data["hits"]["hits"][0]["_source"] except Exception as error: logger.error(error) return None @@ -473,7 +485,9 @@ async def get_aggregations(name): "query": { "constant_score": { "filter": { - "match": {"commons_name": name}, + "match": { + f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name": name + }, } } }, diff --git a/src/mds/populate.py b/src/mds/populate.py index f2d279b7..269f58c0 100644 --- a/src/mds/populate.py +++ b/src/mds/populate.py @@ -178,8 +178,13 @@ async def main(commons_config: Commons) -> None: "mappings": { "commons": { "properties": { - k: v.to_schema(True) - for k, v in commons_config.configuration.schema.items() + config.AGG_MDS_DEFAULT_STUDY_DATA_FIELD: { + "type": "nested", + "properties": { + k: v.to_schema(True) + for k, v in commons_config.configuration.schema.items() + }, + } } } } From 1ef12b24a79f9e939a524b79cd7e720304d85b21 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Tue, 31 Jan 2023 15:42:18 -0600 Subject: [PATCH 06/17] wip: es update --- src/mds/agg_mds/datastore/elasticsearch_dao.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index 5266ef45..a5f1e869 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -406,8 +406,13 @@ async def get_all_named_commons_metadata(name): index=AGG_MDS_INDEX, body={ "query": { - "match": { - f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword": name + "nested": { + "path": AGG_MDS_DEFAULT_STUDY_DATA_FIELD, + "query": { + "match": { + f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword": "HEAL" + } + }, } } }, @@ -426,14 +431,16 @@ async def metadata_tags(): "size": 0, "aggs": { "tags": { - "nested": {"path": "tags"}, + "nested": {"path": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.tags"}, "aggs": { "categories": { - "terms": {"field": "tags.category.keyword"}, + "terms": { + "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.tags.category.keyword" + }, "aggs": { "name": { "terms": { - "field": "tags.name.keyword", + "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.tags.name.keyword" } } }, From 9eafb0c45f6e7a95f33add9c28d596548e233961 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Tue, 31 Jan 2023 16:06:28 -0600 Subject: [PATCH 07/17] wip: es update --- src/mds/agg_mds/datastore/elasticsearch_dao.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index a5f1e869..f72055ce 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -261,10 +261,15 @@ async def get_commons(): body={ "size": 0, "aggs": { - "commons_names": { - "terms": { - "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword" - } + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: { + "nested": {"path": AGG_MDS_DEFAULT_STUDY_DATA_FIELD}, + "aggs": { + "commons_names": { + "terms": { + "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword" + } + } + }, } }, }, From d0c333450e1fac1bc3e3468c99d5689fe81e28ce Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Tue, 31 Jan 2023 16:07:36 -0600 Subject: [PATCH 08/17] wip: es update --- src/mds/agg_mds/datastore/elasticsearch_dao.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mds/agg_mds/datastore/elasticsearch_dao.py b/src/mds/agg_mds/datastore/elasticsearch_dao.py index f72055ce..6b9fb3b7 100644 --- a/src/mds/agg_mds/datastore/elasticsearch_dao.py +++ b/src/mds/agg_mds/datastore/elasticsearch_dao.py @@ -276,7 +276,10 @@ async def get_commons(): ) return { "commons": [ - x["key"] for x in res["aggregations"]["commons_names"]["buckets"] + x["key"] + for x in res["aggregations"][AGG_MDS_DEFAULT_STUDY_DATA_FIELD][ + "commons_names" + ]["buckets"] ] } except Exception as error: From 626c967b00d518179c25220bf9c6734a5d1120de Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Tue, 31 Jan 2023 16:35:32 -0600 Subject: [PATCH 09/17] wip: fix test --- tests/test_agg_mds_elasticsearch_dao.py | 61 ++++++++++++++++++++----- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/tests/test_agg_mds_elasticsearch_dao.py b/tests/test_agg_mds_elasticsearch_dao.py index 4ad90a0a..99f4823c 100644 --- a/tests/test_agg_mds_elasticsearch_dao.py +++ b/tests/test_agg_mds_elasticsearch_dao.py @@ -12,6 +12,7 @@ AGG_MDS_INFO_INDEX_TEMP, AGG_MDS_CONFIG_INDEX_TEMP, AGG_MDS_INFO_TYPE, + AGG_MDS_DEFAULT_STUDY_DATA_FIELD, count, process_record, ) @@ -210,7 +211,7 @@ async def test_update_metadata(): [ { "my_id": { - "gen3_discovery": { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: { "some_field": "some_value", "__manifest": {}, "sites": "", @@ -232,7 +233,7 @@ async def test_update_metadata(): ), call( body={ - "gen3_discovery": { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: { "some_field": "some_value", "__manifest": {}, "sites": "", @@ -258,7 +259,7 @@ async def test_update_metadata_to_temp_index(): [ { "my_id": { - "gen3_discovery": { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: { "some_field": "some_value", "__manifest": {}, "sites": "", @@ -281,7 +282,7 @@ async def test_update_metadata_to_temp_index(): ), call( body={ - "gen3_discovery": { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: { "some_field": "some_value", "__manifest": {}, "sites": "", @@ -377,7 +378,18 @@ async def test_get_commons(): index=AGG_MDS_INDEX, body={ "size": 0, - "aggs": {"commons_names": {"terms": {"field": "commons_name.keyword"}}}, + "aggs": { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: { + "nested": {"path": AGG_MDS_DEFAULT_STUDY_DATA_FIELD}, + "aggs": { + "commons_names": { + "terms": { + "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword" + } + } + }, + } + }, }, ) @@ -410,11 +422,15 @@ def test_count_value_none(): def test_process_records(): _id = "123" - _source = {"gen3_discovery": {"count": [1, 2, 3, 4], "name": "my_name"}} + _source = { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: {"count": [1, 2, 3, 4], "name": "my_name"} + } record = {"_id": _id, "_source": _source} rid, normalized = process_record(record, ["count"]) assert rid == _id - assert normalized == {"gen3_discovery": {"count": 4, "name": "my_name"}} + assert normalized == { + AGG_MDS_DEFAULT_STUDY_DATA_FIELD: {"count": 4, "name": "my_name"} + } # test if passed dict field is not array rid, normalized = process_record(record, ["name"]) @@ -453,7 +469,18 @@ async def test_get_all_named_commons_metadata(): await elasticsearch_dao.get_all_named_commons_metadata("my-commons") mock_client.search.assert_called_with( index=AGG_MDS_INDEX, - body={"query": {"match": {"commons_name.keyword": "my-commons"}}}, + body={ + "query": { + "nested": { + "path": AGG_MDS_DEFAULT_STUDY_DATA_FIELD, + "query": { + "match": { + f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name.keyword": "HEAL" + } + }, + } + } + }, ) with patch( @@ -477,12 +504,18 @@ async def test_metadata_tags(): "size": 0, "aggs": { "tags": { - "nested": {"path": "tags"}, + "nested": {"path": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.tags"}, "aggs": { "categories": { - "terms": {"field": "tags.category.keyword"}, + "terms": { + "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.tags.category.keyword" + }, "aggs": { - "name": {"terms": {"field": "tags.name.keyword"}} + "name": { + "terms": { + "field": f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.tags.name.keyword" + } + } }, } }, @@ -528,7 +561,11 @@ async def test_get_aggregations(): "size": 0, "query": { "constant_score": { - "filter": {"match": {"commons_name": "my-commons"}} + "filter": { + "match": { + f"{AGG_MDS_DEFAULT_STUDY_DATA_FIELD}.commons_name": "my-commons" + } + } } }, "aggs": {"_subjects_count": {"sum": {"field": "_subjects_count"}}}, From c7f1f3e1f40287f5d49b458ce03f98e5b895f824 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Tue, 31 Jan 2023 16:47:10 -0600 Subject: [PATCH 10/17] wip: fix test --- tests/test_populate.py | 2 +- tests/test_query.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_populate.py b/tests/test_populate.py index 11e8c3f9..b50af1eb 100644 --- a/tests/test_populate.py +++ b/tests/test_populate.py @@ -275,7 +275,7 @@ async def test_populate_config(): ) config = parse_config_from_file(Path(fp.name)) await populate_config(config) - mock_datastore.update_config_info.called_with(["_subjects_count"]) + await mock_datastore.update_config_info.called_with(["_subjects_count"]) @pytest.mark.asyncio diff --git a/tests/test_query.py b/tests/test_query.py index 20ce9d16..8269a84d 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -171,7 +171,7 @@ def test_query_filter_all_values(client): assert list(sorted(client.get("/metadata?a.b=*").json())) == ["tq_5"] # query all records with a == "*" - assert list(sorted(client.get("/metadata?a=\*").json())) == ["tq_4"] + assert list(sorted(client.get("/metadata?a=*").json())) == ["tq_4"] finally: for i in range(1, 8): client.delete(f"/metadata/tq_{i}") From 10cc1d1abd3aad003cf2bceb6dc3e532303646f0 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Tue, 31 Jan 2023 16:50:34 -0600 Subject: [PATCH 11/17] wip: fix test --- tests/test_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_query.py b/tests/test_query.py index 8269a84d..20ce9d16 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -171,7 +171,7 @@ def test_query_filter_all_values(client): assert list(sorted(client.get("/metadata?a.b=*").json())) == ["tq_5"] # query all records with a == "*" - assert list(sorted(client.get("/metadata?a=*").json())) == ["tq_4"] + assert list(sorted(client.get("/metadata?a=\*").json())) == ["tq_4"] finally: for i in range(1, 8): client.delete(f"/metadata/tq_{i}") From db7be3a40c1fe2bb0f92ee40e8058f1b11e4d512 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Wed, 1 Feb 2023 11:20:01 -0600 Subject: [PATCH 12/17] update version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 428d0c5d..7862cead 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "mds" -version = "2.0.1" +version = "3.0.0" description = "Metadata Service" authors = ["CTDS UChicago "] license = "Apache-2.0" From 4a651c79e6ee2a081c06bdd838bc72d7de8f4c19 Mon Sep 17 00:00:00 2001 From: mfshao Date: Wed, 1 Feb 2023 17:20:55 +0000 Subject: [PATCH 13/17] Apply automatic documentation changes --- docs/openapi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index e27f9a19..e6f32a34 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -92,7 +92,7 @@ components: type: http info: title: Framework Services Object Management Service - version: 2.0.1 + version: 3.0.0 openapi: 3.0.2 paths: /_status: From d851a5f0bbe7da3f35bf6adabb2367279d576a62 Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Wed, 1 Feb 2023 14:48:25 -0600 Subject: [PATCH 14/17] update --- src/mds/agg_mds/query.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index 365acc6c..ccf805b6 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -29,12 +29,7 @@ async def get_commons_info(what: str): Example: - { - schema: { - ... - ... - } - } + {"__manifest":{"type":"array","properties":{"file_name":{"type":"string","description":""},"file_size":{"type":"integer","description":""}},"description":"","default":[]},"commons_url":{"type":"string","description":""}} """ res = await datastore.get_commons_attribute(what) From bc32f94fc7032081d0fc606e9e54e76cee95279c Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Wed, 1 Feb 2023 14:56:33 -0600 Subject: [PATCH 15/17] update doc --- src/mds/agg_mds/query.py | 54 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/src/mds/agg_mds/query.py b/src/mds/agg_mds/query.py index ccf805b6..4bac360b 100644 --- a/src/mds/agg_mds/query.py +++ b/src/mds/agg_mds/query.py @@ -29,7 +29,30 @@ async def get_commons_info(what: str): Example: - {"__manifest":{"type":"array","properties":{"file_name":{"type":"string","description":""},"file_size":{"type":"integer","description":""}},"description":"","default":[]},"commons_url":{"type":"string","description":""}} + { + "__manifest":{ + "type":"array", + "properties":{ + "file_name":{ + "type":"string", + "description":"" + }, + "file_size":{ + "type":"integer", + "description":"" + } + }, + "description":"", + "default":[ + + ] + }, + "commons_url":{ + "type":"string", + "description":"" + }, + ... + } """ res = await datastore.get_commons_attribute(what) @@ -131,7 +154,26 @@ async def get_aggregate_metadata_for_commons( Example: - [ { id2: { name: "bear" } } , { id3: { name: "cat" } }] + [ + { + "gen3_discovery": { + "name": "bear", + "type": "study", + ... + }, + "data_dictionaries": { + ... + } + }, + { + "gen3_discovery": { + "name": "cat", + "type": "study", + ... + } + }, + ... + ] """ res = await datastore.get_all_named_commons_metadata(name) @@ -206,7 +248,13 @@ async def get_aggregate_metadata_guid(guid: str): Example: - { id2: { name: "bear" } } + { + "gen3_discovery": { + "name": "cat", + "type": "study", + ... + } + } """ res = await datastore.get_by_guid(guid) if res: From 316e9db05a8c18e7d8b567a961adf63e1a25cc6d Mon Sep 17 00:00:00 2001 From: mfshao Date: Wed, 1 Feb 2023 20:57:22 +0000 Subject: [PATCH 16/17] Apply automatic documentation changes --- docs/openapi.yaml | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index e6f32a34..81054513 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -126,8 +126,14 @@ paths: get: description: "Returns status and configuration information about aggregate metadata\ \ service.\n\nReturn configuration information. Currently supports only 1\ - \ information type:\n**schema**\n\nExample:\n\n {\n schema: {\n \ - \ ...\n ...\n }\n }" + \ information type:\n**schema**\n\nExample:\n\n{\n \"__manifest\":{\n \ + \ \"type\":\"array\",\n \"properties\":{\n \"file_name\"\ + :{\n \"type\":\"string\",\n \"description\"\ + :\"\"\n },\n \"file_size\":{\n \"type\"\ + :\"integer\",\n \"description\":\"\"\n }\n \ + \ },\n \"description\":\"\",\n \"default\":[\n\n ]\n\ + \ },\n \"commons_url\":{\n \"type\":\"string\",\n \"description\"\ + :\"\"\n },\n ...\n}" operationId: get_commons_info_aggregate_info__what__get parameters: - in: path @@ -238,8 +244,9 @@ paths: - Aggregate /aggregate/metadata/guid/{guid}: get: - description: "Returns a metadata record by GUID\n\nExample:\n\n { id2: {\ - \ name: \"bear\" } }" + description: "Returns a metadata record by GUID\n\nExample:\n\n {\n \ + \ \"gen3_discovery\": {\n \"name\": \"cat\",\n \"\ + type\": \"study\",\n ...\n }\n }" operationId: get_aggregate_metadata_guid_aggregate_metadata_guid__guid__get parameters: - in: path @@ -267,8 +274,13 @@ paths: get: description: "et all metadata records from a commons by name\n\nReturns an array\ \ containing all the metadata entries for a single commons.\nThere are no\ - \ limit/offset parameters.\n\nExample:\n\n [ { id2: { name: \"bear\" }\ - \ } , { id3: { name: \"cat\" } }]" + \ limit/offset parameters.\n\nExample:\n\n [\n {\n \"\ + gen3_discovery\": {\n \"name\": \"bear\",\n \ + \ \"type\": \"study\",\n ...\n },\n \"\ + data_dictionaries\": {\n ...\n }\n },\n \ + \ {\n \"gen3_discovery\": {\n \"name\": \"\ + cat\",\n \"type\": \"study\",\n ...\n \ + \ }\n },\n ...\n ]" operationId: get_aggregate_metadata_for_commons_aggregate_metadata__name__get parameters: - in: path From 90e605a025a1bb88f6c1be818b7d6a39042b4bfc Mon Sep 17 00:00:00 2001 From: Mingfei Shao Date: Wed, 1 Feb 2023 21:34:03 -0600 Subject: [PATCH 17/17] dummy