From c71fa350b1ddfad0e159a33543d96c0169a1320e Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Fri, 24 Jan 2025 08:33:07 +0000 Subject: [PATCH] update cmems date filter regex --- geospaas_harvesting/providers/cmems.py | 22 +++++++++++++++++----- tests/providers/test_cmems.py | 19 ++++++++++++------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/geospaas_harvesting/providers/cmems.py b/geospaas_harvesting/providers/cmems.py index 3047ab9..3ba3116 100644 --- a/geospaas_harvesting/providers/cmems.py +++ b/geospaas_harvesting/providers/cmems.py @@ -1,5 +1,6 @@ """Code for searching CMEMS data (https://marine.copernicus.eu/)""" import calendar +import logging import re import tempfile from datetime import datetime @@ -121,8 +122,9 @@ def make_filter(self): months_regex.append(f"{month:02d}({days_regex})") years_regex.append(f"({year}({'|'.join(months_regex)}))") + full_regex = '|'.join(years_regex) - return f".*_({'|'.join(years_regex)})_.*" + return f"^(.*_({full_regex})_.*)|({full_regex}.*)$" @staticmethod def _find_dict_in_list(dicts_list, key, value): @@ -202,6 +204,8 @@ def get_normalized_attributes(self, dataset_info, **kwargs): class CMEMSMetadataNormalizer(): """Normalizer for CMEMS datasets""" + logger = logging.getLogger(__name__ + '.CMEMSMetadataNormalizer') + def __init__(self, product_info): self._product_info = product_info @@ -307,7 +311,7 @@ def get_time_coverage(self, entry_id): ), # generic 1 day coverage ( - re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}([-_.:T]|$)'), + re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}(\d{{6}})?([-_.:T]|$)'), lambda time: (time, time + relativedelta(days=1)) ), # generic 1 month coverage @@ -373,13 +377,21 @@ def get_dataset_parameters(self, dataset_info): variables = [] variable_dict = None for variable in dataset_info.metadata['variables']: + if variable['standard_name']: + search_name = variable['standard_name'] + elif variable['short_name']: + search_name = variable['short_name'] + else: + self.logger.error('No available name for the following variable, skipping: %s', + variable) + continue + try: - variable_dict = providers_utils.get_cf_or_wkv_standard_name( - variable['standard_name']) + variable_dict = providers_utils.get_cf_or_wkv_standard_name(search_name) except IndexError: try: variable_dict = pythesint.vocabularies['cf_standard_name'].fuzzy_search( - variable['standard_name'])[0] + search_name)[0] except IndexError: continue if variable_dict not in variables: diff --git a/tests/providers/test_cmems.py b/tests/providers/test_cmems.py index 190b86b..18ae678 100644 --- a/tests/providers/test_cmems.py +++ b/tests/providers/test_cmems.py @@ -58,38 +58,43 @@ def test_make_filter(self): """Test making a regular expression matching a time range """ mock_crawler = mock.Mock() + regex_template = "^(.*_({regex})_.*)|({regex}.*)$" mock_crawler.time_range = (datetime(2024, 9, 1), datetime(2024, 9, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2024(09(01|02))))_.*') + regex_template.format(regex='(2024(09(01|02)))')) mock_crawler.time_range = (datetime(2024, 9, 1), datetime(2024, 10, 15)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24' - '|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15))))_.*') + regex_template.format(regex=( + '(2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23' + '|24|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15)))' + ))) mock_crawler.time_range = (datetime(2024, 11, 1), datetime(2025, 1, 1)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|' - '18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01))))_.*') + regex_template.format(regex=( + '(202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|' + '16|17|18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01)))'))) mock_crawler.time_range = (datetime(2023, 12, 30), datetime(2024, 1, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2023(12(30|31)))|(2024(01(01|02))))_.*') + regex_template.format(regex=('(2023(12(30|31)))|(2024(01(01|02)))'))) mock_crawler.time_range = (datetime(2023, 12, 30), datetime(2025, 1, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02))))_.*') + regex_template.format(regex=( + '(2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02)))'))) mock_crawler.time_range = (None, None) self.assertIsNone(CMEMSCrawler.make_filter(mock_crawler))