From 9f0bb25999bab1886ea6546c998de91b663545d2 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Wed, 17 Apr 2024 13:29:51 +0200
Subject: [PATCH 1/5] add AVISO thredds provider

---
 geospaas_harvesting/config.py          |  2 ++
 geospaas_harvesting/providers/aviso.py | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 geospaas_harvesting/providers/aviso.py

diff --git a/geospaas_harvesting/config.py b/geospaas_harvesting/config.py
index c66a1d1..aaa5e94 100644
--- a/geospaas_harvesting/config.py
+++ b/geospaas_harvesting/config.py
@@ -1,6 +1,7 @@
 """Configuration management"""
 import logging
 
+import geospaas_harvesting.providers.aviso as providers_aviso
 import geospaas_harvesting.providers.base as providers_base
 import geospaas_harvesting.providers.ceda as providers_ceda
 import geospaas_harvesting.providers.cmems as providers_cmems
@@ -57,6 +58,7 @@ class ProvidersArgument(DictArgument):
     }
     """
     provider_types = {
+        'aviso': providers_aviso.AVISOProvider,
        'ceda': providers_ceda.CEDAProvider,
        'cmems_ftp': providers_cmems.CMEMSFTPProvider,
        'copernicus_scihub': providers_copernicus_scihub.CopernicusScihubProvider,
diff --git a/geospaas_harvesting/providers/aviso.py b/geospaas_harvesting/providers/aviso.py
new file mode 100644
index 0000000..286dc23
--- /dev/null
+++ b/geospaas_harvesting/providers/aviso.py
@@ -0,0 +1,23 @@
+"""Code for searching MET NO data (https://thredds.met.no/thredds)"""
+from .base import Provider, TimeFilterMixin
+from ..arguments import StringArgument
+from ..crawlers import ThreddsCrawler
+
+
+class AVISOProvider(TimeFilterMixin, Provider):
+    """Provider for MET NO's Thredds"""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.url = 'https://tds.aviso.altimetry.fr/thredds'
+        self.search_parameters_parser.add_arguments([
+            StringArgument('directory', required=True),
+            StringArgument('include'),
+        ])
+
+    def _make_crawler(self, parameters):
+        return ThreddsCrawler(
+            '/'.join((self.url, parameters['directory'].lstrip('/'))),
+            time_range=(parameters['start_time'], parameters['end_time']),
+            include=parameters.get('include'),
+            max_threads=30
+        )

From e54bac77e98fcdd4c6175642b20e9b49ffef80dd Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Mon, 10 Jun 2024 15:58:24 +0200
Subject: [PATCH 2/5] use auth on all DirectoryCrawler HTTP requests

---
 geospaas_harvesting/crawlers.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/geospaas_harvesting/crawlers.py b/geospaas_harvesting/crawlers.py
index 3e79f9b..58248ad 100644
--- a/geospaas_harvesting/crawlers.py
+++ b/geospaas_harvesting/crawlers.py
@@ -76,10 +76,9 @@ def set_initial_state(self):
         """
         raise NotImplementedError()
 
-    @classmethod
-    def _http_get(cls, url, request_parameters=None, max_tries=5, wait_time=5):
+    def _http_get(self, url, request_parameters=None, max_tries=5, wait_time=5):
         """Sends an HTTP GET request, retry in case of failure"""
-        cls.logger.debug("Getting page: '%s'", url)
+        self.logger.debug("Getting page: '%s'", url)
         last_error = None
         for try_index in range(max_tries):
@@ -94,8 +93,8 @@
                     raise
                 else:
                     last_error = error
-                cls.logger.warning('Error while sending request to %s, %d retries left',
-                                   url, max_tries - try_index - 1, exc_info=True)
+                self.logger.warning('Error while sending request to %s, %d retries left',
+                                    url, max_tries - try_index - 1, exc_info=True)
                 time.sleep(wait_time)
                 wait_time *= 2
         raise RuntimeError(f"Max retries reached trying to get {url}") from last_error
@@ -336,6 +335,14 @@ def __eq__(self, other):
                 self.username == other.username and
                 self.password == other.password)
 
+    def _http_get(self, url, request_parameters=None, max_tries=5, wait_time=5):
+        if self.username is not None and self.password is not None:
+            if request_parameters is None:
+                request_parameters = {}
+            request_parameters['auth'] = (self.username, self.password)
+        return super()._http_get(url, request_parameters=request_parameters,
+                                 max_tries=max_tries, wait_time=wait_time)
+
     @property
     def base_url(self):
         """Get the root URL without the path"""
@@ -557,8 +564,6 @@ def _prepend_parent_path(parent_path, paths):
 
     def _list_folder_contents(self, folder_path):
         request_parameters = {}
-        if self.username is not None and self.password is not None:
-            request_parameters['auth'] = (self.username, self.password)
         html = self._http_get(f"{self.base_url}{folder_path}", request_parameters).text
         stripped_folder_path = self._strip_folder_page(folder_path)
         return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
@@ -647,10 +652,9 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
         """
         ddx_url = self.get_ddx_url(dataset_info.url)
         # Get the metadata from the dataset as an XML tree
-        stream = io.BytesIO(utils.http_request('GET', ddx_url, stream=True).content)
+        stream = io.BytesIO(self._http_get(ddx_url, request_parameters={'stream': True}).content)
         # Get all the global attributes of the Dataset into a dictionary
-        extracted_attributes = self._extract_attributes(
-            ET.parse(stream).getroot())
+        extracted_attributes = self._extract_attributes(ET.parse(stream).getroot())
         # add the URL to the attributes passed to metanorm
         self.add_url(dataset_info.url, extracted_attributes)
         # Get the parameters needed to create a geospaas catalog dataset from the global attributes

From 04d87069dcd8821303caeb2c8289ca2574e9b008 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Mon, 10 Jun 2024 15:59:08 +0200
Subject: [PATCH 3/5] add auth to crawler for AVISO provider

---
 geospaas_harvesting/providers/aviso.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/geospaas_harvesting/providers/aviso.py b/geospaas_harvesting/providers/aviso.py
index 286dc23..c2a6f3f 100644
--- a/geospaas_harvesting/providers/aviso.py
+++ b/geospaas_harvesting/providers/aviso.py
@@ -1,11 +1,11 @@
-"""Code for searching MET NO data (https://thredds.met.no/thredds)"""
+"""Code for searching AVISO data (https://tds.aviso.altimetry.fr/thredds)"""
 from .base import Provider, TimeFilterMixin
 from ..arguments import StringArgument
 from ..crawlers import ThreddsCrawler
 
 
 class AVISOProvider(TimeFilterMixin, Provider):
-    """Provider for MET NO's Thredds"""
+    """Provider for AVISO's Thredds"""
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.url = 'https://tds.aviso.altimetry.fr/thredds'
@@ -19,5 +19,7 @@ def _make_crawler(self, parameters):
             '/'.join((self.url, parameters['directory'].lstrip('/'))),
             time_range=(parameters['start_time'], parameters['end_time']),
             include=parameters.get('include'),
-            max_threads=30
+            max_threads=30,
+            username=self.username,
+            password=self.password,
         )

From e374db7be73dcf5d9c3cabeb27de9bd6e32b3cc2 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Tue, 11 Jun 2024 08:44:32 +0000
Subject: [PATCH 4/5] add tests for DirectoryCrawler._http_get

---
 tests/test_generic_crawlers.py | 40 +++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py
index d47fbb5..1340ca7 100644
--- a/tests/test_generic_crawlers.py
+++ b/tests/test_generic_crawlers.py
@@ -80,7 +80,7 @@ def test_http_get_retry(self):
             http_500_error, mock.Mock())
 
         with self.assertLogs(crawlers.Crawler.logger, level=logging.WARNING):
-            crawlers.Crawler._http_get('url', max_tries=5, wait_time=30)
+            crawlers.Crawler()._http_get('url', max_tries=5, wait_time=30)
 
         self.assertEqual(len(mock_request.mock_calls), 5)
         self.assertListEqual(mock_sleep.mock_calls, [mock.call(30 * (2**i)) for i in range(4)])
@@ -96,7 +96,7 @@ def test_http_get_fails_eventually(self):
 
         with self.assertLogs(crawlers.Crawler.logger, level=logging.WARNING), \
                 self.assertRaises(RuntimeError):
-            crawlers.Crawler._http_get('url')
+            crawlers.Crawler()._http_get('url')
 
         self.assertEqual(len(mock_request.mock_calls), 5)
         self.assertEqual(len(mock_sleep.mock_calls), 5)
@@ -108,7 +108,7 @@ def test_http_get_no_retry_error(self):
         with mock.patch('geospaas_harvesting.utils.http_request') as mock_request:
             mock_request.side_effect = requests.TooManyRedirects
             with self.assertRaises(requests.RequestException):
-                self.assertIsNone(crawlers.Crawler._http_get('url'))
+                self.assertIsNone(crawlers.Crawler()._http_get('url'))
 
     def test_http_get_error_on_404_status(self):
         """Test that an exception is raised in case of HTTP error code"""
@@ -117,7 +117,7 @@ def test_http_get_error_on_404_status(self):
         with mock.patch('geospaas_harvesting.utils.http_request') as mock_request:
             mock_request.side_effect = requests.HTTPError(response=response)
             with self.assertRaises(requests.HTTPError):
-                crawlers.Crawler._http_get('http://foo')
+                crawlers.Crawler()._http_get('http://foo')
 
     def test_abstract_get_normalized_attributes(self):
         """get_normalized_attributes() should raise a NotImplementedError"""
@@ -329,6 +329,31 @@ def test_equality(self):
             'http://foo', (datetime(2024, 1, 2), datetime(2024, 1, 3)),
             r'.*\.nc', 'user', 'password'))
 
+    def test_http_get_with_auth(self):
+        """If a username and password are provided, HTTP requests
+        should have an 'auth' parameter containing them
+        """
+        crawler = crawlers.DirectoryCrawler('', username='user', password='pass')
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_get:
+            crawler._http_get('http://foo/bar')
+            crawler._http_get('http://foo/bar', request_parameters={'quz': 'qux'})
+        mock_get.assert_has_calls((
+            mock.call('http://foo/bar', request_parameters={'auth': ('user', 'pass')},
+                      max_tries=5, wait_time=5),
+            mock.call('http://foo/bar', request_parameters={'quz': 'qux', 'auth': ('user', 'pass')},
+                      max_tries=5, wait_time=5),
+        ))
+
+    def test_http_get_no_auth(self):
+        """If no username and password are provided, HTTP requests
+        should not have an 'auth' parameter
+        """
+        crawler = crawlers.DirectoryCrawler('')
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_get:
+            crawler._http_get('http://foo/bar')
+        mock_get.assert_called_with('http://foo/bar', request_parameters=None,
+                                    max_tries=5, wait_time=5)
+
     def test_abstract_list_folder_contents(self):
         """
         A NotImplementedError should be raised if the _list_folder_contents() method
@@ -709,7 +734,8 @@ def test_list_folder_contents_no_auth(self):
             mock_http_get.return_value.text = ''
             crawler = crawlers.HTMLDirectoryCrawler('http://foo')
             crawler._list_folder_contents('/bar')
-            mock_http_get.assert_called_once_with('http://foo/bar', {})
+            mock_http_get.assert_called_once_with('http://foo/bar', request_parameters={},
+                                                  max_tries=5, wait_time=5)
 
     def test_list_folder_contents_with_auth(self):
         """If a username and password are provided, HTTP requests
@@ -719,7 +745,9 @@ def test_list_folder_contents_with_auth(self):
             mock_http_get.return_value.text = ''
             crawler = crawlers.HTMLDirectoryCrawler('http://foo', username='user', password='pass')
             crawler._list_folder_contents('/bar')
-            mock_http_get.assert_called_once_with('http://foo/bar', {'auth': ('user', 'pass')})
+            mock_http_get.assert_called_once_with('http://foo/bar',
+                                                  request_parameters={'auth': ('user', 'pass')},
+                                                  max_tries=5, wait_time=5)
 
     def test_get_normalized_attributes(self):
         """Test that the attributes are gotten using metanorm, and the

From 4316ac80f3e705a41ed465c48f3de55480357f9e Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Tue, 11 Jun 2024 08:52:27 +0000
Subject: [PATCH 5/5] add tests for AVISO provider

---
 tests/providers/test_aviso.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 tests/providers/test_aviso.py

diff --git a/tests/providers/test_aviso.py b/tests/providers/test_aviso.py
new file mode 100644
index 0000000..a788aea
--- /dev/null
+++ b/tests/providers/test_aviso.py
@@ -0,0 +1,30 @@
+# pylint: disable=protected-access
+"""Tests for the AVISO provider"""
+import unittest
+import unittest.mock as mock
+from datetime import datetime, timezone
+
+import geospaas_harvesting.crawlers as crawlers
+from geospaas_harvesting.providers.aviso import AVISOProvider
+
+
+class AVISOProviderTestCase(unittest.TestCase):
+    """Tests for AVISOProvider"""
+
+    def test_make_crawler(self):
+        """Test creating a crawler from parameters"""
+        provider = AVISOProvider(name='test', username='user', password='pass')
+        parameters = {
+            'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
+            'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
+            'directory': 'foo',
+            'include': '.*'
+        }
+        self.assertEqual(
+            provider._make_crawler(parameters),
+            crawlers.ThreddsCrawler(
+                'https://tds.aviso.altimetry.fr/thredds/foo',
+                include='.*',
+                time_range=(datetime(2023, 1, 1, tzinfo=timezone.utc),
+                            datetime(2023, 1, 2, tzinfo=timezone.utc)),
+                username='user', password='pass'))
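
Usage sketch (not part of the patches above): a minimal illustration of how the new
provider can be driven, based only on the code in this series. The 'directory' value
below is a hypothetical THREDDS path; the constructor and search parameters mirror
those exercised in tests/providers/test_aviso.py.

    from datetime import datetime, timezone
    from geospaas_harvesting.providers.aviso import AVISOProvider

    # name, username and password are accepted by the base Provider class,
    # as shown in the tests of patch 5
    provider = AVISOProvider(name='aviso', username='user', password='pass')
    # 'directory' is required and 'include' is an optional regex filter,
    # as declared in AVISOProvider.__init__()
    crawler = provider._make_crawler({
        'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
        'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
        'directory': 'some/dataset/directory',  # hypothetical path
        'include': r'.*\.nc',
    })
    # the ThreddsCrawler receives the credentials, so its HTTP requests are
    # authenticated through DirectoryCrawler._http_get() added in patch 2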