diff --git a/geospaas_harvesting/crawlers.py b/geospaas_harvesting/crawlers.py index 262fb062..e0cad613 100644 --- a/geospaas_harvesting/crawlers.py +++ b/geospaas_harvesting/crawlers.py @@ -99,17 +99,17 @@ class WebDirectoryCrawler(Crawler): f'^.*/{YEAR_PATTERN}/{MONTH_PATTERN}/{DAY_OF_MONTH_PATTERN}/.*$') DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$') - def __init__(self, root_url, time_range=(None, None), excludes=None): + def __init__(self, root_url, time_range=(None, None), include=None): """ `root_url` is the URL of the data repository to explore. `time_range` is a 2-tuple of datetime.datetime objects defining the time range of the datasets returned by the crawler. - `excludes` is the list of string that are the associated url is ignored during - the harvesting process if these strings are found in the crawled url. + `include` is a regular expression string used to filter the crawler's output. + Only URLs matching it are returned. """ self.root_url = urlparse(root_url) self.time_range = time_range - self.excludes = (self.EXCLUDE or []) + (excludes or []) + self.include = re.compile(include) if include else None self.set_initial_state() @property @@ -209,10 +209,6 @@ def _is_folder(self, path): """Returns True if path points to a folder""" raise NotImplementedError("_is_folder is abstract in WebDirectoryCrawler") - def _is_file(self, path): - """Returns True if path points to a file""" - raise NotImplementedError("_is_file is abstract in WebDirectoryCrawler") - def _add_url_to_return(self, path): """ Add a URL to the list of URLs returned by the crawler after @@ -233,15 +229,20 @@ def _add_folder_to_process(self, path): self._to_process.append(path) def _process_folder(self, folder_path): - """Get the contents of a folder and feed the _urls and _to_process attributes""" + """ + Get the contents of a folder and feed the _urls (based on includes) and _to_process + attributes + """ self.LOGGER.info("Looking for resources in '%s'...", folder_path) for path in self._list_folder_contents(folder_path): - # Select paths which do not contain any of the self.excludes strings - if all(excluded_string not in path for excluded_string in self.excludes): - if self._is_folder(path): - self._add_folder_to_process(path) - elif self._is_file(path): - self._add_url_to_return(path) + # deselect paths which contains any of the excludes strings + if self.EXCLUDE and self.EXCLUDE.search(path): + continue + if self._is_folder(path): + self._add_folder_to_process(path) + # select paths which are matched based on input config file + if self.include and self.include.search(path): + self._add_url_to_return(path) def get_download_url(self, resource_url): """ @@ -264,9 +265,6 @@ def _list_folder_contents(self, folder_path): def _is_folder(self, path): return os.path.isdir(path) - def _is_file(self, path): - return os.path.isfile(path) - class HTMLDirectoryCrawler(WebDirectoryCrawler): """Implementation of WebDirectoryCrawler for repositories exposed as HTML pages.""" @@ -285,9 +283,6 @@ def _strip_folder_page(folder_path): def _is_folder(self, path): return path.endswith(self.FOLDERS_SUFFIXES) - def _is_file(self, path): - return path.endswith(self.FILES_SUFFIXES) - @classmethod def _get_links(cls, html): """Returns the list of links contained in an HTML page, passed as a string""" @@ -325,7 +320,7 @@ class OpenDAPCrawler(HTMLDirectoryCrawler): LOGGER = logging.getLogger(__name__ + '.OpenDAPCrawler') FOLDERS_SUFFIXES = ('/contents.html',) FILES_SUFFIXES = ('.nc', '.nc.gz') - 
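As a reference for the switch from substring-based `excludes` to regex filtering, here is a minimal, self-contained sketch of the new selection flow. `TinyCrawler` and its `process()` method are illustrative stand-ins; only the `EXCLUDE`/`include` checks mirror the updated `_process_folder()`:

```python
import re

# Illustrative stand-in for a WebDirectoryCrawler subclass: only the new
# EXCLUDE / include checks from _process_folder() are reproduced here.
class TinyCrawler:
    EXCLUDE = re.compile(r'\?')  # class-level pattern, as in OpenDAPCrawler

    def __init__(self, include=None):
        # 'include' arrives as a plain string (e.g. from the YAML config) and
        # is compiled once, like in the new WebDirectoryCrawler.__init__()
        self.include = re.compile(include) if include else None
        self._urls = []

    def process(self, paths):
        for path in paths:
            # skip anything matching the class-level EXCLUDE pattern
            if self.EXCLUDE and self.EXCLUDE.search(path):
                continue
            # keep only paths matching the configured 'include' regex;
            # with include=None nothing is ever added
            if self.include and self.include.search(path):
                self._urls.append(path)
        return self._urls


print(TinyCrawler(include=r'\.nc$').process(
    ['/bar/baz.nc', '/bar/contents.html?foo=1', '/bar/qux.h5']))
# -> ['/bar/baz.nc']
```

Note that with `include=None` the crawler yields no URLs at all, which is why every harvester entry in the updated sample `harvest.yml` sets `include`.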
EXCLUDE = ['?'] + EXCLUDE = re.compile(r'\?') class ThreddsCrawler(HTMLDirectoryCrawler): @@ -335,7 +330,7 @@ class ThreddsCrawler(HTMLDirectoryCrawler): LOGGER = logging.getLogger(__name__ + '.ThreddsCrawler') FOLDERS_SUFFIXES = ('/catalog.html',) FILES_SUFFIXES = ('.nc',) - EXCLUDE = ['/thredds/catalog.html'] + EXCLUDE = re.compile(r'/thredds/catalog.html$') def get_download_url(self, resource_url): result = None @@ -354,18 +349,17 @@ class FTPCrawler(WebDirectoryCrawler): """ LOGGER = logging.getLogger(__name__ + '.FTPCrawler') - def __init__(self, root_url, time_range=(None, None), excludes=None, - username='anonymous', password='anonymous', files_suffixes=''): + def __init__(self, root_url, time_range=(None, None), include=None, + username='anonymous', password='anonymous'): if not root_url.startswith('ftp://'): raise ValueError("The root url must start with 'ftp://'") self.username = username self.password = password - self.files_suffixes = files_suffixes self.ftp = None - super().__init__(root_url, time_range, excludes) + super().__init__(root_url, time_range, include) def set_initial_state(self): """ @@ -433,9 +427,6 @@ def _is_folder(self, path): else: return True - def _is_file(self, path): - return path.endswith(self.files_suffixes) - class HTTPPaginatedAPICrawler(Crawler): """Base class for crawlers used on repositories exposing a paginated API over HTTP""" diff --git a/geospaas_harvesting/harvest.yml b/geospaas_harvesting/harvest.yml index 56fa0f97..3d76127c 100644 --- a/geospaas_harvesting/harvest.yml +++ b/geospaas_harvesting/harvest.yml @@ -3,43 +3,61 @@ # dump_on_interruption: False # poll_interval: 600 # harvesters: -# OSISAF: -# class: 'OSISAFHarvester' -# max_fetcher_threads: 30 -# # We exclude "EASE-Grid map projections" and "southern hemispheres" from harvesting process -# excludes: ['ease', '_sh_polstere',] -# max_db_threads: 1 -# urls: -# - 'https://thredds.met.no/thredds/catalog/osisaf/met.no/ice/amsr2_conc/catalog.html' -# - 'https://thredds.met.no/thredds/catalog/osisaf/met.no/ice/conc/catalog.html' -# podaac: -# class: 'PODAACHarvester' -# max_fetcher_threads: 30 -# max_db_threads: 1 -# urls: -# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/VIIRS_NPP/NAVO/v1/2014/005/contents.html' -# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/VIIRS_N20/' -# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/VIIRS_NPP/' -# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/MODIS_A/' -# copernicus_sentinel: -# class: 'CopernicusSentinelHarvester' -# max_fetcher_threads: 30 -# max_db_threads: 1 -# url: 'https://scihub.copernicus.eu/apihub/search' -# search_terms: -# - 'platformname:Sentinel-1 AND NOT L0' -# - 'platformname:Sentinel-2 AND NOT L0' -# - 'platformname:Sentinel-3 AND NOT L0' -# username: 'username' -# # Environment variable name -# password: !ENV 'COPERNICUS_OPEN_HUB_PASSWORD' -# FTP_jaxa: -# class: 'FTPHarvester' -# max_fetcher_threads: 30 -# max_db_threads: 1 -# username: username -# password: !ENV 'JAXA_PASSWORD' -# fileformat: '.h5' -# urls: -# - 'ftp://ftp.gportal.jaxa.jp/standard/GCOM-W/GCOM-W.AMSR2/L3.SST_25/3/2012/07/' +# radarsat_local: +# class: 'LOCALHarvester' +# include: 'RS2_\w+(?!.)' +# max_fetcher_threads: 1 +# max_db_threads: 1 +# paths: +# - "/src/sample/test_multi_nansat" +# FTP_jaxa: +# class: 'FTPHarvester' +# max_fetcher_threads: 1 +# max_db_threads: 1 +# username: username +# password: !ENV 'JAXA_PASSWORD' +# include: '\.h5$' +# urls: +# - 
'ftp://ftp.gportal.jaxa.jp/standard/GCOM-W/GCOM-W.AMSR2/L3.SST_25/3/2012/07/' +# OSISAF: +# class: 'OSISAFHarvester' +# max_fetcher_threads: 1 +# # We include "_nh_polstere" in order to only harvest the northen-hemisphere data +# include: '_nh_polstere' +# max_db_threads: 1 +# #time_range: +# # - !ENV HARVESTING_START_TIME +# # - !ENV HARVESTING_END_TIME +# urls: +# - 'https://thredds.met.no/thredds/catalog/osisaf/met.no/ice/amsr2_conc/catalog.html' +# - 'https://thredds.met.no/thredds/catalog/osisaf/met.no/ice/conc/catalog.html' +# - 'https://thredds.met.no/thredds/catalog/osisaf/met.no/ice/type/catalog.html' +# - 'https://thredds.met.no/thredds/catalog/osisaf/met.no/ice/drift_mr/catalog.html' +# - 'https://thredds.met.no/thredds/catalog/osisaf/met.no/ice/drift_lr/merged/catalog.html' +# +# podaac: +# class: 'PODAACHarvester' +# max_fetcher_threads: 1 +# max_db_threads: 1 +# include: '\.nc$|\.h5$' +# urls: +# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/VIIRS_NPP/NAVO/v1/2014/005/contents.html' +# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/VIIRS_N20/' +# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/VIIRS_NPP/' +# - 'https://opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L2P/MODIS_A/' +# copernicus_sentinel: +# class: 'CopernicusSentinelHarvester' +# max_fetcher_threads: 30 +# max_db_threads: 1 +# include: '.*' +# #time_range: +# # - !ENV HARVESTING_START_TIME +# # - !ENV HARVESTING_END_TIME +# url: 'https://scihub.copernicus.eu/apihub/search' +# search_terms: +# - 'platformname:Sentinel-1 AND NOT L0' +# - 'platformname:Sentinel-2 AND NOT L0' +# - 'platformname:Sentinel-3 AND NOT L0' +# username: "username" +# password: !ENV COPERNICUS_OPEN_HUB_PASSWORD ... diff --git a/geospaas_harvesting/harvesters.py b/geospaas_harvesting/harvesters.py index 0296be7f..74817107 100644 --- a/geospaas_harvesting/harvesters.py +++ b/geospaas_harvesting/harvesters.py @@ -95,10 +95,10 @@ class for harvesting online data sources that rely on webpages (and most of the def __init__(self, **config): super().__init__(**config) - if 'excludes' in config: - if not isinstance(config['excludes'], list): + if 'include' in config: + if not isinstance(config['include'], str): raise HarvesterConfigurationError( - "'excludes' field must be fed with a python list of excluded names ") + "The 'include' field must be fed with a regex matching URLs to include") def _create_crawlers(self): if self.crawler is None: @@ -107,7 +107,7 @@ def _create_crawlers(self): try: return [ self.crawler(url, time_range=(self.get_time_range()), - excludes=self.config.get('excludes', None)) + include=self.config.get('include', None)) for url in self.config['urls'] ] except TypeError as error: @@ -150,9 +150,8 @@ def _create_crawlers(self): root_url=url, username=self.config.get('username', None), password=self.config.get('password'), - files_suffixes=self.config.get('fileformat', None), time_range=(self.get_time_range()), - excludes=self.config.get('excludes', None) + include=self.config.get('include', None) ) for url in self.config['urls'] ] @@ -200,3 +199,17 @@ def _create_ingester(self): if parameter_name in self.config: parameters[parameter_name] = self.config[parameter_name] return ingesters.CreodiasEOFinderIngester(**parameters) + + +class LOCALHarvester(WebDirectoryHarvester): + """ Harvester class for some specific local files """ + def _create_crawlers(self): + return [ + crawlers.LocalDirectoryCrawler( + url, + include = self.config.get('include', None), + 
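+                # 'include' is optional in the config; if it is absent the
+                # crawler's include attribute stays None and _process_folder()
+                # returns no URLs, so LOCALHarvester configs are expected to set it.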
time_range = self.get_time_range() + ) + for url in self.config['paths'] + ] + ingester = ingesters.NansatIngester diff --git a/geospaas_harvesting/ingesters.py b/geospaas_harvesting/ingesters.py index fdcd74be..ea58f796 100644 --- a/geospaas_harvesting/ingesters.py +++ b/geospaas_harvesting/ingesters.py @@ -11,6 +11,7 @@ import uuid import xml.etree.ElementTree as ET from urllib.parse import urlparse +from dateutil.tz import tzutc import dateutil.parser import django.db import django.db.utils @@ -28,7 +29,7 @@ ISOTopicCategory, Location, Parameter, Platform) from nansat import Nansat from metanorm.handlers import GeospatialMetadataHandler - +from metanorm.utils import get_cf_or_wkv_standard_name logging.getLogger(__name__).addHandler(logging.NullHandler()) @@ -520,6 +521,16 @@ def _get_normalized_attributes(self, dataset_info, *args, **kwargs): normalized_attributes = {} n_points = int(kwargs.get('n_points', 10)) nansat_options = kwargs.get('nansat_options', {}) + url_scheme = urlparse(dataset_info).scheme + if not 'http' in url_scheme and not 'ftp' in url_scheme: + normalized_attributes['geospaas_service_name'] = FILE_SERVICE_NAME + normalized_attributes['geospaas_service'] = LOCAL_FILE_SERVICE + elif 'http' in url_scheme and not 'ftp' in url_scheme: + normalized_attributes['geospaas_service_name'] = DAP_SERVICE_NAME + normalized_attributes['geospaas_service'] = OPENDAP_SERVICE + elif 'ftp' in url_scheme: + raise ValueError("LOCALHarvester (which uses NansatIngester) is only for local file" + " addresses or http addresses, not for ftp protocol") # Open file with Nansat nansat_object = Nansat(nansat_filename(dataset_info), **nansat_options) @@ -527,22 +538,13 @@ def _get_normalized_attributes(self, dataset_info, *args, **kwargs): # get metadata from Nansat and get objects from vocabularies n_metadata = nansat_object.get_metadata() - # set service info attributes - url_scheme = urlparse(dataset_info).scheme - if 'http' in url_scheme: - normalized_attributes['geospaas_service_name'] = DAP_SERVICE_NAME - normalized_attributes['geospaas_service'] = OPENDAP_SERVICE - else: - normalized_attributes['geospaas_service_name'] = FILE_SERVICE_NAME - normalized_attributes['geospaas_service'] = LOCAL_FILE_SERVICE - # set compulsory metadata (source) normalized_attributes['entry_title'] = n_metadata.get('entry_title', 'NONE') normalized_attributes['summary'] = n_metadata.get('summary', 'NONE') normalized_attributes['time_coverage_start'] = dateutil.parser.parse( - n_metadata['time_coverage_start']) + n_metadata['time_coverage_start']).replace(tzinfo=tzutc()) normalized_attributes['time_coverage_end'] = dateutil.parser.parse( - n_metadata['time_coverage_end']) + n_metadata['time_coverage_end']).replace(tzinfo=tzutc()) normalized_attributes['platform'] = json.loads(n_metadata['platform']) normalized_attributes['instrument'] = json.loads(n_metadata['instrument']) normalized_attributes['specs'] = n_metadata.get('specs', '') @@ -551,15 +553,30 @@ def _get_normalized_attributes(self, dataset_info, *args, **kwargs): # set optional ForeignKey metadata from Nansat or from defaults normalized_attributes['gcmd_location'] = n_metadata.get( 'gcmd_location', pti.get_gcmd_location('SEA SURFACE')) - normalized_attributes['provider'] = n_metadata.get( - 'data_center', pti.get_gcmd_provider('NERSC')) + normalized_attributes['provider'] = pti.get_gcmd_provider( + n_metadata.get('provider', 'NERSC')) normalized_attributes['iso_topic_category'] = n_metadata.get( 'ISO_topic_category', 
pti.get_iso19115_topic_category('Oceans')) # Find coverage to set number of points in the geolocation - if len(nansat_object.vrt.dataset.GetGCPs()) > 0: + if nansat_object.vrt.dataset.GetGCPs(): nansat_object.reproject_gcps() normalized_attributes['location_geometry'] = GEOSGeometry( - nansat_object.get_border_wkt(nPoints=n_points), srid=4326) + nansat_object.get_border_wkt(n_points=n_points), srid=4326) + + json_dumped_dataset_parameters = n_metadata.get('dataset_parameters', None) + if json_dumped_dataset_parameters: + json_loads_result = json.loads(json_dumped_dataset_parameters) + if isinstance(json_loads_result, list): + normalized_attributes['dataset_parameters'] = [ + get_cf_or_wkv_standard_name(dataset_param) + for dataset_param in json_loads_result + ] + else: + self.LOGGER.error( + "'dataset_parameters' section of metadata is not a json-dumped python list", + exc_info=True) + raise TypeError( + "'dataset_parameters' section of metadata is not a json-dumped python list") return normalized_attributes diff --git a/runtests.py b/runtests.py index 3538cf64..f936fa5c 100644 --- a/runtests.py +++ b/runtests.py @@ -18,6 +18,6 @@ test_module = f".{sys.argv[1]}" if len(sys.argv) >= 2 else '' TestRunner = get_runner(settings) - test_runner = TestRunner() + test_runner = TestRunner(interactive=False) failures = test_runner.run_tests(["tests" + test_module]) sys.exit(bool(failures)) diff --git a/tests/data/nansat/arc_metno_dataset.nc b/tests/data/nansat/arc_metno_dataset.nc deleted file mode 100644 index 406ba2c0..00000000 Binary files a/tests/data/nansat/arc_metno_dataset.nc and /dev/null differ diff --git a/tests/test_crawlers.py b/tests/test_crawlers.py index 148410f9..9d65dceb 100644 --- a/tests/test_crawlers.py +++ b/tests/test_crawlers.py @@ -5,9 +5,10 @@ import json import logging import os -import os.path +import re import unittest import unittest.mock as mock +from unittest.mock import call from datetime import datetime, timezone from urllib.parse import urlparse @@ -56,14 +57,6 @@ def test_is_folder(self): with self.assertRaises(NotImplementedError): crawler._is_folder('') - def test_is_file(self): - """ - A NotImplementedError should be raised if the _is_file() method - is accessed directly on the WebDirectoryCrawler class - """ - crawler = crawlers.WebDirectoryCrawler('') - with self.assertRaises(NotImplementedError): - crawler._is_file('') def test_get_download_url(self): """ @@ -134,33 +127,31 @@ def test_add_folder_to_process(self): def test_process_folder_with_file(self): """_process_folder() should feed the _urls stack - with file paths which are not excluded + with only file paths which are included """ - crawler = crawlers.WebDirectoryCrawler('http://foo/bar') - crawler.excludes = ['.gz'] + crawler = crawlers.WebDirectoryCrawler('http://foo/bar', include='\.nc$') + crawler.EXCLUDE = re.compile(r'\.h5$') crawler.LOGGER = mock.Mock() with mock.patch.object(crawler, '_list_folder_contents') as mock_folder_contents, \ - mock.patch.object(crawler, '_is_file', return_value=True), \ mock.patch.object(crawler, '_is_folder', return_value=False), \ mock.patch.object(crawler, '_add_url_to_return') as mock_add_url: mock_folder_contents.return_value = ['/bar/baz.nc', '/bar/qux.gz'] crawler._process_folder('') - mock_add_url.assert_called_with('/bar/baz.nc') + mock_add_url.assert_called_once_with('/bar/baz.nc') def test_process_folder_with_folder(self): """_process_folder() should feed the _to_process stack with folder paths which are not excluded """ - crawler = 
crawlers.WebDirectoryCrawler('http://foo/bar') - crawler.excludes = ['qux'] + crawler = crawlers.WebDirectoryCrawler('http://foo/bar', include='baz') + crawler.EXCLUDE = re.compile(r'qux') crawler.LOGGER = mock.Mock() with mock.patch.object(crawler, '_list_folder_contents') as mock_folder_contents, \ - mock.patch.object(crawler, '_is_file', return_value=False), \ mock.patch.object(crawler, '_is_folder', return_value=True), \ mock.patch.object(crawler, '_add_folder_to_process') as mock_add_folder: mock_folder_contents.return_value = ['/bar/baz', '/bar/qux'] crawler._process_folder('') - mock_add_folder.assert_called_with('/bar/baz') + mock_add_folder.assert_called_once_with('/bar/baz') def test_get_year_folder_coverage(self): """Get the correct time range from a year folder""" @@ -341,14 +332,6 @@ def test_is_folder(self): with mock.patch('os.path.isdir', return_value=False): self.assertFalse(self.crawler._is_folder(''), "_is_folder() should return False") - def test_is_file(self): - """_is_file() should return True if the path points - to a regular file, False otherwise""" - with mock.patch('os.path.isfile', return_value=True): - self.assertTrue(self.crawler._is_file(''), "_is_file() should return True") - with mock.patch('os.path.isfile', return_value=False): - self.assertFalse(self.crawler._is_file(''), "_is_file() should return False") - class HTMLDirectoryCrawlerTestCase(unittest.TestCase): """Tests for the HTMLDirectoryCrawler crawler""" @@ -498,7 +481,7 @@ def test_process_folder(self): Explore root page and make sure the _url and _to_process attributes of the crawler have the right values """ - crawler = crawlers.OpenDAPCrawler(self.TEST_DATA['root']['urls'][0]) + crawler = crawlers.OpenDAPCrawler(self.TEST_DATA['root']['urls'][0],include='\.nc$') with self.assertLogs(crawler.LOGGER): crawler._process_folder(crawler._to_process.pop()) self.assertListEqual(crawler._urls, [self.TEST_DATA['dataset']['urls'][0]]) @@ -506,7 +489,8 @@ def test_process_folder(self): def test_process_folder_with_duplicates(self): """If the same URL is present twice in the page, it should only be processed once""" - crawler = crawlers.OpenDAPCrawler(self.TEST_DATA['root_duplicates']['urls'][0]) + crawler = crawlers.OpenDAPCrawler(self.TEST_DATA['root_duplicates']['urls'][0], + include='\.nc$') with self.assertLogs(crawler.LOGGER): crawler._process_folder(crawler._to_process.pop()) self.assertListEqual(crawler._urls, [self.TEST_DATA['dataset']['urls'][1]]) @@ -524,7 +508,7 @@ def test_process_folder_with_time_restriction(self): the crawler's time range. 
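As background for the day-of-year restriction exercised here, a rough sketch (hypothetical helper name and simplified pattern, not the crawler's real folder-coverage code) of how a `/YYYY/DDD/` folder can be checked against a time range:

```python
import re
from datetime import datetime, timezone

# Simplified version of the idea behind WebDirectoryCrawler.DAY_OF_YEAR_MATCHER;
# the real pattern and folder-coverage logic live in crawlers.py.
DAY_OF_YEAR = re.compile(r'^.*/(?P<year>\d{4})/(?P<day>\d{3})(/.*)?$')

def folder_overlaps(path, start, end):
    """Return True if a /YYYY/DDD/ folder overlaps the [start, end] range."""
    match = DAY_OF_YEAR.match(path)
    if not match:
        return True  # no date in the path: keep it and let deeper levels decide
    day_start = datetime.strptime(
        f"{match['year']} {match['day']}", '%Y %j').replace(tzinfo=timezone.utc)
    day_end = day_start.replace(hour=23, minute=59, second=59)
    return day_start <= end and start <= day_end

print(folder_overlaps(
    'https://foo/2019/046/',
    datetime(2019, 2, 15, 11, tzinfo=timezone.utc),
    datetime(2019, 2, 15, 13, tzinfo=timezone.utc)))  # True
```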
""" crawler = crawlers.OpenDAPCrawler( - self.TEST_DATA['folder_day_of_year']['urls'][0], + self.TEST_DATA['folder_day_of_year']['urls'][0], include='\.nc$', time_range=(datetime(2019, 2, 15, 11, 0, 0), datetime(2019, 2, 15, 13, 0, 0))) with self.assertLogs(crawler.LOGGER): crawler._process_folder(crawler._to_process.pop()) @@ -540,7 +524,7 @@ def test_process_folder_with_time_restriction(self): def test_iterating(self): """Test the call to the __iter__ method""" crawler = crawlers.OpenDAPCrawler( - self.TEST_DATA['root']['urls'][0], + self.TEST_DATA['root']['urls'][0], include='\.nc$', time_range=(datetime(2019, 2, 14, 0, 0, 0), datetime(2019, 2, 14, 9, 0, 0))) crawler_iterator = iter(crawler) @@ -863,7 +847,7 @@ def emulate_cwd_of_ftp(self, name): def test_ftp_correct_navigation(self, mock_ftp): """check that file URLs and folders paths are added to the right stacks""" - test_crawler = crawlers.FTPCrawler('ftp://foo', files_suffixes='.gz') + test_crawler = crawlers.FTPCrawler('ftp://foo', include='\.gz$') test_crawler.ftp.nlst.return_value = ['file1.gz', 'folder_name', 'file3.bb', 'file2.gz', ] test_crawler.ftp.cwd = self.emulate_cwd_of_ftp test_crawler.ftp.host = '' @@ -883,7 +867,7 @@ def test_ftp_correct_exception(self, mock_ftp): """ test_crawler = crawlers.FTPCrawler( - 'ftp://', username="d", password="d", files_suffixes='.gz') + 'ftp://', username="d", password="d", include='\.gz$') mock_ftp.side_effect = ftplib.error_perm("503") test_crawler.set_initial_state() diff --git a/tests/test_harvesters.py b/tests/test_harvesters.py index 3d5e6088..09f89331 100644 --- a/tests/test_harvesters.py +++ b/tests/test_harvesters.py @@ -1,10 +1,10 @@ """Tests for the harvesters""" #pylint: disable=protected-access +import re import unittest import unittest.mock as mock from datetime import datetime - from geospaas.vocabularies.models import Parameter import geospaas_harvesting.crawlers as crawlers @@ -191,33 +191,17 @@ def test_creodias_harvester(self): self.assertIsInstance(harvester._current_crawler, crawlers.CreodiasEOFinderCrawler) self.assertIsInstance(harvester._ingester, ingesters.CreodiasEOFinderIngester) - def test_osisaf_harvester_extra_excludes(self): - """ extra excludes should have passed by the excludes as a list in configuration file. + def test_osisaf_harvester_include(self): + """ include criteria should have passed by the "includes" as a regex in configuration file. 
Otherwise, accossiated error must be raised """ harvester = harvesters.OSISAFHarvester(urls=[''], max_fetcher_threads=1, max_db_threads=1, - excludes=['ease', '_sh_polstere', ]) - self.assertListEqual(harvester._current_crawler.excludes, - crawlers.ThreddsCrawler.EXCLUDE + ['ease', '_sh_polstere']) + include='ease|_sh_polstere') + self.assertEqual(harvester._current_crawler.include, re.compile('ease|_sh_polstere')) harvester = harvesters.OSISAFHarvester(urls=[''], max_fetcher_threads=1, max_db_threads=1) - self.assertListEqual(harvester._current_crawler.excludes, crawlers.ThreddsCrawler.EXCLUDE) - - with self.assertRaises(HarvesterConfigurationError): + self.assertIsNone(harvester._current_crawler.include) + with self.assertRaises(HarvesterConfigurationError):#incorrectly passsed as a list harvester = harvesters.OSISAFHarvester( - urls=[''], max_fetcher_threads=1, max_db_threads=1, excludes='ease') - - def test_extra_excludes_with_no_CLASS_EXCLUDE(self): - """ shall return the excludes from the config file """ - class TestCrawler(crawlers.WebDirectoryCrawler): - EXCLUDE = None - - class TestHarvester(harvesters.WebDirectoryHarvester): - ingester = ingesters.DDXIngester - crawler = TestCrawler - harvester = TestHarvester(urls=[''], max_fetcher_threads=1, max_db_threads=1, - excludes=['ease', '_sh_polstere', ]) - self.assertListEqual(harvester._current_crawler.excludes, ['ease', '_sh_polstere']) - harvester = TestHarvester(urls=[''], max_fetcher_threads=1, max_db_threads=1,) - self.assertEqual(harvester._current_crawler.excludes, []) + urls=[''], max_fetcher_threads=1, max_db_threads=1, include=['ease']) class HarvesterExceptTestCase(unittest.TestCase): diff --git a/tests/test_ingesters.py b/tests/test_ingesters.py index cba32ed3..f07b07c0 100644 --- a/tests/test_ingesters.py +++ b/tests/test_ingesters.py @@ -18,7 +18,8 @@ from geospaas.catalog.models import Dataset, DatasetURI from geospaas.vocabularies.models import DataCenter, ISOTopicCategory, Parameter import geospaas_harvesting.ingesters as ingesters - +from geospaas.catalog.managers import (DAP_SERVICE_NAME, FILE_SERVICE_NAME, + LOCAL_FILE_SERVICE, OPENDAP_SERVICE) class IngesterTestCase(django.test.TransactionTestCase): """Test the base ingester class""" @@ -739,8 +740,8 @@ def test_get_normalized_attributes(self): """ dataset_info = {'services': {'download': {'url': 'http://something'}}} with mock.patch.object( - self.ingester._metadata_handler, 'get_parameters', return_value={'foo': 'bar'}), \ - mock.patch.object(self.ingester, 'add_url') as mock_add_url: + self.ingester._metadata_handler, 'get_parameters', return_value={'foo': 'bar'}), \ + mock.patch.object(self.ingester, 'add_url') as mock_add_url: self.assertDictEqual( self.ingester._get_normalized_attributes(dataset_info), { @@ -821,21 +822,46 @@ def setUp(self): self.mock_param_count = self.patcher_param_count.start() self.mock_param_count.return_value = 2 + self.patcher_get_metadata = mock.patch('geospaas_harvesting.ingesters.Nansat') + self.mock_get_metadata = self.patcher_get_metadata.start() + + self.mock_get_metadata.return_value.get_border_wkt.return_value = ( + 'POLYGON((24.88 68.08,22.46 68.71,19.96 69.31,17.39 69.87,24.88 68.08))') + def tearDown(self): self.patcher_param_count.stop() + self.patcher_get_metadata.stop() def test_normalize_netcdf_attributes_with_nansat(self): """Test the ingestion of a netcdf file using nansat""" + self.mock_get_metadata.return_value.get_metadata.side_effect = [ + {'bulletin_type': 'Forecast', 'Conventions': 'CF-1.4', 'field_date': 
'2017-05-29', + 'field_type': 'Files based on file type nersc_daily', + 'filename': '/vsimem/343PBWM116.vrt', 'Forecast_range': '10 days', + 'history': '20170521:Created by program hyc2proj, version V0.3', + 'institution': 'MET Norway, Henrik Mohns plass 1, N-0313 Oslo, Norway', + 'instrument': + '{"Category": "In Situ/Laboratory Instruments", "Class": "Data Analysis", ' + '"Type": "Environmental Modeling", "Subtype": "", "Short_Name": "Computer", ' + '"Long_Name": "Computer"}', + 'platform': + '{"Category": "Models/Analyses", "Series_Entity": "", "Short_Name": "MODELS", ' + '"Long_Name": ""}', + 'references': 'http://marine.copernicus.eu', 'source': 'NERSC-HYCOM model fields', + 'time_coverage_end': '2017-05-27T00:00:00', 'time_coverage_start': + '2017-05-18T00:00:00', + 'title': + 'Arctic Ocean Physics Analysis and Forecast, 12.5km daily mean ' + '(dataset-topaz4-arc-myoceanv2-be)', + 'dataset_parameters': '["surface_backwards_scattering_coefficient_of_radar_wave"]'}] ingester = ingesters.NansatIngester() - normalized_attributes = ingester._get_normalized_attributes( - os.path.join(os.path.dirname(__file__), 'data/nansat/arc_metno_dataset.nc')) - + normalized_attributes = ingester._get_normalized_attributes('') self.assertEqual(normalized_attributes['entry_title'], 'NONE') self.assertEqual(normalized_attributes['summary'], 'NONE') self.assertEqual(normalized_attributes['time_coverage_start'], datetime( - year=2017, month=5, day=18, hour=0, minute=0, second=0)) + year=2017, month=5, day=18, hour=0, minute=0, second=0, tzinfo=tzutc())) self.assertEqual(normalized_attributes['time_coverage_end'], datetime( - year=2017, month=5, day=27, hour=0, minute=0, second=0)) + year=2017, month=5, day=27, hour=0, minute=0, second=0, tzinfo=tzutc())) self.assertEqual(normalized_attributes['instrument']['Short_Name'], 'Computer') self.assertEqual(normalized_attributes['instrument']['Long_Name'], 'Computer') @@ -850,16 +876,7 @@ def test_normalize_netcdf_attributes_with_nansat(self): self.assertEqual(normalized_attributes['platform']['Series_Entity'], '') expected_geometry = GEOSGeometry(( - 'POLYGON((' - '20.7042 89.9999,24.9957 89.9999,28.0373 89.9998,30.2939 89.9998,32.0298 89.9998,' - '33.4042 89.9998,34.5181 89.9998,35.4387 89.9997,36.2117 89.9997,36.8699 89.9997,' - '37.6088 89.9997,37.6088 89.9997,33.5816 89.9997,29.6653 89.9996,25.8904 89.9996,' - '22.28 89.9996,18.8504 89.9996,15.611 89.9996,12.5653 89.9996,9.7123 89.9996,' - '7.0469 89.9996,4.1286 89.9995,4.1286 89.9995,1.3791 89.9996,-0.8831 89.9996,' - '-3.3327 89.9996,-5.984 89.9996,-8.85 89.9996,-11.9418 89.9996,-15.2668 89.9997,' - '-18.8277 89.9997,-22.6199 89.9997,-26.6303 89.9997,-26.6303 89.9997,-24.7751 89.9997,' - '-22.9012 89.9997,-20.6677 89.9998,-17.9691 89.9998,-14.6602 89.9998,-10.5392 89.9998,' - '-5.3282 89.9998,1.34 89.9999,9.8983 89.9999,20.7042 89.9999))'), srid=4326) + 'POLYGON((24.88 68.08,22.46 68.71,19.96 69.31,17.39 69.87,24.88 68.08))'), srid=4326) # This fails, which is why string representations are compared. Any explanation is welcome. 
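Ahead of the `dataset_parameters` assertions below, a standalone sketch of the new validation added in `NansatIngester._get_normalized_attributes()`; `parse_dataset_parameters` is a hypothetical helper, and the real code additionally maps each name through `metanorm.utils.get_cf_or_wkv_standard_name`:

```python
import json

# Only a JSON-dumped Python list is accepted, as produced by the mappers.
def parse_dataset_parameters(raw):
    parsed = json.loads(raw)
    if not isinstance(parsed, list):
        raise TypeError(
            "'dataset_parameters' section of metadata is not a json-dumped python list")
    return parsed

print(parse_dataset_parameters(
    '["surface_backwards_scattering_coefficient_of_radar_wave"]'))
# parse_dataset_parameters("{}") raises TypeError, as the exception-handling test expects
```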
# self.assertTrue(normalized_attributes['location_geometry'].equals(expected_geometry)) @@ -877,6 +894,24 @@ def test_normalize_netcdf_attributes_with_nansat(self): self.assertEqual( normalized_attributes['gcmd_location']['Location_Category'], 'VERTICAL LOCATION') self.assertEqual(normalized_attributes['gcmd_location']['Location_Type'], 'SEA SURFACE') + self.assertEqual( + normalized_attributes['dataset_parameters'], + [ + OrderedDict( + [('standard_name', 'surface_backwards_scattering_coefficient_of_radar_wave'), + ('canonical_units', '1'), + ('grib', ''), + ('amip', ''), + ('description', + 'The scattering/absorption/attenuation coefficient is assumed to be an ' + 'integral over all wavelengths, unless a coordinate of ' + 'radiation_wavelength is included to specify the wavelength. Scattering of' + ' radiation is its deflection from its incident path without loss of ' + 'energy. Backwards scattering refers to the sum of scattering into all ' + 'backward angles i.e. scattering_angle exceeding pi/2 radians. A ' + 'scattering_angle should not be specified with this quantity.') + ]) + ]) # TODO: make this work # def test_ingest_dataset_twice_different_urls(self): @@ -893,3 +928,91 @@ def test_normalize_netcdf_attributes_with_nansat(self): # self.assertTrue(logger_cm.records[0].msg.endswith('already exists in the database.')) # self.assertEqual(Dataset.objects.count(), initial_datasets_count + 1) + + def test_exception_handling_of_bad_development_of_mappers(self): + """Test the exception handling of bad development of 'dataset_parameters' of metadata. + ANY mapper should return a python list as 'dataset_parameters' of metadata.""" + self.mock_get_metadata.return_value.get_metadata.side_effect = [ + { + 'time_coverage_end': '2017-05-27T00:00:00', 'time_coverage_start': + '2017-05-18T00:00:00', + 'platform': + '{"Category": "Models/Analyses", "Series_Entity": "", "Short_Name": "MODELS", ' + '"Long_Name": ""}', + 'instrument': + '{"Category": "In Situ/Laboratory Instruments", "Class": "Data Analysis", ' + '"Type": "Environmental Modeling", "Subtype": "", "Short_Name": "Computer", ' + '"Long_Name": "Computer"}', + 'dataset_parameters': "{}"}] + ingester = ingesters.NansatIngester() + with self.assertRaises(TypeError) as err: + normalized_attributes = ingester._get_normalized_attributes('') + self.assertEqual( + err.exception.args[0], + "'dataset_parameters' section of metadata is not a json-dumped python list") + + def test_usage_of_nansat_ingester_with_http_protocol_in_the_OPENDAP_cases(self): + """LOCALHarvester(which uses NansatIngester) can be used for `OPENDAP provided` files """ + ingester = ingesters.NansatIngester() + self.mock_get_metadata.return_value.get_metadata.side_effect = [{ + 'time_coverage_end': '2017-05-27T00:00:00', 'time_coverage_start': + '2017-05-18T00:00:00', + 'platform': + '{"Category": "Models/Analyses", "Series_Entity": "", "Short_Name": "MODELS", ' + '"Long_Name": ""}', + 'instrument': + '{"Category": "In Situ/Laboratory Instruments", "Class": "Data Analysis", ' + '"Type": "Environmental Modeling", "Subtype": "", "Short_Name": "Computer", ' + '"Long_Name": "Computer"}', + }] + normalized_attributes = ingester._get_normalized_attributes('http://') + self.assertEqual(normalized_attributes['geospaas_service_name'], DAP_SERVICE_NAME) + self.assertEqual(normalized_attributes['geospaas_service'], OPENDAP_SERVICE) + + def test_usage_of_nansat_ingester_with_local_file(self): + """LOCALHarvester(which uses NansatIngester) can be used for local files """ + ingester = 
ingesters.NansatIngester() + self.mock_get_metadata.return_value.get_metadata.side_effect = [{ + 'time_coverage_end': '2017-05-27T00:00:00', 'time_coverage_start': + '2017-05-18T00:00:00', + 'platform': + '{"Category": "Models/Analyses", "Series_Entity": "", "Short_Name": "MODELS", ' + '"Long_Name": ""}', + 'instrument': + '{"Category": "In Situ/Laboratory Instruments", "Class": "Data Analysis", ' + '"Type": "Environmental Modeling", "Subtype": "", "Short_Name": "Computer", ' + '"Long_Name": "Computer"}', + }] + normalized_attributes = ingester._get_normalized_attributes('/src/blabla') + self.assertEqual(normalized_attributes['geospaas_service_name'], FILE_SERVICE_NAME) + self.assertEqual(normalized_attributes['geospaas_service'], LOCAL_FILE_SERVICE) + + + def test_exception_handling_of_bad_inputting_of_nansat_ingester_with_ftp_protocol(self): + """LOCALHarvester(which uses NansatIngester) is only for local file addresses""" + ingester = ingesters.NansatIngester() + self.mock_get_metadata.return_value.get_metadata.side_effect = [''] + with self.assertRaises(ValueError) as err: + normalized_attributes = ingester._get_normalized_attributes('ftp://') + self.assertEqual( + err.exception.args[0], + "LOCALHarvester (which uses NansatIngester) is only for local file addresses or http " + "addresses, not for ftp protocol") + + def test_reprojection_based_on_gcps(self): + """Nansat ingester should reproject if there is any GC point in the metadata""" + self.mock_get_metadata.return_value.vrt.dataset.GetGCPs.return_value = True + self.mock_get_metadata.return_value.get_metadata.side_effect = [{ + 'time_coverage_end': '2017-05-27T00:00:00', 'time_coverage_start': + '2017-05-18T00:00:00', + 'platform': + '{"Category": "Models/Analyses", "Series_Entity": "", "Short_Name": "MODELS", ' + '"Long_Name": ""}', + 'instrument': + '{"Category": "In Situ/Laboratory Instruments", "Class": "Data Analysis", ' + '"Type": "Environmental Modeling", "Subtype": "", "Short_Name": "Computer", ' + '"Long_Name": "Computer"}', + }] + ingester = ingesters.NansatIngester() + normalized_attributes = ingester._get_normalized_attributes('') + self.mock_get_metadata.return_value.reproject_gcps.assert_called_once()
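For completeness, a standalone sketch of the URL-scheme dispatch that the service-related tests above exercise. The returned strings are placeholders; the real code assigns the `DAP_SERVICE_NAME` / `OPENDAP_SERVICE` / `FILE_SERVICE_NAME` / `LOCAL_FILE_SERVICE` constants imported from `geospaas.catalog.managers`:

```python
from urllib.parse import urlparse

# Sketch of the scheme dispatch added to NansatIngester._get_normalized_attributes().
def resolve_service(dataset_info):
    scheme = urlparse(dataset_info).scheme
    if 'ftp' in scheme:
        raise ValueError(
            'LOCALHarvester (which uses NansatIngester) is only for local file'
            ' addresses or http addresses, not for ftp protocol')
    if 'http' in scheme:
        return 'DAP_SERVICE_NAME', 'OPENDAP_SERVICE'
    return 'FILE_SERVICE_NAME', 'LOCAL_FILE_SERVICE'


print(resolve_service('/src/blabla'))                                   # local file service
print(resolve_service('https://opendap.jpl.nasa.gov/opendap/file.nc'))  # DAP / OPENDAP
# resolve_service('ftp://foo') raises ValueError, matching the new ftp test
```

The same hunk in ingesters.py also attaches `tzutc()` to `time_coverage_start`/`time_coverage_end`, which is why the expected datetimes in `test_normalize_netcdf_attributes_with_nansat` now carry `tzinfo=tzutc()`.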