From 4be4f045bc4e86b1a3630f824f770626d0a7ae87 Mon Sep 17 00:00:00 2001 From: Miles Alan Date: Thu, 10 Oct 2024 14:55:21 -0400 Subject: [PATCH 1/2] Schema: Add extracted_{filesize,calendar_start,calendar_end} New properties for schedule schema: extracted_filesize: The filesize in bytes of GTFS archive extracted extracted_calendar_start: Earliest date referenced in calendar/calendar_dates extracted_calendar_end: Latest date referenced in calendar/calendar_dates Also adds related helper functions: extract_gtfs_calendar_range: Extract calendar range from a GTFS archive get_filesize: Gets the filesize in bytes given a filepath is_gtfs_yyyymmdd_format: Determines if date is in GTFS YYYYMMDD format --- README.md | 7 +- schemas/gtfs_schedule_source_schema.json | 87 ++++++++++-------------- scripts/export_to_csv.py | 3 + tools/constants.py | 5 ++ tools/helpers.py | 66 ++++++++++++++++++ tools/representations.py | 27 +++++++- 6 files changed, 142 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 664be3e4..d0770f87 100644 --- a/README.md +++ b/README.md @@ -59,12 +59,15 @@ Contains the JSON schemas used to validate the feeds in the integration tests. | - country_code | Text |Required | ISO 3166-1 alpha-2 code designating the country where the feed service is located. For a list of valid codes [see here](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes). | | - subdivision_name | Text |Optional | ISO 3166-2 subdivision name designating the subdivision (e.g province, state, region) where the feed service is located. For a list of valid names [see here](https://unece.org/trade/uncefact/unlocode-country-subdivisions-iso-3166-2).| | - municipality | Text |Optional | Primary municipality in which the feed service is located.| -| - bounding_box | Object|System generated | Bounding box of the feed when it was first added to the catalog. Contains `minimum_latitude`, `maximum_latitude`, `minimum_longitude`, `maximum_longitude` and `extracted_on` fields. 
If the bounding box information displays as "null", you can check any potential feed errors with [the GTFS validator](https://github.com/MobilityData/gtfs-validator). | +| - bounding_box | Object|System generated | The bounding box of the feed and metadata about the GTFS archive when it was first added or updated in the catalog. Contains `minimum_latitude`, `maximum_latitude`, `minimum_longitude`, `maximum_longitude`, `extracted_on`, `extracted_filesize`, `extracted_calendar_start`, and `extracted_calendar_end` fields. If the bounding box information displays as "null", you can check any potential feed errors with [the GTFS validator](https://github.com/MobilityData/gtfs-validator). | --minimum_latitude | Latitude | System generated | The minimum latitude for the feed's bounding box. | --maximum_latitude | Latitude | System generated | The maximum latitude for the feed's bounding box. | --minimum_longitude | Longitude | System generated | The minimum longitude for the feed's bounding box. | --maximum_longitude | Longitude | System generated | The maximum longitude for the feed's bounding box. -| --extracted_on | Date and Time | System generated | The date and timestamp the bounding box was extracted on in UTC. +| --extracted_on | Date and Time | System generated | The date and timestamp, in UTC, when the bounding box was extracted. +| --extracted_filesize | Integer | System generated | The filesize in bytes of the GTFS archive when the bounding box was extracted. +| --extracted_calendar_start | Date | System generated | The earliest date covered by calendar/calendar_dates when the bounding box was extracted. +| --extracted_calendar_end | Date | System generated | The latest date covered by calendar/calendar_dates when the bounding box was extracted. | provider | Text | Required | A commonly used name for the transit provider included in the feed. 
| | feed_contact_email | Text | Optional | The contact information for the data producer of the feed, discovered via feed_info.feed_contact_email in the feed, the provider's website, or the Mobility Database contributor form. | | name | Text |Optional | An optional description of the feed, e.g to specify if the feed is an aggregate of multiple providers, or which network is represented by the feed. | diff --git a/schemas/gtfs_schedule_source_schema.json b/schemas/gtfs_schedule_source_schema.json index fb6395d6..4167251c 100644 --- a/schemas/gtfs_schedule_source_schema.json +++ b/schemas/gtfs_schedule_source_schema.json @@ -42,58 +42,45 @@ }, "bounding_box": { "type": "object", - "description": "This is the bounding box of the data source when it was first added to the catalog. It includes the date and timestamp the bounding box was extracted in UTC.", - "oneOf": [ - { - "properties": { - "minimum_latitude": { - "type": "number", - "minimum": -90, - "maximum": 90 - }, - "maximum_latitude": { - "type": "number", - "minimum": -90, - "maximum": 90 - }, - "minimum_longitude": { - "type": "number", - "minimum": -180, - "maximum": 180 - }, - "maximum_longitude": { - "type": "number", - "minimum": -180, - "maximum": 180 - }, - "extracted_on": { - "type": "string", - "format": "date-time" - } - } + "description": "This is the bounding box of the data source and metadata when it was first added to the catalog. 
It includes the date and timestamp the bounding box was extracted in UTC, filesize in bytes, and calendar daterange.", + "properties": { + "minimum_latitude": { + "type": ["number", "null"], + "minimum": -90, + "maximum": 90 }, - { - "properties": { - "minimum_latitude": { - "type": "null" - }, - "maximum_latitude": { - "type": "null" - }, - "minimum_longitude": { - "type": "null" - }, - "maximum_longitude": { - "type": "null" - }, - "extracted_on": { - "type": "string", - "format": "date-time" - } - } + "maximum_latitude": { + "type": ["number", "null"], + "minimum": -90, + "maximum": 90 + }, + "minimum_longitude": { + "type": ["number", "null"], + "minimum": -180, + "maximum": 180 + }, + "maximum_longitude": { + "type": ["number", "null"], + "minimum": -180, + "maximum": 180 + }, + "extracted_on": { + "type": "string", + "format": "date-time" + }, + "extracted_filesize": { + "type": ["number", "null"] + }, + "extracted_calendar_start": { + "type": ["string", "null"], + "format": "date" + }, + "extracted_calendar_end": { + "type": ["string", "null"], + "format": "date" } - ], - "required": ["minimum_latitude", "maximum_latitude", "minimum_longitude", "maximum_longitude", "extracted_on"] + }, + "required": ["minimum_latitude", "maximum_latitude", "minimum_longitude", "maximum_longitude", "extracted_on", "extracted_filesize", "extracted_calendar_start", "extracted_calendar_end"] } }, "required": ["country_code", "bounding_box"] diff --git a/scripts/export_to_csv.py b/scripts/export_to_csv.py index 9d5b3b7c..26129f6d 100644 --- a/scripts/export_to_csv.py +++ b/scripts/export_to_csv.py @@ -28,6 +28,9 @@ 'location.bounding_box.minimum_longitude', 'location.bounding_box.maximum_longitude', 'location.bounding_box.extracted_on', + 'location.bounding_box.extracted_filesize', + 'location.bounding_box.extracted_calendar_start', + 'location.bounding_box.extracted_calendar_end', 'status', 'features', 'redirect.id', diff --git a/tools/constants.py b/tools/constants.py index 
d1b7ffab..e1565a33 100644 --- a/tools/constants.py +++ b/tools/constants.py @@ -7,7 +7,9 @@ STOP_LON = "stop_lon" START_SERVICE_AREA_ID = "start_service_area_id" START_SERVICE_AREA_RADIUS = "start_service_area_radius" +START_DATE = "start_date" END_DATE = "end_date" +DATE = "date" GTFS_DATE_FORMAT = "%Y%m%d" PATHWAYS_TXT = "pathways.txt" FARES_ATTRIBUTES_TXT = "fares_attributes.txt" @@ -75,6 +77,9 @@ MINIMUM_LONGITUDE = "minimum_longitude" MAXIMUM_LONGITUDE = "maximum_longitude" EXTRACTED_ON = "extracted_on" +EXTRACTED_FILESIZE= "extracted_filesize" +EXTRACTED_CALENDAR_START = "extracted_calendar_start" +EXTRACTED_CALENDAR_END = "extracted_calendar_end" URLS = "urls" DIRECT_DOWNLOAD = "direct_download" LICENSE = "license" diff --git a/tools/helpers.py b/tools/helpers.py index 1a079117..24028055 100644 --- a/tools/helpers.py +++ b/tools/helpers.py @@ -9,6 +9,10 @@ from unidecode import unidecode import uuid from tools.constants import ( + START_DATE, + END_DATE, + DATE, + GTFS_DATE_FORMAT, STOP_LAT, STOP_LON, MDB_ARCHIVES_LATEST_URL_TEMPLATE, @@ -292,6 +296,21 @@ def create_filename( extension=extension, ) +def is_gtfs_yyyymmdd_format(string): + """ + Determines if the given string is in standard GTFS YYYYMMDD date format. + + Args: + string (str): Date string to test against. + + Returns: + bool: True if can be parsed as a standard GTFS YYYYMMDD date string. + """ + try: + datetime.datetime.strptime(string, GTFS_DATE_FORMAT) + return True + except ValueError: + return False def normalize(string): """ @@ -335,6 +354,15 @@ def get_iso_time(): .isoformat() ) +def get_filesize(path): + """ + Gets the filesize of the given file path. + + Returns: + int: Filesize in bytes of the given file path. 
+ """ + return os.stat(path).st_size + ######################### # GTFS SPECIFIC FUNCTIONS @@ -405,3 +433,41 @@ def extract_gtfs_bounding_box(file_path): maximum_longitude = stops[STOP_LON].dropna().max() if stops_are_present else None return minimum_latitude, maximum_latitude, minimum_longitude, maximum_longitude + +def extract_gtfs_calendar_range(file_path): + """ + Extracts the min and max dates of a GTFS source using the `calendar` & `calendar_dates` files from the GTFS dataset. + + This function loads a GTFS dataset and determines the earliest (min) and latest (max) date referenced + based on the calendar and calendar_dates in the dataset. + + Args: + file_path (str): The file path to the GTFS dataset. + + Returns: + tuple: A tuple with the minimum and maximum calendar dates formatted in standard YYYY-MM-DD format. + + Notes: + If both calendar and calendar_dates files are missing or columns are invalid, returned value will be a tuple with 2 None values. + """ + dataset = load_gtfs(file_path) + dates = [] + + if dataset.calendar is not None: + dates.append(dataset.calendar[START_DATE]) + dates.append(dataset.calendar[END_DATE]) + if dataset.calendar_dates is not None: + dates.append(dataset.calendar_dates[DATE]) + if len(dates) == 0: + return None, None + + all_dates = pd.concat(dates).dropna() + filtered_dates = all_dates[all_dates.apply(is_gtfs_yyyymmdd_format)] + if len(filtered_dates) == 0: + return None, None + + min_date_yyyymmdd = filtered_dates.min() + max_date_yyyymmdd = filtered_dates.max() + min_date = datetime.datetime.strptime(min_date_yyyymmdd, GTFS_DATE_FORMAT).strftime('%Y-%m-%d') + max_date = datetime.datetime.strptime(max_date_yyyymmdd, GTFS_DATE_FORMAT).strftime('%Y-%m-%d') + return min_date, max_date diff --git a/tools/representations.py b/tools/representations.py index c7101fe0..4ea06bab 100644 --- a/tools/representations.py +++ b/tools/representations.py @@ -6,7 +6,9 @@ is_readable, load_gtfs, extract_gtfs_bounding_box, + 
extract_gtfs_calendar_range, get_iso_time, + get_filesize, create_latest_url, to_json, create_filename, @@ -30,6 +32,9 @@ MINIMUM_LONGITUDE, MAXIMUM_LONGITUDE, EXTRACTED_ON, + EXTRACTED_FILESIZE, + EXTRACTED_CALENDAR_START, + EXTRACTED_CALENDAR_END, URLS, DIRECT_DOWNLOAD, LICENSE, @@ -492,7 +497,10 @@ class GtfsScheduleSource(Source): bbox_max_lat (float): Maximum latitude of the bounding box. bbox_min_lon (float): Minimum longitude of the bounding box. bbox_max_lon (float): Maximum longitude of the bounding box. - bbox_extracted_on (str): Date when the bounding box was extracted. + bbox_extracted_on (str): Date-time when the bounding box was extracted. + bbox_extracted_filesize (int): Filesize in bytes when the bounding box was extracted. + bbox_extracted_calendar_start (str): Date earliest covered by calendar/calendar_dates when the bounding box was extracted. + bbox_extracted_calendar_end (str): Date latest covered by calendar/calendar_dates when the bounding box was extracted. latest_url (str): URL for the latest version of the GTFS data. feed_contact_email (str, optional): Contact email for the GTFS feed. redirects (list): List of redirect URLs, if any. 
@@ -529,6 +537,9 @@ def __init__(self, **kwargs): self.bbox_min_lon = bounding_box.pop(MINIMUM_LONGITUDE) self.bbox_max_lon = bounding_box.pop(MAXIMUM_LONGITUDE) self.bbox_extracted_on = bounding_box.pop(EXTRACTED_ON) + self.bbox_extracted_filesize = bounding_box.pop(EXTRACTED_FILESIZE, None) + self.bbox_extracted_calendar_start = bounding_box.pop(EXTRACTED_CALENDAR_START, None) + self.bbox_extracted_calendar_end = bounding_box.pop(EXTRACTED_CALENDAR_END, None) urls = kwargs.pop(URLS, {}) self.latest_url = urls.pop(LATEST) self.feed_contact_email = kwargs.pop(FEED_CONTACT_EMAIL, None) @@ -548,6 +559,9 @@ def __str__(self): MINIMUM_LONGITUDE: self.bbox_min_lon, MAXIMUM_LONGITUDE: self.bbox_max_lon, EXTRACTED_ON: self.bbox_extracted_on, + EXTRACTED_FILESIZE: self.bbox_extracted_filesize, + EXTRACTED_CALENDAR_START: self.bbox_extracted_calendar_start, + EXTRACTED_CALENDAR_END: self.bbox_extracted_calendar_end, DIRECT_DOWNLOAD: self.direct_download_url, AUTHENTICATION_TYPE: self.authentication_type, AUTHENTICATION_INFO: self.authentication_info_url, @@ -624,6 +638,9 @@ def update(self, **kwargs): self.bbox_max_lon, ) = extract_gtfs_bounding_box(file_path=dataset_path) self.bbox_extracted_on = get_iso_time() + self.bbox_extracted_filesize = get_filesize(dataset_path) + self.bbox_extracted_calendar_start, self.bbox_extracted_calendar_end = extract_gtfs_calendar_range(dataset_path) + # Delete the downloaded dataset because we don't need it anymore os.remove(dataset_path) @@ -689,6 +706,8 @@ def build(cls, **kwargs): maximum_longitude, ) = extract_gtfs_bounding_box(file_path=dataset_path) extracted_on = get_iso_time() + extracted_filesize = get_filesize(dataset_path) + extracted_calendar_start, extracted_calendar_end = extract_gtfs_calendar_range(dataset_path) # Delete the downloaded dataset because we don't need it anymore os.remove(dataset_path) @@ -719,6 +738,9 @@ def build(cls, **kwargs): minimum_longitude=minimum_longitude, maximum_longitude=maximum_longitude, 
extracted_on=extracted_on, + extracted_calendar_start=extracted_calendar_start, + extracted_calendar_end=extracted_calendar_end, + extracted_filesize=extracted_filesize, latest=latest, **kwargs, ) @@ -745,6 +767,9 @@ def schematize(cls, **kwargs): MINIMUM_LONGITUDE: kwargs.pop(MINIMUM_LONGITUDE), MAXIMUM_LONGITUDE: kwargs.pop(MAXIMUM_LONGITUDE), EXTRACTED_ON: kwargs.pop(EXTRACTED_ON), + EXTRACTED_FILESIZE: kwargs.pop(EXTRACTED_FILESIZE), + EXTRACTED_CALENDAR_START: kwargs.pop(EXTRACTED_CALENDAR_START), + EXTRACTED_CALENDAR_END: kwargs.pop(EXTRACTED_CALENDAR_END), }, }, URLS: { From 502f88a0bc59eb0dd063b8ce53d297df03def377 Mon Sep 17 00:00:00 2001 From: Miles Alan Date: Thu, 10 Oct 2024 15:15:45 -0400 Subject: [PATCH 2/2] Tests: Update for extracted_{filesize,calendar_start,calendar_end} Also adds tests for new helper extract_gtfs_calendar_range function --- tools/tests/test_helpers.py | 81 +++++++++++++++++++++++++++++ tools/tests/test_representations.py | 42 +++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/tools/tests/test_helpers.py b/tools/tests/test_helpers.py index 9d51010a..050b5a97 100644 --- a/tools/tests/test_helpers.py +++ b/tools/tests/test_helpers.py @@ -11,8 +11,12 @@ get_iso_time, load_gtfs, extract_gtfs_bounding_box, + extract_gtfs_calendar_range, STOP_LAT, STOP_LON, + START_DATE, + END_DATE, + DATE, to_json, from_json, normalize, @@ -362,6 +366,83 @@ def test_extract_gtfs_bounding_box_stops_present(self, mock_load_gtfs): under_test = extract_gtfs_bounding_box(file_path=self.test_path) self.assertEqual(under_test, test_bounding_box) + @patch("tools.helpers.load_gtfs") + def test_extract_gtfs_calendar_range_no_calendar_or_calendar_dates(self, mock_load_gtfs): + test_return_min_max = (None, None) + test_calendar = None + type(mock_load_gtfs.return_value).calendar = test_calendar + test_calendar_dates = None + type(mock_load_gtfs.return_value).calendar_dates = test_calendar_dates + under_test = 
extract_gtfs_calendar_range(file_path=self.test_path) + self.assertEqual(under_test, test_return_min_max) + + @patch("tools.helpers.load_gtfs") + def test_extract_gtfs_calendar_range_invalid_calendar(self, mock_load_gtfs): + test_return_min_max = (None, None) + test_calendar = pd.DataFrame( + { + # Note: only YYYYMMDD valid per GTFS spec; YYYY-MM-DD & nil values dropped + START_DATE: ["2024-02-30", pd.NA], + END_DATE: ["2034-02-01", pd.NA] + } + ) + type(mock_load_gtfs.return_value).calendar = test_calendar + test_calendar_dates = None + type(mock_load_gtfs.return_value).calendar_dates = test_calendar_dates + under_test = extract_gtfs_calendar_range(file_path=self.test_path) + self.assertEqual(under_test, test_return_min_max) + + @patch("tools.helpers.load_gtfs") + def test_extract_gtfs_calendar_range_only_calendar(self, mock_load_gtfs): + test_return_min_max = ('2010-01-02', '2032-04-09') + test_calendar = pd.DataFrame( + { + # Note: only YYYYMMDD valid per GTFS spec; YYYY-MM-DD & nil values dropped + START_DATE: ["20100102", "20230702", "20230402", "2024-02-30", pd.NA], + END_DATE: ["20140104", "20230709", "20320409", "2034-02-01", pd.NA] + } + ) + type(mock_load_gtfs.return_value).calendar = test_calendar + test_calendar_dates = None + type(mock_load_gtfs.return_value).calendar_dates = test_calendar_dates + under_test = extract_gtfs_calendar_range(file_path=self.test_path) + self.assertEqual(under_test, test_return_min_max) + + @patch("tools.helpers.load_gtfs") + def test_extract_gtfs_calendar_range_only_calendar_dates(self, mock_load_gtfs): + test_return_min_max = ('2021-07-02', '2029-04-02') + test_calendar = None + type(mock_load_gtfs.return_value).calendar = test_calendar + test_calendar_dates = pd.DataFrame( + { + # Note: only YYYYMMDD valid per GTFS spec; YYYY-MM-DD & nil values dropped + DATE: ["20240102", "20210702", "20290402", "2027-02-30", pd.NA], + } + ) + type(mock_load_gtfs.return_value).calendar_dates = test_calendar_dates + under_test = 
extract_gtfs_calendar_range(file_path=self.test_path) + self.assertEqual(under_test, test_return_min_max) + + @patch("tools.helpers.load_gtfs") + def test_extract_gtfs_calendar_range_both_calendar_and_calendar_dates(self, mock_load_gtfs): + test_return_min_max = ('1999-01-02', '2031-07-02') + test_calendar = pd.DataFrame( + { + # Note: only YYYYMMDD valid per GTFS spec; YYYY-MM-DD & nil values dropped + START_DATE: ["19990102", "20230702", "20230402", "2024-02-30", pd.NA], + END_DATE: ["20240104", "20230709", "20230409", "2034-02-01", pd.NA] + } + ) + type(mock_load_gtfs.return_value).calendar = test_calendar + test_calendar_dates = pd.DataFrame( + { + # Note: only YYYYMMDD valid per GTFS spec; YYYY-MM-DD & nil values dropped + DATE: ["20240102", "20310702", "20290402", "2027-02-30", pd.NA], + } + ) + type(mock_load_gtfs.return_value).calendar_dates = test_calendar_dates + under_test = extract_gtfs_calendar_range(file_path=self.test_path) + self.assertEqual(under_test, test_return_min_max) class TestInOutFunctions(TestCase): def setUp(self): diff --git a/tools/tests/test_representations.py b/tools/tests/test_representations.py index 2de38f95..f3c9c05c 100644 --- a/tools/tests/test_representations.py +++ b/tools/tests/test_representations.py @@ -24,6 +24,9 @@ MINIMUM_LONGITUDE, MAXIMUM_LONGITUDE, EXTRACTED_ON, + EXTRACTED_FILESIZE, + EXTRACTED_CALENDAR_START, + EXTRACTED_CALENDAR_END, DIRECT_DOWNLOAD, LATEST, LICENSE, @@ -303,6 +306,9 @@ def setUp(self): self.test_min_lon = "some_min_lon" self.test_max_lon = "some_max_lon" self.test_extracted_on = "some_extraction_time" + self.test_extracted_filesize = "some_extraction_filesize" + self.test_extracted_calendar_start = "some_extraction_calendar_start" + self.test_extracted_calendar_end = "some_extraction_calendar_end" self.test_direct_download_url = "some_direct_download_url" self.test_authentication_type = "some_authentication_type" self.test_authentication_info_url = "some_authentication_info_url" @@ -323,6 +329,9 
@@ def setUp(self): MINIMUM_LONGITUDE: self.test_min_lon, MAXIMUM_LONGITUDE: self.test_max_lon, EXTRACTED_ON: self.test_extracted_on, + EXTRACTED_FILESIZE: self.test_extracted_filesize, + EXTRACTED_CALENDAR_START: self.test_extracted_calendar_start, + EXTRACTED_CALENDAR_END: self.test_extracted_calendar_end, DIRECT_DOWNLOAD: self.test_direct_download_url, AUTHENTICATION_TYPE: self.test_authentication_type, AUTHENTICATION_INFO: self.test_authentication_info_url, @@ -349,6 +358,9 @@ def setUp(self): MINIMUM_LONGITUDE: self.test_min_lon, MAXIMUM_LONGITUDE: self.test_max_lon, EXTRACTED_ON: self.test_extracted_on, + EXTRACTED_FILESIZE: self.test_extracted_filesize, + EXTRACTED_CALENDAR_START: self.test_extracted_calendar_start, + EXTRACTED_CALENDAR_END: self.test_extracted_calendar_end, }, }, URLS: { @@ -450,6 +462,8 @@ def test_has_status(self): @patch("tools.representations.os") @patch("tools.representations.get_iso_time") + @patch("tools.representations.get_filesize") + @patch("tools.representations.extract_gtfs_calendar_range") @patch("tools.representations.extract_gtfs_bounding_box") @patch("tools.representations.is_readable") @patch("tools.representations.download_dataset") @@ -458,6 +472,8 @@ def test_update( mock_download_dataset, mock_read_func, mock_bounding_box, + mock_calendar, + mock_filesize, mock_time, mock_os, ): @@ -476,6 +492,9 @@ def test_update( self.assertEqual(under_test.bbox_min_lon, self.test_min_lon) self.assertEqual(under_test.bbox_max_lon, self.test_max_lon) self.assertEqual(under_test.bbox_extracted_on, self.test_extracted_on) + self.assertEqual(under_test.bbox_extracted_filesize, self.test_extracted_filesize) + self.assertEqual(under_test.bbox_extracted_calendar_start, self.test_extracted_calendar_start) + self.assertEqual(under_test.bbox_extracted_calendar_end, self.test_extracted_calendar_end) self.assertEqual(under_test.provider, self.test_provider) self.assertEqual(under_test.name, self.test_name) 
self.assertEqual(under_test.country_code, self.test_country_code) @@ -491,6 +510,9 @@ def test_update( test_min_lon = "another_min_lon" test_max_lon = "another_max_lon" test_extracted_on = "another_extraction_time" + test_extracted_filesize = "another_extraction_filesize" + test_extracted_calendar_start = "another_extraction_calendar_start" + test_extracted_calendar_end = "another_extraction_calendar_end" test_provider = "another_provider" test_name = "another_name" test_country_code = "another_country_code" @@ -506,6 +528,11 @@ def test_update( test_max_lon, ) mock_time.return_value = test_extracted_on + mock_filesize.return_value = test_extracted_filesize + mock_calendar.return_value = ( + test_extracted_calendar_start, + test_extracted_calendar_end + ) under_test = instance.update( **{ PROVIDER: test_provider, @@ -531,6 +558,9 @@ def test_update( self.assertEqual(under_test.bbox_min_lon, test_min_lon) self.assertEqual(under_test.bbox_max_lon, test_max_lon) self.assertEqual(under_test.bbox_extracted_on, test_extracted_on) + self.assertEqual(under_test.bbox_extracted_filesize, test_extracted_filesize) + self.assertEqual(under_test.bbox_extracted_calendar_start, test_extracted_calendar_start) + self.assertEqual(under_test.bbox_extracted_calendar_end, test_extracted_calendar_end) self.assertEqual(under_test.provider, test_provider) self.assertEqual(under_test.name, test_name) self.assertEqual(under_test.country_code, test_country_code) @@ -544,6 +574,8 @@ def test_update( @patch("tools.representations.create_latest_url") @patch("tools.representations.create_filename") @patch("tools.representations.get_iso_time") + @patch("tools.representations.get_filesize") + @patch("tools.representations.extract_gtfs_calendar_range") @patch("tools.representations.extract_gtfs_bounding_box") @patch("tools.representations.is_readable") @patch("tools.representations.download_dataset") @@ -552,6 +584,8 @@ def test_build( mock_download_dataset, mock_read_func, mock_bounding_box, + 
mock_calendar, + mock_filesize, mock_time, mock_filename, mock_latest_url, @@ -571,6 +605,11 @@ def test_build( "some_max_lon", ) mock_time.return_value = "some_time" + mock_filesize.return_value = "some_filesize" + mock_calendar.return_value = ( + "some_calendar_start", + "some_calendar_end", + ) mock_filename.return_value = "some_filename" mock_latest_url.return_value = "some_latest_url" mock_schema.return_value = deepcopy(self.test_schema) @@ -580,6 +619,9 @@ def test_build( del self.test_kwargs[MINIMUM_LONGITUDE] del self.test_kwargs[MAXIMUM_LONGITUDE] del self.test_kwargs[EXTRACTED_ON] + del self.test_kwargs[EXTRACTED_FILESIZE] + del self.test_kwargs[EXTRACTED_CALENDAR_START] + del self.test_kwargs[EXTRACTED_CALENDAR_END] del self.test_kwargs[LATEST] under_test = GtfsScheduleSource.build(**self.test_kwargs) self.assertIsNotNone(under_test)