From 528374d45b692fa4f910e00367dfb7131871136c Mon Sep 17 00:00:00 2001 From: Soren Spicknall Date: Wed, 6 Sep 2023 13:56:18 -0500 Subject: [PATCH] GTFS Validator Upgrade Follow-Up (#2935) * Update accepted ranges for validator version test * Update values list in _mart_gtfs_quality YAML * Switch bucket ref for external table to hourly bucket * Add seed for validator v4.1.0 rule details --- .../gtfs_schedule_v2/validation_notices.yml | 2 +- ...chedule_validator_rule_details_unioned.sql | 1 + .../mart/gtfs_quality/_mart_gtfs_quality.yml | 6 +- warehouse/seeds/_seeds.yml | 24 +++- ...schedule_validator_rule_details_v4_1_0.csv | 114 ++++++++++++++++++ 5 files changed, 143 insertions(+), 4 deletions(-) create mode 100644 warehouse/seeds/gtfs_schedule_validator_rule_details_v4_1_0.csv diff --git a/airflow/dags/create_external_tables/gtfs_schedule_v2/validation_notices.yml b/airflow/dags/create_external_tables/gtfs_schedule_v2/validation_notices.yml index 8c58f973c2..4fde53196f 100644 --- a/airflow/dags/create_external_tables/gtfs_schedule_v2/validation_notices.yml +++ b/airflow/dags/create_external_tables/gtfs_schedule_v2/validation_notices.yml @@ -1,5 +1,5 @@ operator: operators.ExternalTable -bucket: "{{ env_var('CALITP_BUCKET__GTFS_SCHEDULE_VALIDATION') }}" +bucket: "{{ env_var('CALITP_BUCKET__GTFS_SCHEDULE_VALIDATION_HOURLY') }}" source_objects: - "validation_notices/*.jsonl.gz" destination_project_dataset_table: "external_gtfs_schedule.validation_notices" diff --git a/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql b/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql index 428193bac1..3ec6ba50e5 100644 --- a/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql +++ b/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql @@ -4,6 +4,7 @@ WITH unioned AS ( 
ref('gtfs_schedule_validator_rule_details_v2_0_0'), ref('gtfs_schedule_validator_rule_details_v3_1_1'), ref('gtfs_schedule_validator_rule_details_v4_0_0'), + ref('gtfs_schedule_validator_rule_details_v4_1_0'), ], ) }} ), diff --git a/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml b/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml index d352a1a25a..347627d9a8 100644 --- a/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml +++ b/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml @@ -82,7 +82,11 @@ models: where: validation_validator_version = 'v3.1.1' - dbt_utils.accepted_range: min_value: "DATE'2022-11-16'" + max_value: "DATE'2023-08-31'" where: validation_validator_version = 'v4.0.0' + - dbt_utils.accepted_range: + min_value: "DATE'2023-09-01'" + where: validation_validator_version = 'v4.1.0' - &schedule_feed_key name: feed_key tests: @@ -107,7 +111,7 @@ models: tests: - not_null - accepted_values: - values: ['v2.0.0', 'v3.1.1', 'v4.0.0'] + values: ['v2.0.0', 'v3.1.1', 'v4.0.0', 'v4.1.0'] - &schedule_validator_code name: code description: | diff --git a/warehouse/seeds/_seeds.yml b/warehouse/seeds/_seeds.yml index 5c4cbb4759..aedd1978d3 100644 --- a/warehouse/seeds/_seeds.yml +++ b/warehouse/seeds/_seeds.yml @@ -75,8 +75,28 @@ seeds: - name: gtfs_schedule_validator_rule_details_v4_0_0 description: | A list of validation codes output by the GTFS Schedule validator, and their severities and descriptions. 
- This data was manually parsed from the contents of the RULES.md file in the v3.1.1 release of the validator, - sourced from: https://github.com/MobilityData/gtfs-validator/archive/refs/tags/v3.1.1.zip + This data was manually parsed from the contents of the RULES.md file in the v4.0.0 release of the validator, + sourced from: https://github.com/MobilityData/gtfs-validator/archive/refs/tags/v4.0.0.zip + columns: + - name: code + tests: + - not_null + - unique + - name: human_readable_description + tests: + - not_null + - name: version + tests: + - not_null + - name: severity + tests: + - not_null + + - name: gtfs_schedule_validator_rule_details_v4_1_0 + description: | + A list of validation codes output by the GTFS Schedule validator, and their severities and descriptions. + This data was manually parsed from the contents of the RULES.md file in the v4.1.0 release of the validator, + sourced from: https://github.com/MobilityData/gtfs-validator/archive/refs/tags/v4.1.0.zip columns: - name: code tests: diff --git a/warehouse/seeds/gtfs_schedule_validator_rule_details_v4_1_0.csv b/warehouse/seeds/gtfs_schedule_validator_rule_details_v4_1_0.csv new file mode 100644 index 0000000000..9ad6297634 --- /dev/null +++ b/warehouse/seeds/gtfs_schedule_validator_rule_details_v4_1_0.csv @@ -0,0 +1,114 @@ +code,human_readable_description,version,severity +block_trips_with_overlapping_stop_times,Block trips with overlapping stop times.,v4.1.0,ERROR +csv_parsing_failed,Parsing of a CSV file failed.,v4.1.0,ERROR +decreasing_shape_distance,Decreasing shape_dist_traveled in shapes.txt.,v4.1.0,ERROR +decreasing_or_equal_stop_time_distance,Decreasing or equal shape_dist_traveled in stop_times.txt.,v4.1.0,ERROR +duplicated_column,Duplicated column in CSV.,v4.1.0,ERROR +duplicate_key,Duplicated entity.,v4.1.0,ERROR +empty_column_name,A column name is empty.,v4.1.0,ERROR +empty_file,A CSV file is empty.,v4.1.0,ERROR +equal_shape_distance_diff_coordinates,Two consecutive points have equal 
shape_dist_traveled and different lat/lon coordinates in shapes.txt.,v4.1.0,ERROR +fare_transfer_rule_duration_limit_type_without_duration_limit,A row from GTFS file fare_transfer_rules.txt has a defined duration_limit_type field but no duration_limit specified.,v4.1.0,ERROR +fare_transfer_rule_duration_limit_without_type,A row from GTFS file fare_transfer_rules.txt has a defined duration_limit field but no duration_limit_type specified.,v4.1.0,ERROR +fare_transfer_rule_invalid_transfer_count,A row from GTFS file fare_transfer_rules.txt has a defined transfer_count with an invalid value.,v4.1.0,ERROR +fare_transfer_rule_missing_transfer_count,A row from fare_transfer_rules.txt has from_leg_group_id equal to to_leg_group_id but has no transfer_count specified.,v4.1.0,ERROR +fare_transfer_rule_with_forbidden_transfer_count,A row from fare_transfer_rules.txt has from_leg_group_id not equal to to_leg_group_id but has transfer_count specified.,v4.1.0,ERROR +foreign_key_violation,Wrong foreign key.,v4.1.0,ERROR +inconsistent_agency_timezone,Inconsistent Timezone among agencies.,v4.1.0,ERROR +invalid_color,A field contains an invalid color value.,v4.1.0,ERROR +invalid_currency,A field contains a wrong currency code.,v4.1.0,ERROR +invalid_currency_amount,A currency amount field has a value that does not match the format of its corresponding currency code field.,v4.1.0,ERROR +invalid_date,A field cannot be parsed as date.,v4.1.0,ERROR +invalid_email,A field contains a malformed email address.,v4.1.0,ERROR +invalid_float,A field cannot be parsed as a floating point number.,v4.1.0,ERROR +invalid_integer,A field cannot be parsed as an integer.,v4.1.0,ERROR +invalid_language_code,A field contains a wrong language code.,v4.1.0,ERROR +invalid_phone_number,A field contains a malformed phone number.,v4.1.0,ERROR +invalid_row_length,Invalid csv row length.,v4.1.0,ERROR +invalid_time,A field cannot be parsed as time.,v4.1.0,ERROR +invalid_timezone,A field cannot be parsed as a 
timezone.,v4.1.0,ERROR +invalid_url,A field contains a malformed URL.,v4.1.0,ERROR +location_without_parent_station,A location that must have parent_station field does not have it.,v4.1.0,ERROR +location_with_unexpected_stop_time,A location in stops.txt that is not a stop is referenced by some stop_times.stop_id.,v4.1.0,ERROR +missing_calendar_and_calendar_date_files,Missing GTFS files calendar.txt and calendar_dates.txt.,v4.1.0,ERROR +missing_level_id,stops.level_id is conditionally required.,v4.1.0,ERROR +missing_required_column,A required column is missing in the input file.,v4.1.0,ERROR +missing_required_field,A required field is missing.,v4.1.0,ERROR +missing_required_file,A required file is missing.,v4.1.0,ERROR +missing_stop_name,stops.stop_name is required for location_type equal to 0 1 or 2.,v4.1.0,ERROR +missing_trip_edge,Missing trip edge arrival_time or departure_time.,v4.1.0,ERROR +new_line_in_value,New line or carriage return in a value in CSV file.,v4.1.0,ERROR +number_out_of_range,Out of range value.,v4.1.0,ERROR +overlapping_frequency,Trip frequencies overlap.,v4.1.0,ERROR +pathway_to_platform_with_boarding_areas,A pathway has an endpoint that is a platform which has boarding areas.,v4.1.0,ERROR +pathway_to_wrong_location_type,A pathway has an endpoint that is a station.,v4.1.0,ERROR +pathway_unreachable_location,A location is not reachable at least in one direction: from the entrances or to the exits.,v4.1.0,ERROR +point_near_origin,A point is too close to origin (0; 0).,v4.1.0,ERROR +point_near_pole,A point is too close to the North or South Pole.,v4.1.0,ERROR +route_both_short_and_long_name_missing,Missing route short name and long name.,v4.1.0,ERROR +start_and_end_range_equal,Two date or time fields are equal.,v4.1.0,ERROR +start_and_end_range_out_of_order,Two date or time fields are out of order.,v4.1.0,ERROR +station_with_parent_station,A station has parent_station field set.,v4.1.0,ERROR +stop_time_timepoint_without_times,arrival_time or 
departure_time not specified for timepoint.,v4.1.0,ERROR +stop_time_with_arrival_before_previous_departure_time,Backwards time travel between stops in stop_times.txt,v4.1.0,ERROR +stop_time_with_only_arrival_or_departure_time,Missing stop_times.arrival_time or stop_times.departure_time.,v4.1.0,ERROR +stop_without_location,stop_lat and/or stop_lon is missing for stop with location_type equal to 0 1 or 2,v4.1.0,ERROR +stop_without_zone_id,Stop without value for stops.zone_id.,v4.1.0,ERROR +too_many_rows,A CSV file has too many rows.,v4.1.0,ERROR +transfer_with_invalid_stop_location_type,A stop id field from GTFS file transfers.txt references a stop that has a location_type other than 0 or 1 (aka Stop/Platform or Station).,v4.1.0,ERROR +transfer_with_invalid_trip_and_route,A trip id field from GTFS file transfers.txt references a route that does not match its trips.txt route_id.,v4.1.0,ERROR +transfer_with_invalid_trip_and_stop,A trip id field from GTFS file transfers.txt references a stop that is not included in the referenced trip's stop-times.,v4.1.0,ERROR +translation_foreign_key_violation,An entity with the given record_id and record_sub_id cannot be found in the referenced table.,v4.1.0,ERROR +translation_unexpected_value,A field in a translations row has value but must be empty.,v4.1.0,ERROR +wrong_parent_location_type,Incorrect type of the parent location.,v4.1.0,ERROR +attribution_without_role,Attribution with no role.,v4.1.0,WARNING +duplicate_fare_media,Two distinct fare media have the same fare media name and type.,v4.1.0,WARNING +duplicate_route_name,Two distinct routes have either the same route_short_name; the same route_long_name; or the same combination of route_short_name and route_long_name.,v4.1.0,WARNING +empty_row,A row in the input file has only spaces.,v4.1.0,WARNING +equal_shape_distance_same_coordinates,Two consecutive points have equal shape_dist_traveled and the same lat/lon coordinates in shapes.txt.,v4.1.0,WARNING 
+expired_calendar,Dataset should not contain date ranges for services that have already expired.,v4.1.0,WARNING +fast_travel_between_consecutive_stops,A transit vehicle moves too fast between two consecutive stops.,v4.1.0,WARNING +fast_travel_between_far_stops,A transit vehicle moves too fast between two far stops.,v4.1.0,WARNING +feed_expiration_date7_days,Dataset should be valid for at least the next 7 days.,v4.1.0,WARNING +feed_expiration_date30_days,Dataset should cover at least the next 30 days of service.,v4.1.0,WARNING +feed_info_lang_and_agency_lang_mismatch,Mismatching feed and agency language fields.,v4.1.0,WARNING +inconsistent_agency_lang,Inconsistent language among agencies.,v4.1.0,WARNING +leading_or_trailing_whitespaces,The value in CSV file has leading or trailing whitespaces.,v4.1.0,WARNING +missing_feed_info_date,feed_end_date should be provided if feed_start_date is provided. feed_start_date should be provided if feed_end_date is provided.,v4.1.0,WARNING +missing_recommended_file,A recommended file is missing.,v4.1.0,WARNING +missing_recommended_field,A recommended field is missing.,v4.1.0,WARNING +missing_timepoint_column,timepoint column is missing for a dataset.,v4.1.0,WARNING +missing_timepoint_value,stop_times.timepoint value is missing for a record.,v4.1.0,WARNING +mixed_case_recommended_field,This field has customer-facing text and should use Mixed Case (should contain upper and lower case letters).,v4.1.0,WARNING +more_than_one_entity,More than one row in CSV.,v4.1.0,WARNING +non_ascii_or_non_printable_char,Non ascii or non printable char in id.,v4.1.0,WARNING +pathway_dangling_generic_node,A generic node has only one incident location in a pathway graph.,v4.1.0,WARNING +pathway_loop,A pathway starts and ends at the same location.,v4.1.0,WARNING +route_color_contrast,Insufficient route color contrast.,v4.1.0,WARNING +route_long_name_contains_short_name,Long name should not contain short name for a single route.,v4.1.0,WARNING 
+route_short_name_too_long,Short name of a route is too long (more than 12 characters).,v4.1.0,WARNING +same_name_and_description_for_route,Same name and description for route.,v4.1.0,WARNING +same_name_and_description_for_stop,Same name and description for stop.,v4.1.0,WARNING +same_route_and_agency_url,Same routes.route_url and agency.agency_url.,v4.1.0,WARNING +same_stop_and_agency_url,Same stops.stop_url and agency.agency_url.,v4.1.0,WARNING +same_stop_and_route_url,Same stops.stop_url and routes.route_url.,v4.1.0,WARNING +stop_has_too_many_matches_for_shape,Stop entry that has many potential matches to the trip's path of travel.,v4.1.0,WARNING +stops_match_shape_out_of_order,Two stop entries are different than their arrival-departure order defined by the shapes.txt,v4.1.0,WARNING +stop_too_far_from_shape,Stop too far from trip shape.,v4.1.0,WARNING +stop_too_far_from_shape_using_user_distance,Stop time too far from shape.,v4.1.0,WARNING +stop_without_stop_time,A stop in stops.txt is not referenced by any stop_times.stop_id.,v4.1.0,WARNING +transfer_with_suspicious_mid_trip_in_seat,A trip id field from GTFS file transfers.txt with an in-seat transfer type references a stop that is not in the expected position in the trip's stop-times.,v4.1.0,WARNING +translation_unknown_table_name,A translation references an unknown or missing GTFS table.,v4.1.0,WARNING +trip_coverage_not_active_for_next7_days,Trips data should be valid for at least the next seven days.,v4.1.0,WARNING +unexpected_enum_value,An enum has an unexpected value.,v4.1.0,WARNING +unusable_trip,Trips must have more than one stop to be usable.,v4.1.0,WARNING +unused_shape,Shape is not used in GTFS file trips.txt.,v4.1.0,WARNING +unused_trip,Trip is not used in stop_times.txt,v4.1.0,WARNING +platform_without_parent_station,A platform has no parent_station field set.,v4.1.0,INFO +unknown_column,A column name is unknown.,v4.1.0,INFO +unknown_file,A file is unknown.,v4.1.0,INFO +i_o_error,Error in IO 
operation.,v4.1.0,SYSTEM_ERROR +runtime_exception_in_loader_error,RuntimeException while loading GTFS dataset in memory.,v4.1.0,SYSTEM_ERROR +runtime_exception_in_validator_error,RuntimeException while validating GTFS archive.,v4.1.0,SYSTEM_ERROR +thread_execution_error,ExecutionException during multithreaded validation,v4.1.0,SYSTEM_ERROR +u_r_i_syntax_error,A string could not be parsed as a URI reference.,v4.1.0,SYSTEM_ERROR