diff --git a/server/rapid/gen_latest_rapid_data.sh b/server/rapid/gen_latest_rapid_data.sh
deleted file mode 100755
index a91e48147..000000000
--- a/server/rapid/gen_latest_rapid_data.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash -x
-
-# step 1: download and inflate from the feed (see appropriate line in setup_rapid_input.sh)
-# stop 2: run this script with the new files
-
-newfile=$1
-
-poetry run python process_events.py $newfile data/output
-
-# step 3: upload to aws
-# sample upload command below. note that an aws cp --recursive will be faster (if you only have one month).
-# aws s3 sync --dryrun data/output/Events/ s3://tm-mbta-performance/Events/
-# aws s3 cp --recursive --dryrun data/output/Events/ s3://tm-mbta-performance/Events/
diff --git a/server/rapid/gen_rapid_events.sh b/server/rapid/gen_rapid_events.sh
deleted file mode 100755
index 4503ef2f2..000000000
--- a/server/rapid/gen_rapid_events.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash -x
-
-for y in `seq 2016 2024`; do
-    for f in $(find data/input/$y/ -name '*.csv'); do
-        echo "Generating stop data from $f"
-        poetry run python process_events.py $f data/output
-    done
-done
diff --git a/server/rapid/process_events.py b/server/rapid/process_events.py
deleted file mode 100644
index 0370ab622..000000000
--- a/server/rapid/process_events.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import argparse
-import pandas as pd
-import pathlib
-
-
-def process_events(input_csv, outdir, nozip=False):
-    columns = [
-        "service_date",
-        "route_id",
-        "trip_id",
-        "direction_id",
-        "stop_id",
-        "stop_sequence",
-        "vehicle_id",
-        "vehicle_label",
-        "event_type",
-        "event_time_sec",
-    ]
-
-    df = pd.read_csv(
-        input_csv,
-        usecols=columns,
-        parse_dates=["service_date"],
-        dtype={
-            "route_id": "str",
-            "trip_id": "str",
-            "stop_id": "str",
-            "vehicle_id": "str",
-            "vehicle_label": "str",
-            "event_time": "int",
-        },
-    )
-
-    df["event_time"] = df["service_date"] + pd.to_timedelta(df["event_time_sec"], unit="s")
-    df.drop(columns=["event_time_sec"], inplace=True)
-
-    service_date_month = pd.Grouper(key="service_date", freq="1M")
-    grouped = df.groupby([service_date_month, "stop_id"])
-
-    for name, events in grouped:
-        service_date, stop_id = name
-
-        fname = pathlib.Path(
-            outdir,
-            "Events",
-            "monthly-data",
-            str(stop_id),
-            f"Year={service_date.year}",
-            f"Month={service_date.month}",
-            "events.csv.gz",
-        )
-        fname.parent.mkdir(parents=True, exist_ok=True)
-        # set mtime to 0 in gzip header for determinism (so we can re-gen old routes, and rsync to s3 will ignore)
-        events.to_csv(fname, index=False, compression={"method": "gzip", "mtime": 0} if not nozip else None)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument("input", metavar="INPUT_CSV")
-    parser.add_argument("output", metavar="OUTPUT_DIR")
-
-    parser.add_argument("--nozip", "-nz", action="store_true", help="debug feature to skip gzipping")
-
-    args = parser.parse_args()
-    input_csv = args.input
-    output_dir = args.output
-    no_zip = args.nozip
-
-    pathlib.Path(output_dir).mkdir(exist_ok=True)
-
-    process_events(input_csv, output_dir, no_zip)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/rapid/setup_rapid_input.sh b/server/rapid/setup_rapid_input.sh
deleted file mode 100755
index 22b5ce152..000000000
--- a/server/rapid/setup_rapid_input.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/sh -x
-
-mkdir -p data/input
-
-# 2016 is a weird case- seems to be tts, headways, etc. not ARR DEP events.
-wget -N -O data/input/2016.zip https://www.arcgis.com/sharing/rest/content/items/3e892be850fe4cc4a15d6450de4bd318/data -wget -N -O data/input/2017.zip https://www.arcgis.com/sharing/rest/content/items/cde60045db904ad299922f4f8759dcad/data -wget -N -O data/input/2018.zip https://www.arcgis.com/sharing/rest/content/items/25c3086e9826407e9f59dd9844f6c975/data -wget -N -O data/input/2019.zip https://www.arcgis.com/sharing/rest/content/items/11bbb87f8fb245c2b87ed3c8a099b95f/data -wget -N -O data/input/2020.zip https://www.arcgis.com/sharing/rest/content/items/cb4cf52bafb1402b9b978a424ed4dd78/data -wget -N -O data/input/2021.zip https://www.arcgis.com/sharing/rest/content/items/611b8c77f30245a0af0c62e2859e8b49/data -wget -N -O data/input/2022.zip https://www.arcgis.com/sharing/rest/content/items/99094a0c59e443cdbdaefa071c6df609/data -wget -N -O data/input/2023.zip https://www.arcgis.com/sharing/rest/content/items/9a7f5634db72459ab731b6a9b274a1d4/data -wget -N -O data/input/2024.zip https://www.arcgis.com/sharing/rest/content/items/4adbec39db40498a8530496d8c63a924/data - -cd data/input -for i in `seq 2017 2024`; do - unzip -o -d $i $i.zip -done - -# The following years only have single csv files -# These are too large to process at once, so we use this sed script -# to split it into monthly files. -for y in 2016 2017 2018; do - awk -v year=$y -v outdir="$y/" -F "-" ' - NR==1 {header=$0}; - NF>1 && NR>1 { - if(! files[$2]) { - print header >> (outdir year "_" $2 ".csv"); - files[$2] = 1; - }; - print $0 >> (outdir year "_" $2 ".csv"); - }' $y/Events$y.csv; - - rm $y/Events$y.csv; -done