forked from DataKind-DC/capital-nature-ingest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_events.py
120 lines (99 loc) · 3.79 KB
/
get_events.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from inspect import getmodule
from io import StringIO
from math import ceil
import os
# API credentials: use the value from the environment when it is set,
# otherwise prompt for it once and store the answer back into os.environ.
# NOTE(review): this runs before the `events` imports below, presumably
# because the scraper modules read these variables -- confirm.
NPS_KEY = os.environ.get('NPS_KEY')
if NPS_KEY is None:
    NPS_KEY = input("Enter your NPS API key:")
    os.environ["NPS_KEY"] = NPS_KEY
EVENTBRITE_TOKEN = os.environ.get('EVENTBRITE_TOKEN')
if EVENTBRITE_TOKEN is None:
    EVENTBRITE_TOKEN = input("Enter your Eventbrite API key:")
    os.environ["EVENTBRITE_TOKEN"] = EVENTBRITE_TOKEN
from events import ans, arlington, aws, casey_trees, city_blossoms, \
dc_audubon, eleventh_street, fairfax, fona, \
friends_of_kenilworth_gardens, loudoun_wildlife_conservancy, lfwa, \
montgomery, nova_parks, nps, potomac_conservancy, rcc, riverkeeper, \
sierra_club_md, sierra_club, tnc, us_botanic_garden, vnps, \
nva_audubon_society, mdflora
from events.utils.log import get_logger
from tests.utils import schema_test
from events.utils import formatters, reports, aws_utils
# Bucket name from the BUCKET_NAME environment variable; None when unset.
# The rest of this module uses its truthiness to choose between console
# output / returning events and uploading the log CSV.
BUCKET = os.environ.get('BUCKET_NAME')
# Module logger, named after this file (directory part stripped).
logger = get_logger(os.path.split(__file__)[1])
def get_source_events(event_source_main):
    '''
    Run a single event scraper and return its cleaned, schema-valid events.

    Parameters:
        event_source_main (callable): a scraper's ``main`` function. It is
            called with no arguments and its result is iterated as a
            collection of event dicts.

    Returns:
        events: the events that passed ``schema_test``, after being run
            through ``formatters.tag_events_with_state`` and
            ``formatters.date_filter``. An empty list if the scraper itself
            raised an exception.
    '''
    # Last dotted component of the module that defines the scraper; used
    # only to label the console output and the log messages.
    f = getmodule(event_source_main).__name__.split('.')[-1]
    try:
        events = event_source_main()
        if not BUCKET:
            # No bucket configured: report progress on the console.
            n = len(events)
            print(f"Scraped {n} event(s) for {f}")
    except Exception as e:
        # A failing scraper is logged and contributes no events instead of
        # propagating the error to the caller.
        msg = f'Exception getting events in {f}: {e}'
        logger.critical(msg, exc_info=True)
        return []
    events = [
        {k: formatters.unicoder(v) for k, v in i.items()}
        for i in events
    ]
    # Collect the valid events in a new list. Removing items from `events`
    # while enumerating it shifts the remaining items left, so the event
    # right after every removed one would never be validated.
    valid_events = []
    for event in events:
        try:
            schema_test([event])
        except Exception as e:
            msg = f'Exception getting events in {f}: {e}'
            logger.error(msg, exc_info=True)
        else:
            valid_events.append(event)
    events = formatters.tag_events_with_state(valid_events)
    events = formatters.date_filter(events)
    return events
def get_events():
    '''
    Run every event scraper on a thread pool and merge their results.

    Returns:
        events (list): one flat list holding the items of every scraper's
            result, in the order the sources are listed below.
    '''
    scraper_modules = [
        ans, arlington, aws, casey_trees, city_blossoms, dc_audubon,
        eleventh_street, fairfax, fona, friends_of_kenilworth_gardens,
        loudoun_wildlife_conservancy, lfwa, mdflora, montgomery, nova_parks,
        nps, potomac_conservancy, rcc, riverkeeper, sierra_club_md,
        sierra_club, tnc, us_botanic_garden, vnps, nva_audubon_society
    ]
    entry_points = [module.main for module in scraper_modules]
    # One worker thread for every two sources, rounded up.
    pool_size = ceil(len(scraper_modules) / 2)
    combined = []
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        # map() yields the per-source results in submission order.
        for source_events in pool.map(get_source_events, entry_points):
            combined.extend(source_events)
    return combined
def main(event=None, context=None):
    '''
    Scrape all sources, build the reports and, when a bucket is configured,
    upload the log as a CSV.

    Parameters:
        event: unused. Presumably present so the function can be invoked as
            an AWS Lambda-style handler -- TODO confirm.
        context: unused; see ``event``.

    Returns:
        events (list): the scraped events, when ``BUCKET`` is not set and
            scraping succeeded. Otherwise None.
    '''
    # Bind `events` before the try block so the `finally` clause can always
    # read it, even when something that is not an `Exception` subclass
    # (e.g. KeyboardInterrupt) escapes get_events().
    events = []
    try:
        events = get_events()
        if not BUCKET:
            return events
    except Exception as e:
        logger.critical(f"Critical error: {e}", exc_info=True)
    finally:
        # The reports are built on every path, including the early return
        # and the failure case (with an empty event list).
        log_df = reports.make_reports(events)
        if BUCKET:
            log_data = StringIO()
            log_df.to_csv(log_data, index=False)
            now = datetime.now().strftime("%m-%d-%Y")
            aws_utils.put_object(log_data.getvalue(), f'logs/log-{now}.csv')
# Script entry point: run the scrapers once and print where the output went.
if __name__ == '__main__':
    events = []
    data_path = os.path.join(os.getcwd(), 'data')
    reports_path = os.path.join(os.getcwd(), 'reports')
    try:
        # main() returns None when BUCKET is set or when scraping failed;
        # fall back to an empty list so extend() does not raise TypeError
        # and log a spurious critical error.
        events.extend(main() or [])
    except Exception as e:
        logger.critical(f"Critical error: {e}", exc_info=True)
    finally:
        print(f"Done scraping {len(events)} events!")
        print("You can find the logs in ./logs")
        print(f"You can find the data in {data_path}")
        print(f"You can find the reports in {reports_path}")