-
Notifications
You must be signed in to change notification settings - Fork 0
/
flumart.py
198 lines (156 loc) · 9.78 KB
/
flumart.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
© 2022. Triad National Security, LLC. All rights reserved.
This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos
National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S.
Department of Energy/National Nuclear Security Administration. All rights in the program are
reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear
Security Administration. The Government is granted for itself and others acting on its behalf a
nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare
derivative works, distribute copies to the public, perform publicly and display publicly, and to permit
others to do so.
"""
"""
This code pulls data from the WHO's influenza surveillance database:
https://apps.who.int/flumart/Default?ReportNo=12
This website is pretty tricky to parse; you must pass realistic headers to the POST requests, and you must also
issue 3 total requests: 1) a GET request, 2) a POST request, and 3) another POST request. All 3 of these requests,
in order, are required to actually collect the underlying data that's displayed in the table. See `get_table_data`
for more documentation on this process.
Kudos to @Ajax1234 on StackOverflow, who helped solve my initial problems here:
https://stackoverflow.com/a/70013344/1269634
A bit more sleuthing was required to ultimately completely automate this, but his answer was tremendously
valuable!
"""
import urllib.parse
import requests
from bs4 import BeautifulSoup
#####
# We define 2 header blocks that will be used for the 2 POST requests in `get_table_data`. These headers come from a
# fresh access of the website using Firefox 95's developer tools.
#####
# Raw header block (one "Name: value" pair per line) captured from a real
# Firefox 95 session for the second request in `get_table_data` (the
# "Display report" POST). Parsed into a dict by `parse_headers` before use.
post_headers_display_report = """Host: apps.who.int
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Content-Type: application/x-www-form-urlencoded
Origin: https://apps.who.int
DNT: 1
Connection: keep-alive
Referer: https://apps.who.int/flumart/Default?ReportNo=12
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-origin
Sec-Fetch-User: ?1"""
# Raw header block for the third request in `get_table_data` (the AJAX POST
# that fetches the actual table contents). Note the XMLHttpRequest /
# X-MicrosoftAjax headers that mark it as an ASP.NET partial-postback
# request. Parsed into a dict by `parse_headers` before use.
post_headers_table_data = """Host: apps.who.int
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
X-Requested-With: XMLHttpRequest
X-MicrosoftAjax: Delta=true
Cache-Control: no-cache
Content-Type: application/x-www-form-urlencoded; charset=utf-8
Origin: https://apps.who.int
DNT: 1
Connection: keep-alive
Referer: https://apps.who.int/flumart/Default?ReportNo=12
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
TE: trailers"""
#####
# End of our header blocks.
#####
def parse_headers(headers):
    """
    Turn a multi-line "Name: value" header string into a dict suitable for
    the ``headers=`` argument of `requests`.

    Blank lines are skipped. The split is limited to the first ': ' so that
    a header value which itself contains ': ' is preserved intact (the
    unbounded split would produce a 3-element item and crash dict()).
    """
    return dict(line.split(': ', 1) for line in filter(None, headers.split('\n')))
def get_important_hidden_input_values(html):
    """
    Extract the 3 important hidden ASP.NET state inputs from an HTML response:

    * __VIEWSTATE
    * __VIEWSTATEGENERATOR
    * __EVENTVALIDATION

    These values must be echoed back in the subsequent POST requests for the
    server to accept them (see `get_table_data`).

    Returns a ``(viewstate, viewstategenerator, eventvalidation)`` tuple.
    Raises ``ValueError`` if any input is missing or appears more than once
    (a plain ``assert`` would be silently stripped under ``python -O``).
    """
    soup = BeautifulSoup(html, 'lxml')
    values = []
    for input_id in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
        matches = soup.find_all('input', {'id': input_id})
        if len(matches) != 1:
            raise ValueError(f'expected exactly 1 input with id {input_id}, found {len(matches)}')
        values.append(matches[0]['value'])
    return tuple(values)
def get_table_data(country, from_year, from_week, to_year, to_week):
    """
    Fetch the raw HTML table contents for a country/date range from FluMart.

    Three chained HTTP requests are required:

    1. A GET to the report page, which sets the session cookies and exposes
       the first set of hidden ASP.NET state inputs
       (see `get_important_hidden_input_values`).
    2. A POST, built from those inputs, that renders the empty table skeleton
       and yields a *fresh* set of hidden state inputs.
    3. A final POST, built from the fresh inputs, that returns the actual
       contents meant to populate the skeleton.

    Each step depends on the one before it — without the cookies and inputs
    from request 1 the server rejects request 2, and without the fresh inputs
    from request 2 it rejects request 3. Returns the HTML of request 3.
    """
    report_url = 'https://apps.who.int/flumart/Default?ReportNo=12'
    with requests.Session() as session:
        # Step 1: GET establishes cookies on the session and hands us the
        # initial (viewstate, viewstategenerator, eventvalidation) triple.
        state = get_important_hidden_input_values(session.get(report_url).text)

        # Step 2: POST for the table skeleton, then harvest the replacement
        # state triple embedded in its response.
        skeleton_payload = data_format_display_report(*state,
                                                      country,
                                                      from_year,
                                                      from_week,
                                                      to_year,
                                                      to_week)
        skeleton_response = session.post(report_url,
                                         data=skeleton_payload,
                                         headers=parse_headers(post_headers_display_report))
        state = get_important_hidden_input_values(skeleton_response.text)

        # Step 3: POST for the contents that fill in the skeleton.
        contents_payload = data_format_table_data(*state,
                                                  country,
                                                  from_year,
                                                  from_week,
                                                  to_year,
                                                  to_week)
        final_response = session.post(report_url,
                                      data=contents_payload,
                                      headers=parse_headers(post_headers_table_data))
        return final_response.text
def parse_table(html):
    """
    Parse the table contents into a more useful data structure: a list of
    row dicts keyed by the table's column headers.

    TODO: Create a Pandas DataFrame from the contents.
    """
    soup = BeautifulSoup(html, 'lxml')
    # The report nests the data several layout tables deep: the 5th row of
    # the innermost table chain contains yet another table whose rows are
    # the actual records. Each row is reduced to its non-empty cell texts;
    # the first two rows are discarded, the third (`h`) holds the column
    # headers, and the remainder (`body`) are the data rows.
    # NOTE(review): this selector chain is tightly coupled to the page's
    # exact markup — verify it still matches if WHO changes the layout.
    _, _, h, *body = [list(filter(None, [i.get_text(strip=True) for i in b.select('td')]))
                      for b in soup.select('table table table table tr:nth-of-type(5) table tr')]
    # Zip each data row against the non-empty headers into one dict per record.
    return [dict(zip([*filter(None, h)], i)) for i in body]
def data_format_display_report(viewstate, viewstategenerator, eventvalidation, country, from_year, from_week, to_year, to_week):
    """
    Construct the url-encoded POST payload for the second request in
    `get_table_data`, which renders the table skeleton.

    The three ASP.NET state tokens are percent-encoded; the search fields
    are passed through as-is, exactly as the browser submits them.
    """
    fields = [
        ('__EVENTTARGET', ''),
        ('__EVENTARGUMENT', ''),
        ('__LASTFOCUS', ''),
        ('__VIEWSTATE', urllib.parse.quote(viewstate)),
        ('__VIEWSTATEGENERATOR', urllib.parse.quote(viewstategenerator)),
        ('__EVENTVALIDATION', urllib.parse.quote(eventvalidation)),
        ('ddlFilterBy', '1'),
        ('lstSearchBy', country),
        ('ctl_list_YearFrom', from_year),
        ('ctl_list_WeekFrom', from_week),
        ('ctl_list_YearTo', to_year),
        ('ctl_list_WeekTo', to_week),
        ('ctl_ViewReport', 'Display+report'),
    ]
    return '&'.join(f'{name}={value}' for name, value in fields)
def data_format_table_data(viewstate, viewstategenerator, eventvalidation, country, from_year, from_week, to_year, to_week):
    """
    Construct the POST payload for the third request in `get_table_data` that gets the actual table contents.
    """
    # This is the ASP.NET ReportViewer partial-postback payload, captured
    # verbatim from a real browser session and pre-url-encoded ('%24' is an
    # encoded '$' in the control names; '__ASYNCPOST=true' marks the async
    # request). Only the three state tokens and the user's search fields
    # vary; every other field — including the trailing '&' — is sent exactly
    # as the browser sent it, since the server is picky about this request.
    return f'ScriptManager1=ScriptManager1%7Cctl_ReportViewer%24ctl09%24Reserved_AsyncLoadTarget&__EVENTTARGET=ctl_ReportViewer%24ctl09%24Reserved_AsyncLoadTarget&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE={urllib.parse.quote(viewstate)}&__VIEWSTATEGENERATOR={urllib.parse.quote(viewstategenerator)}&__EVENTVALIDATION={urllib.parse.quote(eventvalidation)}&ddlFilterBy=1&lstSearchBy={country}&ctl_list_YearFrom={from_year}&ctl_list_WeekFrom={from_week}&ctl_list_YearTo={to_year}&ctl_list_WeekTo={to_week}&ctl_ReportViewer%24ctl03%24ctl00=&ctl_ReportViewer%24ctl03%24ctl01=&ctl_ReportViewer%24ctl10=ltr&ctl_ReportViewer%24ctl11=standards&ctl_ReportViewer%24AsyncWait%24HiddenCancelField=False&ctl_ReportViewer%24ctl04%24ctl03%24ddValue=1&ctl_ReportViewer%24ctl04%24ctl05%24ddValue=1&ctl_ReportViewer%24ToggleParam%24store=&ctl_ReportViewer%24ToggleParam%24collapse=false&ctl_ReportViewer%24ctl05%24ctl00%24CurrentPage=&ctl_ReportViewer%24ctl05%24ctl03%24ctl00=&ctl_ReportViewer%24ctl08%24ClientClickedId=&ctl_ReportViewer%24ctl07%24store=&ctl_ReportViewer%24ctl07%24collapse=false&ctl_ReportViewer%24ctl09%24VisibilityState%24ctl00=None&ctl_ReportViewer%24ctl09%24ScrollPosition=&ctl_ReportViewer%24ctl09%24ReportControl%24ctl02=&ctl_ReportViewer%24ctl09%24ReportControl%24ctl03=&ctl_ReportViewer%24ctl09%24ReportControl%24ctl04=100&__ASYNCPOST=true&'
if __name__ == '__main__':
    # Example usage: pull Brazil's data from 2020 week 1 through 2021 week 53.
    # Guarded so that importing this module doesn't fire network requests.
    html = get_table_data('Brazil', '2020', '1', '2021', '53')
    print(parse_table(html))