apod-downloader.py
#!/usr/bin/env python3
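"""Download Astronomy Picture of the Day (APOD) images from NASA as test data.

Pages and images for dates between START_DATE and END_DATE are cached under
CACHE_DIR, so repeated runs only fetch what is missing; days that feature a
video rather than an image are skipped.
"""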
import argparse
import os
import sys
from typing import List

from bs4 import BeautifulSoup as BS
from dateutil import parser as date_parser
import requests

CACHE_DIR = "test-tree/apod"
ARCHIVE_ROOT = "https://apod.nasa.gov/apod/"
LOCAL_ARCHIVE = "test-tree/APOD"
START_DATE = "2019-05-25"  # somewhat arbitrary start point
END_DATE = "2020-03-01"  # matches end date of the corpus in google drive

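# slurp() returns the file's bytes coerced through str(), i.e. a "b'...'" repr;
# callers therefore strip the escaped \' sequences and split on literal \n.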
def slurp(fname):
    with open(fname, "rb") as fh:
        return str(fh.read())

class Link:
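    """One entry in the APOD archive index: a date, a title, and the day's page URL.

    The image URL is filled in lazily by parse_html(); the explanation is not
    parsed yet (see the TODO below).
    """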
    def __init__(
        self,
        timestamp: str,
        html_url: str,
        name: str,
        image_url: str = "",
        explanation: str = "",
    ):
        self.name = name
        self.timestamp = date_parser.parse(timestamp)
        self.image_url = image_url
        self.explanation = explanation

        ## These pertain to the webpage specific to this image, not the index as a whole
        self.html_url = html_url
        self.html = ""

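    # Render as a line of HTML: the date, a link to the day's page, and the image.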
    def __str__(self) -> str:
        return f'{self.datestr()}: <a href="{self.html_url}">{self.name}</a><img src="{self.image_url}">'

    def datestr(self) -> str:
        "Return timestamp as a %Y-%m-%d string"
        return self.timestamp.strftime("%Y-%m-%d")

    def fetch_image(self):
        """Ensure image is in cache

        Some html pages have a video instead, and we skip those"""
        if self.image_url == "":
            self.parse_html()
        if self.image_url == "VIDEO":
            return
        ext = os.path.splitext(self.image_url)[1]
        fname = os.path.join(CACHE_DIR, f"{self.datestr()}{ext}")
        if os.path.exists(fname):
            return
        response = requests.get(self.image_url)
        if response.status_code == 200:
            with open(fname, "wb") as fh:
                fh.write(response.content)
        else:
            sys.stderr.write(
                f"Couldn't fetch {self.image_url}. Response code: {response.status_code}\n"
            )
            sys.exit(1)

    def fetch_html(self) -> str:
        """Grab self.html_url from web or cache, return as string

        Side effect: sets self.html to the same html string we return"""
        fname = os.path.join(CACHE_DIR, f"{self.datestr()}.html")
        if os.path.exists(fname):
            self.html = slurp(fname).replace(r"\'", "'")
            return self.html

        # Not in cache, so fetch and store it
        response = requests.get(self.html_url)
        if response.status_code == 200:
            self.html = str(response.content)
            with open(fname, "wb") as fh:
                fh.write(response.content)
        else:
            sys.stderr.write(
                f"Couldn't fetch {self.html_url}. Response code: {response.status_code}\n"
            )
            sys.exit(1)
        return self.html

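    # Convenience wrapper: fetch the page, parse the image URL out of it, then fetch the image.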
    def fetch(self):
        self.fetch_html()
        self.parse_html()
        self.fetch_image()

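    # Pull the image URL out of the day's page, or set the "VIDEO" sentinel when
    # the page embeds a video instead of an image.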
    def parse_html(self):
        if self.html == "":
            sys.stderr.write(
                "Cannot fetch image without first fetching html. Fetching html now.\n"
            )
            self.fetch_html()
        soup = BS(self.html, features="lxml")

        # It's a video, not an image
        if "Video Credit" in self.html:
            self.image_url = "VIDEO"
            return
        iframes = soup.find_all("iframe")
        for iframe in iframes:
            src = iframe.get("src") or ""
            if "youtube.com" in src:
                self.image_url = "VIDEO"
                return
            if "ustream.tv" in src:
                self.image_url = "VIDEO"
                return

        self.image_url = ARCHIVE_ROOT + [
            s
            for s in soup.find_all("a")
            if s.get("href") and s.get("href").startswith("image")
        ][0].get("href")
        # TODO: parse out explanation

def get_archive_html() -> str:
    """Fetch html index of photos from web or disk if available

    During testing, I did `wget 'https://apod.nasa.gov/apod/archivepix.html'`
    to avoid constantly re-downloading that page. This func lets my script
    DTRT regardless of whether somebody has done that.

    The index is the only thing that changes. Everything else is write-once,
    so we can just cache that forever if we want.
    """
    fname = os.path.join(CACHE_DIR, "archivepix.html")
    if os.path.exists(fname):
        return slurp(fname).replace(r"\'", "'")
    response = requests.get("https://apod.nasa.gov/apod/archivepix.html")
    return str(response.content).replace(r"\'", "'")

def parse_link(line: str) -> Link:
    """Parses a line from the archive and returns a Link object"""
    timestamp, rest = line.split(":", 1)
    anchor = BS(rest, features="lxml").a
    name = anchor.text
    return Link(timestamp, ARCHIVE_ROOT + anchor.get("href"), name)

def get_links() -> List[Link]:
    """Grab the archive from web or disk, parse it for photo links"""
    lines = get_archive_html().split(r"\n")
    lines = [l for l in lines if '<a href="ap' in l]
    links = []
    for line in lines:
        links.append(parse_link(line))
    return links

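# Parse --start/--end and overwrite the module-level START_DATE/END_DATE with
# parsed datetimes so main() can filter links by date.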
def parse_cli():
    global START_DATE
    global END_DATE
    cli_parser = argparse.ArgumentParser(
        prog="apod-downloader", description="Download test images from NASA", epilog=""
    )
    # argparse %-formats help strings, so literal % signs must be doubled
    cli_parser.add_argument(
        "--start",
        help=f"start date in %%Y-%%m-%%d format (default: {START_DATE})",
        default=START_DATE,
    )
    cli_parser.add_argument(
        "--end",
        help=f"end date in %%Y-%%m-%%d format (default: {END_DATE})",
        default=END_DATE,
    )
    args = cli_parser.parse_args()
    START_DATE = date_parser.parse(args.start)
    END_DATE = date_parser.parse(args.end)
    if END_DATE < START_DATE:
        END_DATE = START_DATE
        sys.stderr.write(
            "End date must be after start date, setting end date equal to start date\n"
        )
    return args

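# Create the cache directory, read the archive index, keep only links in the
# requested date range, and download each one.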
def main():
    os.makedirs(CACHE_DIR, exist_ok=True)
    parse_cli()
    links = get_links()
    links = [l for l in links if l.timestamp <= END_DATE]
    links = [l for l in links if l.timestamp >= START_DATE]
    for link in links:
        link.fetch()
        print(link)

if __name__ == "__main__":
    main()
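
# Example invocation (these dates are illustrative, not required values):
#   ./apod-downloader.py --start 2019-06-01 --end 2019-06-15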