-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpyder.py
75 lines (71 loc) · 2.37 KB
/
Spyder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import sys
import urllib2
import urlparse
import htmllib, formatter
from cStringIO import StringIO
def log_stdout(msg):
"""Print msg to the screen."""
print (msg)
def get_page(url, log):
"""Retrieve URL and return contents, log errors."""
try:
page = urllib2.urlopen(url)
except urllib2.URLError:
log("Error retrieving: " + url)
return ''
body = page.read()
page.close()
return body
def find_links(html):
"""Return a list of links in html."""
# We're using the parser just to
writer = formatter.DumbWriter(StringIO())
f = formatter.AbstractFormatter(writer)
parser = htmllib.HTMLParser(f)
parser.feed(html)
parser.close()
return parser.anchorlist
class Spider:
"""
The heart of this program, finds all links within a web site.
run() contains the main loop.
process_page() retrieves each page and finds the links.
"""
def __init__(self, startURL, log=None):
# This method sets initial values
self.URLs = set()
self.URLs.add(startURL)
self.include = startURL
self._links_to_process = [startURL]
if log is None:
# Use log_stdout function if no log provided
self.log = log_stdout
else:
self.log = log
def run(self):
# Processes list of URLs one at a time
while self._links_to_process:
url = self._links_to_process.pop()
self.log("Retrieving: " + url)
self.process_page(url)
def url_in_site(self, link):
# Checks whether the link starts with the base URL
return link.startswith(self.include)
def process_page(self, url):
# Retrieves page and finds links in it
html = get_page(url, self.log)
for link in find_links(html):
# Handle relative links
link = urlparse.urljoin(url, link)
self.log("Checking: " + link)
# Make sure this is a new URL within current site
if link not in self.URLs and self.url_in_site(link):
self.URLs.add(link)
self._links_to_process.append(link)
if __name__ == '__main__':
# This code runs when script is started from command line
startURL = sys.argv[1]
spider = Spider(startURL)
spider.run()
for URL in sorted(spider.URLs):
print (URL)