-
Notifications
You must be signed in to change notification settings - Fork 2
/
thesandbornmaps.cudl.colorado.edu.py
executable file
·104 lines (97 loc) · 4.35 KB
/
thesandbornmaps.cudl.colorado.edu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
import urllib2, sys, re, getopt
from bs4 import BeautifulSoup
from functions import getSoup, fileDl, ensureDir, roundUpTo, roundDownTo
# Default download root; replaced by the value of --save-dir when that option is given.
baseDir = "/root/maps/"
# Listing URL sorted by city, date and sheet. The listing offset is appended
# after "os=" (e.g. baseUrl + "0" for the first page).
baseUrl = "http://cudl.colorado.edu/luna/servlet/view/all?sort=city%2Cdate%2Csheet&os="
def usage():
    """Print the command-line help text for the scraper to standard output.

    Takes no arguments and returns None. The text lists every option that the
    getopt call in this script accepts (-s/--simple, -h/--help, --from=,
    --to=, --save-dir=).
    """
    helpLines = (
        "University of Colorado Sanborn Maps Scraper. Requested by /u/WhiskeyQuebec. Made by /u/nicba1010.",
        "Specify directory or it'll all go to your root folder!!!",
        "Options available: ",
        "\t-s, --simple\tConstructs the simple/flat directory structure",
        "\t-h, --help\tShows this text",
        "\t--from=\t\tStart at the given document number",
        "\t--to=\t\tEnd with the given document number",
        "\t--save-dir=\tStore at this location",
    )
    # One print per line keeps the output identical under Python 2 and 3.
    for line in helpLines:
        print(line)
# ---------------------------------------------------------------------------
# Command-line handling.
#
# Sets the module-level settings read by the download code below:
#   startDoc -- first document number to download (--from, default 0)
#   endDoc   -- last document number to download (--to); -1 means "not given"
#               and is replaced by the site's document total further down
#   simple   -- True when -s/--simple asks for a flat directory layout
#   baseDir  -- download root, overridden by --save-dir
# ---------------------------------------------------------------------------
startDoc = 0
endDoc = -1
simple = False

try:
    opts, args = getopt.getopt(sys.argv[1:], "hs", ["help", "simple", "from=", "to=", "save-dir="])
except getopt.GetoptError as err:
    # Unknown option or missing option argument: explain and signal failure
    # to the calling shell (2 is the conventional "bad command line" status).
    print(str(err))
    usage()
    sys.exit(2)

for o, a in opts:
    if o in ("-s", "--simple"):
        simple = True
    elif o in ("-h", "--help"):
        # Help was asked for explicitly, so this is a successful exit.
        usage()
        sys.exit()
    elif o in ("--from", "--to"):
        try:
            documentNumber = int(a)
        except ValueError:
            # A non-numeric value used to surface as an uncaught traceback.
            print("option " + o + " requires an integer document number, got: " + a)
            usage()
            sys.exit(2)
        if o == "--from":
            startDoc = documentNumber
        else:
            endDoc = documentNumber
    elif o == "--save-dir":
        # Paths are later built by plain concatenation (baseDir + city + "/"),
        # so make sure a non-empty directory ends with a slash.
        baseDir = a
        if baseDir and not baseDir.endswith("/"):
            baseDir += "/"
    else:
        # getopt only returns options declared above, so this is a safety net.
        # Exit explicitly instead of relying on assert, which -O strips.
        usage()
        sys.exit(2)
# ---------------------------------------------------------------------------
# Main download loop.
#
# Walks the listing in pages of 50 documents (the "os=" offset of baseUrl),
# and for every document in the requested range downloads its METS XML file
# and the full-size JP2 image. Files go to baseDir when `simple` is set,
# otherwise to baseDir/<city>/<date>/.
#
# Relies on the project helpers getSoup, fileDl, ensureDir, roundUpTo and
# roundDownTo imported from `functions`; their exact behavior is not visible
# in this file.
# ---------------------------------------------------------------------------
ensureDir(baseDir)

# The first listing page has a "PageRange" element whose text contains
# "... of <total>"; thousands separators are removed before converting.
documentSoup = getSoup(baseUrl + "0")
pageRangeText = documentSoup.find('div', { "id" : "PageRange" }).text
documentTotal = int(pageRangeText.split('of')[1].strip().replace(',',''))
print(str(documentTotal) + " documents to download. Let's get started!")

# -1 is the "no --to given" marker: download through the last document.
if endDoc == -1:
    endDoc = documentTotal

# Listing offsets are scanned in steps of 50, never past the document total.
# NOTE(review): documentNum is 1-based while the offset is used as-is, so a
# --from value that is an exact multiple of 50 appears to start one document
# late -- confirm against roundDownTo before changing.
firstOffset = roundDownTo(startDoc, 50)
lastOffset = min(roundUpTo(endDoc, 50), documentTotal)
documentNum = 1 + firstOffset
print("Scanning range from " + str(firstOffset) + " to " + str(lastOffset))

for i in range(firstOffset, lastOffset, 50):
    print("Documents " + str(i) + " to " + str(i+50))
    groupSoup = getSoup(baseUrl + str(i))
    for mediaContainer in groupSoup.findAll('div', { "class" : "mediaContainer" }):
        if documentNum < startDoc:
            # Report the number of the document actually being skipped
            # (the message used to be printed after the increment).
            print("Skiping Document " + str(documentNum))
            documentNum += 1
            continue
        elif documentNum > endDoc:
            # Requested range finished: this is a normal, successful exit.
            print("My job here is done!")
            sys.exit(0)
        print("\tDocument " + str(documentNum))
        documentNum += 1

        # The blockquotes of a listing entry are read as city, date, sheet.
        blockQuotes = mediaContainer.findAll('blockquote')
        try:
            print("\t\tCity: \t\t" + blockQuotes[0].text.strip())
            print("\t\tDate: \t\t" + blockQuotes[1].text.strip())
            print("\t\tSheet: \t\t" + blockQuotes[2].text.strip())
        except Exception as err:
            # Printing the metadata is best effort and must not stop the
            # download, but say that it failed instead of hiding it.
            print("\t\tCould not print document metadata: " + repr(err))

        singleDocumentUrl = mediaContainer.find('a')['href']
        print("\t\tDoc Url: \t" + singleDocumentUrl)
        singleDocumentSoup = getSoup(singleDocumentUrl)

        # The JP2 URL is cut out of the script node that follows the
        # "controlStrip" div: sixth line after the "openPdfInWindow" marker,
        # minus 11 leading and 38 trailing characters. This is tied to the
        # page's exact markup.
        theJavaScript = singleDocumentSoup.find('div', { "class" : "controlStrip" }).nextSibling.nextSibling
        theJP2Url = str(theJavaScript).split("openPdfInWindow")[1].splitlines()[5].strip()[11:-38]
        print("\t\tJP2 Url: \t" + theJP2Url)

        # The XML id is the last path component of the JP2 URL without its
        # final four characters (presumably the file extension).
        theXMLUrl = ""
        theXMLId = theJP2Url.split('/')[-1][:-4]
        if theXMLId == "bou00003":
            # Hard-coded METS location for this one id.
            theXMLUrl = "http://ucblibraries.colorado.edu/systems/digitalinitiatives/xml/bou00.xml"
        else:
            try:
                theXMLUrl = singleDocumentSoup.find('td', text=re.compile(r'METS XML View')).parent.nextSibling.nextSibling.find('a')['href']
            except (TypeError, AttributeError):
                # A missing cell or link shows up as None somewhere in the
                # chain above: None[...] raises TypeError, None.attr raises
                # AttributeError. Either way, ask the operator for the URL.
                theXMLUrl = raw_input("\t\tThe XML is not valid, check the page manually and try to find the real XML file, input the url here: ")
        print("\t\tXML Url: \t" + str(theXMLUrl))

        # Work out the target directory once; only the nested layout needs
        # a per-document directory to be created.
        if simple:
            targetDir = baseDir
        else:
            targetDir = baseDir + blockQuotes[0].text.strip() + "/" + blockQuotes[1].text.strip() + "/"
            ensureDir(targetDir)

        fileDl(theXMLUrl, targetDir, "\t\t\t")
        print("\t\tXML ID: \t" + theXMLId)

        # Look up the image dimensions in the METS file: the <file> entry
        # for this id names the technical-metadata section that holds them.
        xmlSoup = getSoup(theXMLUrl)
        admId = xmlSoup.find('filegrp').find('file', { "id" : theXMLId })['admid']
        techMd = xmlSoup.find('techmd', { "id" : admId })
        imageWidth = int(techMd.find('imagewidth').text.strip())
        imageHeight = int(techMd.find('imagelength').text.strip())
        print("\t\tWidth: \t\t" + str(imageWidth))
        print("\t\tHeight: \t" + str(imageHeight))

        # Request the whole image by passing its full width and height.
        finalUrl = theJP2Url + "&x=0&y=0&width=" + str(imageWidth) + "&height=" + str(imageHeight)
        print("\t\tFinal Url: \t" + finalUrl)
        fileDl(finalUrl, targetDir, "\t\t\t", theXMLId + ".jp2")