################################################################################
# scraperArizona.py
#
# Takes the Arizona PDF and extracts the text out of the
# document into round.txt.
################################################################################
from urllib2 import Request, urlopen
from StringIO import StringIO
# pyPdf must be installed first (e.g. run its setup script inside the
# pyPdf directory, then come back up a level)
from pyPdf import PdfFileWriter, PdfFileReader
# URL of the PDF to scrape
file_url = 'http://www.wapa.gov/dsw/scibowl/SampleQ/Samples2003.pdf'
f = open('round.txt', 'w')  # Where all the extracted text goes
print(file_url)  # Full URL
############################################################
# At this point, I have file_url which leads to a pdf I want
writer = PdfFileWriter()
remoteFile = urlopen(Request(file_url)).read()
memoryFile = StringIO(remoteFile)
pdfFile = PdfFileReader(memoryFile)
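# Note: PdfFileReader needs a seekable file-like object, which is why the
# downloaded bytes are wrapped in StringIO rather than handed over directly.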
############################################################
# Copy every page of the scraped PDF into output.pdf
for pageNum in xrange(pdfFile.getNumPages()):
    currentPage = pdfFile.getPage(pageNum)
    writer.addPage(currentPage)
outputStream = open("output.pdf", "wb")
writer.write(outputStream)
outputStream.close()
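# Note: the round trip through output.pdf only serves to keep a local copy
# of the scraped PDF. If that copy isn't needed, the same pyPdf API can
# pull the text straight from the in-memory reader, e.g.:
#   for page in pdfFile.pages:
#       f.write(page.extractText().encode("ascii", "ignore"))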
############################################################
# Read output.pdf back and extract the text from each page,
# spitting it into round.txt
pdf = PdfFileReader(open("output.pdf", "rb"))
for page in pdf.pages:
    pgtxt = page.extractText().encode("ascii", "ignore")
    # remove newlines so each page becomes a single line
    pgtxt = pgtxt.replace('\n', ' ').replace('\r', '')
    print pgtxt
    f.write(pgtxt)
f.close()
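
# ------------------------------------------------------------------
# A minimal Python 3 sketch of the same flow, kept commented out so the
# Python 2 script above still runs as-is. It assumes the PyPDF2 package
# (the successor to pyPdf) is installed; note that recent releases rename
# PdfFileReader/extractText to PdfReader/extract_text.
#
# from io import BytesIO
# from urllib.request import urlopen
# from PyPDF2 import PdfFileReader
#
# remote = urlopen('http://www.wapa.gov/dsw/scibowl/SampleQ/Samples2003.pdf').read()
# pdf = PdfFileReader(BytesIO(remote))
# with open('round.txt', 'w') as out:
#     for page in pdf.pages:
#         # collapse newlines so each page is one line of text
#         out.write(page.extractText().replace('\n', ' ').replace('\r', ''))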