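# crawler.py
# Crawls every page reachable from a seed URL, then records:
#   - per-page word counts, totals, and term frequencies (under "crawling/")
#   - per-word IDF values (under "IDF Values/")
#   - per-page tf-idf scores (under each page's "tfidf/" directory)
#   - outgoing/incoming link maps (outgoinglinks.json, incominglinks.json)
#   - page rank scores for every page (pagerank.json)
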
import math
import webdev
import os
import json
import matmult
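# note: webdev (read_url) and matmult (mult_matrix, euclidean_dist) are assumed to be
# the local course-provided helper modules (matmult comes from tutorial4, per the
# comment in createPageRankFile)
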
osParentDirectory = "crawling"
osTFDirectory = "tf"
osTFIDFDirectory = "tfidf"

def crawl(seed):
    # first, make a list of all the pages we've visited
    lstPagesVisited = []
    # make a queue of the pages we still need to visit
    lstQueue = []
    # dicOutgoingLinks records the outgoing links of every page we've visited
    dicOutgoingLinks = {}
    # dicIncomingLinks records the incoming links of every page
    dicIncomingLinks = {}
    # dicAllWords records every word we've seen, mapped to the number of pages it appears on
    dicAllWords = {}
    # reset information left over from any previous crawl
    resetInformation()
    # every time we come across a page, we append it to pages.txt
    osPagesFile = open("pages.txt", "a")
    lstQueue.append(seed)
    # while there's still something in the queue
    while len(lstQueue) > 0:
        # each time through, we process one link
        strSubPage = lstQueue[0]
        # record the links found on this page
        outGoingLinks = getOutgoingLinks(strSubPage)
        dicOutgoingLinks[strSubPage] = outGoingLinks
        # add this page as an incoming link of every page it points to
        for url in outGoingLinks:
            dicValueAppendElement(dicIncomingLinks, url, strSubPage)
        # since we've been on that page, add it to the pages we've visited
        lstPagesVisited.append(strSubPage)
        # we then append the page we've just visited to the text file
        osPagesFile.write(strSubPage + "\n")
        # this function records all the word-count information for the page
        recordInformation(strSubPage, dicAllWords)
        # we then get rid of the page we've just visited, which is at lstQueue[0]
        lstQueue.pop(0)
        # for every link inside the page we're on,
        for strLink in dicOutgoingLinks[strSubPage]:
            # if it's not already in the queue and not already visited (a key in dicOutgoingLinks),
            if (strLink not in dicOutgoingLinks) and (strLink not in lstQueue):
                # add that page to the queue
                lstQueue.append(strLink)
    # after we're done going through all the pages, close pages.txt
    osPagesFile.close()
    # record outgoing links and incoming links
    recordOutgoingLinksFile(dicOutgoingLinks)
    recordIncomingLinksFile(dicIncomingLinks)
    # now that we know every page, recording the IDF values is easy
    recordIDF(dicAllWords, dicOutgoingLinks)
    # record the tf-idf of every page
    lstPages = os.listdir(osParentDirectory)
    for page in lstPages:
        recordTFIDF(os.path.join(osParentDirectory, page))
    # create the page rank file
    createPageRankFile()
    # the number of pages we visited is the total number of pages, since we visited all of them
    return len(lstPagesVisited)

# O(n) time, where n is the number of files and directories removed
def resetInformation():
    # reset the "crawling" directory
    if os.path.isdir(osParentDirectory):
        recursiveDeleteDirectory(osParentDirectory)
    # then create a new directory
    createNewDirectory(osParentDirectory)
    # reset the "IDF Values" directory
    if os.path.isdir("IDF Values"):
        recursiveDeleteDirectory("IDF Values")
    # then create a new directory
    createNewDirectory("IDF Values")
    # we also need to reset the pages.txt file
    if os.path.isfile("pages.txt"):
        os.remove("pages.txt")

# this function writes the per-page information into files
# O(n) time due to the functions it calls, which are O(n)
def recordInformation(strSubPage, dicAllWords):
    # break the page down into lines
    lstLines = webdev.read_url(strSubPage).strip().split("\n")
    # dicWords stores the number of times each word appears, e.g.
    #   apple : 3
    #   peach : 5
    dicWords = {}
    # we'll read the whole page
    countWord(lstLines, dicWords)
    # name the new directory after the page (strip the path and the ".html" suffix)
    osDirectory = strSubPage[strSubPage.rfind("/") + 1:len(strSubPage) - 5]
    # join it onto the parent directory
    osDirectory = os.path.join(osParentDirectory, osDirectory)
    # recursiveDeleteDirectory(osDirectory)
    createNewDirectory(osDirectory)
    # create a file per word and write the counts into them
    recordWordCount(osDirectory, dicWords, dicAllWords)
    # create a file that keeps track of how many words there are in total
    recordTotalWordCount(osDirectory, dicWords)
    # store the term frequencies
    recordTF(osDirectory, dicWords)

# updates dicWords with the number of times each word appears on the page
# O(n) time, where n is the number of lines
def countWord(lstLines, dicWords):
    # blnParse is True while we're inside a paragraph (initialize it so the first
    # check below can't hit an unbound variable)
    blnParse = False
    # for every line in the html page
    for strLine in lstLines:
        # if it ends with "<p>", a paragraph of words follows
        if strLine.strip().endswith("<p>"):
            blnParse = True
        # "</p>" marks the end of the paragraph
        if strLine.strip() == "</p>":
            blnParse = False
        # if we're inside a paragraph (and not on the "<p>" line itself), then...
        if blnParse and not strLine.strip().endswith("<p>"):
            # if the word's not in the dictionary, make it 1
            if strLine not in dicWords:
                dicWords[strLine] = 1
            # otherwise, add 1
            else:
                dicWords[strLine] += 1

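# note: as the loop above implies, countWord assumes each word sits on its own line
# between a line ending in "<p>" and a closing "</p>" line, e.g.:
#   <p>
#   apple
#   banana
#   </p>
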
# this function recursively deletes a directory and everything inside it
# O(n) time, where n is the number of entries in the directory tree
def recursiveDeleteDirectory(osDirectory):
    for entry in os.listdir(osDirectory):
        if os.path.isdir(os.path.join(osDirectory, entry)):
            recursiveDeleteDirectory(os.path.join(osDirectory, entry))
        elif os.path.isfile(os.path.join(osDirectory, entry)):
            os.remove(os.path.join(osDirectory, entry))
    os.rmdir(osDirectory)

# O(1)
def createNewDirectory(osDirectory):
    # if the directory doesn't exist
    if not os.path.exists(osDirectory):
        # make a directory
        os.makedirs(osDirectory)

# this function creates a file for every word and writes the number of times that word appears
# O(n) time
def recordWordCount(osDirectory, dicWords, dicAllWords):
    # for every key (word) inside the dicWords dictionary,
    for strWord in dicWords:
        # create the file name
        strFileName = strWord + ".txt"
        # make the path
        osPath = os.path.join(osDirectory, strFileName)
        # open the file
        osFile = open(osPath, "w")
        # write the number of times that word appears into the file
        osFile.write(str(dicWords[strWord]))
        # then close the file
        osFile.close()
        # if the word isn't in the dictionary of all words yet,
        if strWord not in dicAllWords:
            # make the word a key with a document count of 1
            dicAllWords[strWord] = 1
        # otherwise the word appears in another document,
        else:
            # so we add 1 to its document count
            dicAllWords[strWord] += 1

# this function creates a file for every word's term frequency
# O(n) time
def recordTF(osDirectory, dicWords):
    # first, read the total number of words on the page
    osPath = os.path.join(osDirectory, "total.txt")
    osFile = open(osPath, "r")
    fltTotal = float(osFile.readline())
    osFile.close()
    # create the tf directory
    osDirectory = os.path.join(osDirectory, osTFDirectory)
    createNewDirectory(osDirectory)
    # for every word in dicWords
    for strWord in dicWords:
        # now, we record the term frequency
        osPath = os.path.join(osDirectory, strWord + "_tf.txt")
        # open the file
        osFile = open(osPath, "w")
        # write count / total into it
        osFile.write(str(float(dicWords[strWord]) / fltTotal))
        # then close the file
        osFile.close()

# this function creates a file that records the total number of words on a page
# O(n) time
def recordTotalWordCount(osDirectory, dicWords):
    # initialize the total
    intTotal = 0
    # find the path
    strPath = os.path.join(osDirectory, "total.txt")
    osFile = open(strPath, "w")
    # for every word in dicWords
    for strWord in dicWords:
        # add the number of times the word appears to intTotal
        intTotal += dicWords[strWord]
    # then write it out
    osFile.write(str(intTotal))
    osFile.close()

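# How the next two functions fit together: with N crawled documents and df(w) documents
# containing word w, recordIDF writes idf(w) = log2(N / (1 + df(w))); recordTFIDF later
# combines that with the term frequency written by recordTF as
# tf-idf(w) = log2(1 + tf(w)) * idf(w).
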
# O(n) time
def recordIDF(dicAllWords, dicOutgoingLinks):
    intTotalDocuments = len(dicOutgoingLinks)
    # for every word in the dictionary of all words
    for strWord in dicAllWords:
        # build the path inside the idf directory
        osPath = os.path.join("IDF Values", strWord + "_idf.txt")
        # write the IDF value for the word
        osFile = open(osPath, "w")
        osFile.write(str(math.log2(intTotalDocuments / (1 + dicAllWords[strWord]))))
        osFile.close()

def recordTFIDF(osDirectory):
    # read tf
    osInDirectory = os.path.join(osDirectory, osTFDirectory)
    lstTF = os.listdir(osInDirectory)
    # maps "word_tfidf.txt" -> [tf value, idf value (filled in below)]
    dicTFIDF = {}
    for tf in lstTF:
        osPath = os.path.join(osInDirectory, tf)
        filein = open(osPath, "r")
        dicTFIDF[tf[0:tf.find("_")] + "_tfidf.txt"] = [float(filein.readline().strip()), 0]
        filein.close()
    # read idf
    osInDirectory = "IDF Values"
    lstIDF = os.listdir(osInDirectory)
    for idf in lstIDF:
        if idf[0:idf.find("_")] + "_tfidf.txt" in dicTFIDF:
            osPath = os.path.join(osInDirectory, idf)
            filein = open(osPath, "r")
            dicTFIDF[idf[0:idf.find("_")] + "_tfidf.txt"][1] = float(filein.readline().strip())
            filein.close()
    # write word_tfidf.txt
    # 1. create the tf-idf directory
    osOutDirectory = os.path.join(osDirectory, osTFIDFDirectory)
    createNewDirectory(osOutDirectory)
    # 2. write the tfidf files from dicTFIDF
    for tfidf in dicTFIDF:
        osPath = os.path.join(osOutDirectory, tfidf)
        fileout = open(osPath, "w")
        fileout.write(str(math.log2(1 + dicTFIDF[tfidf][0]) * dicTFIDF[tfidf][1]))
        fileout.close()

# returns the list of links found on the page at url
def getOutgoingLinks(url):
    lstLines = webdev.read_url(url).strip().split("\n")
    # two variables to keep track of the beginning and end of a link
    intStart = 0
    intEnd = 0
    strBeginning = url[0:url.rfind("/") + 1]
    strLink = ""
    lstLinks = []
    # we'll read the whole page
    for strLine in lstLines:
        # if it starts with "<a href", we know it links to another page
        if strLine.startswith("<a href"):
            # find the index of the first "
            intStart = strLine.find('"')
            # find the index of the last "
            intEnd = strLine.rfind('"')
            # everything between the quotes is the link
            strLink = strLine[intStart + 1:intEnd]
            # if it's a relative link (it starts with "./"), prepend the base of the
            # current url so that e.g. "./N-0.html" becomes an absolute link
            if strLink.startswith("./"):
                strLink = strBeginning + strLink[2:len(strLink)]
            # now add the link to our list of links
            lstLinks.append(strLink)
    return lstLinks

# O(n) time, where n is the size of the dictionary
def recordOutgoingLinksFile(dicLinks):
    # write the file in the working directory
    file = open("outgoinglinks.json", "w")
    json.dump(dicLinks, file)
    file.close()

def recordIncomingLinksFile(dicLinks):
    # write the file in the working directory
    file = open("incominglinks.json", "w")
    json.dump(dicLinks, file)
    file.close()

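# The matrix built below is the standard damped PageRank transition matrix: with damping
# factor alpha and N pages, entry [i][j] = (1 - alpha) * A[i][j] / outdeg(i) + alpha / N,
# where A is the 0/1 adjacency matrix; a page with no outgoing links gets the uniform
# row 1/N. Power iteration on this matrix converges to the page rank vector.
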
# create a dictionary with all the urls and their page rank and store it in a json file
def createPageRankFile():
    # page rank for all the URLs
    alpha = 0.1
    euclideanDistThreshold = 0.0001
    row = []  # one row of the adjacency matrix
    adjacencyMatrix = []  # N * N matrix
    dicIdToLink = {}  # id -> link, e.g., 1:'http://.../N-3.html'
    reversedDict = {}  # link -> id, e.g., 'http://.../N-3.html':1
    # create the id -> link and link -> id dictionaries
    count = 0
    filein = open("pages.txt", "r")
    link = filein.readline().strip()
    while link != "":
        dicIdToLink[count] = link
        reversedDict[link] = count
        count += 1
        link = filein.readline().strip()
    filein.close()
    # load outgoing links
    file = open("outgoinglinks.json", "r")
    outgoinglinks = json.load(file)
    file.close()
    # create the adjacency matrix: entry [i][j] is 1 if page i links to page j
    adjacencyMatrix = [-1] * len(dicIdToLink)
    row = [-1] * len(dicIdToLink)
    for intId in dicIdToLink:
        for link in reversedDict:
            if link in outgoinglinks[dicIdToLink[intId]]:
                row[reversedDict[link]] = 1
            else:
                row[reversedDict[link]] = 0
        adjacencyMatrix[intId] = row
        row = [-1] * len(dicIdToLink)
    # scale the adjacency matrix: damp each row-normalized entry by (1 - alpha)
    # and add alpha/N to it
    count = 0
    for i in range(len(adjacencyMatrix)):
        for j in range(len(adjacencyMatrix[0])):
            if adjacencyMatrix[i][j] == 1:
                count += 1
        if count == 0:
            for j in range(len(adjacencyMatrix[0])):
                adjacencyMatrix[i][j] = (1 / len(adjacencyMatrix)) * (1 - alpha) + (alpha / len(adjacencyMatrix))
        else:
            for j in range(len(adjacencyMatrix[0])):
                adjacencyMatrix[i][j] = (adjacencyMatrix[i][j] / count) * (1 - alpha) + (alpha / len(adjacencyMatrix))
        count = 0
    # power iteration with the scaled adjacency matrix
    # initialize the iterating vector to the uniform distribution
    iteratingVector = [[1 / len(adjacencyMatrix)] * len(adjacencyMatrix)]
    # use the matmult module from tutorial4 to do the calculation
    finalPageRankVector = matmult.mult_matrix(iteratingVector, adjacencyMatrix)
    dist = matmult.euclidean_dist(finalPageRankVector, iteratingVector)
    # iterate until dist < euclideanDistThreshold, i.e., until the page rank vector is stable
    while dist >= euclideanDistThreshold:
        iteratingVector = finalPageRankVector
        finalPageRankVector = matmult.mult_matrix(iteratingVector, adjacencyMatrix)
        dist = matmult.euclidean_dist(finalPageRankVector, iteratingVector)
    # write a json file containing all the page urls and their page rank:
    # key = url, value = page rank
    dicPageRank = {}
    for url in reversedDict:
        # reversedDict: key=url, value=id, so finalPageRankVector[0][id] is the page rank for that url
        dicPageRank[url] = finalPageRankVector[0][reversedDict[url]]
    fileout = open("pagerank.json", "w")
    json.dump(dicPageRank, fileout)
    fileout.close()

# appends value to the list stored at dic[key], creating the list if the key is new
def dicValueAppendElement(dic, key, value):
    if not (key in dic):
        dic[key] = [value]
    else:
        dic[key].append(value)

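# A minimal usage sketch: the seed below is a hypothetical placeholder URL, not one
# taken from this project; substitute the real seed page before running.
if __name__ == "__main__":
    intPageCount = crawl("http://example.com/pages/N-0.html")  # hypothetical seed
    print("crawled", intPageCount, "pages")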