# crawl.py
# Earlier helpers kept commented out for reference: a click-tracking counter
# and list-based versions of add_to_index and lookup.
#def record_user_click(index,keyword,url):
# urls=lookup(index,keyword)
# if urls:
# for entry in urls:
# if entry[0]==url:
# entry[1]=entry[1]+1
#--------------------------------------
#def add_to_index1(index,keyword,url):###
# for item in index:
# if keyword==item[0]:
# if url not in item[1]:
# item[1].append([url,0])
# return
# index.append([keyword,[[url,0] ]])
def add_to_index(index, keyword, url):
    # index is a dict mapping keyword -> list of URLs that contain it
    if keyword in index:
        if url not in index[keyword]:  # eliminate duplicate URLs
            index[keyword].append(url)
    else:
        index[keyword] = [url]
#def lookup1(index,keyword):###
# for item in index:
# if item[0]==keyword: return item[1]
# return []
def lookup(index,keyword):
if keyword in index:
return index[keyword]
return None
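# Illustrative sketch (not part of the original file): the index is a plain
# dict mapping keyword -> list of URLs, so a session with made-up URLs might
# look like this:
#
#   idx = {}
#   add_to_index(idx, 'python', 'http://example.com/a')
#   add_to_index(idx, 'python', 'http://example.com/b')
#   add_to_index(idx, 'python', 'http://example.com/a')   # duplicate, ignored
#   lookup(idx, 'python')    # -> ['http://example.com/a', 'http://example.com/b']
#   lookup(idx, 'missing')   # -> None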
def add_page_to_index(index, url, content):
    # tokenize the page content on whitespace and common punctuation/HTML
    # characters, then record each word under this URL
    #words=content.split()
    words = split_string(content, ' !,<>?."\'')
    for word in words:
        add_to_index(index, word, url)
def split_string(source, splitlist):
    # split source on any character in splitlist; unlike str.split, consecutive
    # separator characters do not produce empty strings
    output = []
    atsplit = True
    for char in source:
        if char in splitlist:
            atsplit = True
        else:
            if atsplit:
                output.append(char)             # start a new word
                atsplit = False
            else:
                output[-1] = output[-1] + char  # extend the current word
    return output
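# Illustrative sketch (not part of the original file): split_string behaves
# like str.split but accepts a whole set of separator characters, and
# consecutive separators do not produce empty strings:
#
#   split_string('Hello there, world!', ' !,')  # -> ['Hello', 'there', 'world']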
#------------------------------------
def get_page(url):
    # fetch the raw HTML of a page; returns "" on any error
    # (uses the Python 2 urllib API; under Python 3 this would be
    # urllib.request.urlopen)
    try:
        import urllib
        usock = urllib.urlopen(url)
        data = str(usock.read())
        usock.close()
        return data
    except:
        return ""
def get_next_target(s):
    # find the next '<a href="..."' in s; return (url, position of the closing
    # quote), or (None, 0) if there are no more links
    start_link = s.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = s.find('"', start_link)
    end_quote = s.find('"', start_quote + 1)
    url = s[start_quote + 1:end_quote]
    return url, end_quote
def get_all_links(page):
links=[]
while True:
url, endpos=get_next_target(page)
if url:
links.append(url)
page=page[endpos:]
else: break
return links
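# Illustrative sketch (not part of the original file): get_all_links pulls the
# href value out of every '<a href="..."' occurrence, so for a made-up snippet:
#
#   page = '<a href="http://a.example/">A</a> <a href="http://b.example/">B</a>'
#   get_all_links(page)  # -> ['http://a.example/', 'http://b.example/']
#
# The parser is intentionally naive: it assumes href is the first attribute
# and that its value is wrapped in double quotes.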
def crawl_web1(seed,max_depth): ### earlier list-based version (no link graph); superseded by crawl_web below
tocrawl=[[seed,0]]
crawled=[]
index=[] # building the search index
while tocrawl:
page,depth=tocrawl.pop()
if page not in crawled and depth<=max_depth:
content=get_page(page)
add_page_to_index(index,page,content)
for link in get_all_links(content):
tocrawl.append([link,depth+1])
crawled.append(page)
return index
def crawl_web(seed, max_depth):
    # depth-limited crawl starting from seed (LIFO worklist, so effectively
    # depth-first): builds the keyword index and the link graph
    # (url -> list of outgoing links) used later by compute_ranks
    tocrawl = [[seed, 0]]   # <url, depth>
    crawled = []
    index = {}              # the search index
    graph = {}
    while tocrawl:
        page, depth = tocrawl.pop()
        if page not in crawled and depth <= max_depth:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            for link in outlinks:
                tocrawl.append([link, depth + 1])
            crawled.append(page)
    return index, graph
def compute_ranks(graph):
    d = 0.8          # damping factor
    numloops = 10    # fixed number of PageRank iterations
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages   # start from a uniform distribution
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
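# Illustrative sketch (not part of the original file): each loop applies the
# simplified PageRank update
#     newrank(p) = (1-d)/npages + d * sum(ranks[q]/len(graph[q]) for q linking to p)
# For a tiny made-up graph where B and C both link to A and A links back to B:
#
#   toy = {'A': ['B'], 'B': ['A'], 'C': ['A']}
#   compute_ranks(toy)
#   # A ends up with the highest rank; C, which nothing links to,
#   # settles at (1-d)/npages ~= 0.067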
def search(index, ranks, keyword):
    # return (best_url, best_rank) for the keyword, or None if it is not indexed
if keyword not in index:
return None
URLs=index[keyword]
bestUrl=URLs[0]
bestRank=ranks[bestUrl]
for url in URLs:
if ranks[url]>bestRank:
bestUrl=url
bestRank=ranks[url]
return bestUrl,bestRank
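# Illustrative sketch (not part of the original file): search just returns the
# highest-ranked URL the index lists for a keyword, e.g. with made-up data:
#
#   idx   = {'news': ['http://a.example/', 'http://b.example/']}
#   ranks = {'http://a.example/': 0.12, 'http://b.example/': 0.3}
#   search(idx, ranks, 'news')     # -> ('http://b.example/', 0.3)
#   search(idx, ranks, 'missing')  # -> None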
#-------------------------------------
def getText(url):
    # fetch a page and return the text of its <title> tag, or "(Empty)"
    page = get_page(url)
    start_title = page.find('<title>')
    if start_title == -1:
        return "(Empty)"
    end_title = page.find('</title>', start_title)
    if end_title == -1:
        return "(Empty)"
    title = page[start_title + 7:end_title]
    return title
def storeToDB(index, ranks):
    # store the index and the ranks in MySQL (Python 2 MySQLdb driver);
    # values are concatenated into the SQL strings as in the original, so
    # parameterized queries would be safer in real use
    import MySQLdb
    db = MySQLdb.connect(user='root', db='mini', passwd='cxl', host='localhost')
    cursor = db.cursor()
    for keyword in index:
        for url in index[keyword]:
            try:
                cursor.execute('insert into mini_index values("' + keyword + '", "' + url + '")')
            except:
                continue   # skip rows that fail (e.g. duplicates or bad characters)
    for url in ranks:
        try:
            title = getText(url)
            cursor.execute('insert into mini_ranks (url,rank,title) values("' + url + '", ' + str(ranks[url]) + ',"' + str(title) + '")')
            #cursor.execute('insert into mini_ranks (url,rank) values("'+url+'", '+str(ranks[url])+')' )
        except:
            continue
    db.commit()   # without an explicit commit, MySQLdb discards the inserts on close
    db.close()
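# Assumed schema (not shown in the original file): the INSERT statements above
# imply two pre-existing tables roughly like
#
#   CREATE TABLE mini_index (keyword VARCHAR(255), url VARCHAR(255));
#   CREATE TABLE mini_ranks (url VARCHAR(255), rank DOUBLE, title VARCHAR(255));
#
# This is a guess at the layout; adjust it to whatever the real database uses.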
# crawl from the seed page down to depth 2, rank the resulting link graph,
# and store the index and ranks in MySQL
index, graph = crawl_web('http://sparkloftmedia.com/', 2)
ranks = compute_ranks(graph)
##print ranks
##print search(index,ranks,'Portland')
storeToDB(index, ranks)