# pagerank.py
# Iterative PageRank over a tab-separated edge-list file given on the command line.
import numpy as np
import sys
# ---------------------------------------------------------------------------
# Load the edge list named by the first command-line argument and build the
# module-level tables used by the rest of the script:
#   data          - list of tab-split fields, one entry per edge line
#   connect_sheet - node id -> list of node ids it links to (out-links)
#   N             - number of distinct nodes
#   PR            - node id -> current rank, initialised uniformly to 1/N
#   out_count     - node id -> number of out-links
# ---------------------------------------------------------------------------
arguments = sys.argv

# Use a context manager so the input file is closed deterministically.
with open(arguments[1]) as dataset_file:
    dataset = dataset_file.readlines()

# The first four lines are skipped as a header (presumably a "#"-comment
# preamble in the input format -- TODO confirm against the datasets in use).
data = []
for line in dataset[4:]:
    # Strip only the line terminator. Blindly dropping the last character
    # would corrupt the final node id when the file has no trailing newline,
    # and would leave a stray "\r" on ids read from CRLF files.
    line = line.rstrip("\r\n")
    if not line:
        continue  # tolerate blank lines instead of failing on a missing field
    data.append(line.split("\t"))

# Every node that appears as a source or a target gets an entry, so nodes
# without out-links map to an empty list. Keys are inserted source-first,
# then target, in file order.
connect_sheet = {}
for edge in data:
    connect_sheet.setdefault(edge[0], []).append(edge[1])
    connect_sheet.setdefault(edge[1], [])

N = len(connect_sheet)

# Uniform starting distribution (the comprehension is empty when N == 0, so
# no division by zero can occur).
PR = {node: 1 / N for node in connect_sheet}

out_count = {node: len(targets) for node, targets in connect_sheet.items()}
def get_outlinks(connect, PR):
    """Collect, for every node, the rank shares it receives from its in-links.

    Args:
        connect: mapping of node -> list of nodes it links to.
        PR: mapping of node -> current rank value.

    Returns:
        A dict with one key per node in ``connect``. Each value is a list
        holding one entry per incoming link, equal to the linking node's rank
        divided by that node's number of out-links. Nodes without incoming
        links map to an empty list.
    """
    received = {node: [] for node in connect}
    for source, targets in connect.items():
        fan_out = len(targets)
        for target in targets:
            # Each out-link carries an equal share of the source's rank.
            received[target].append(PR[source] / fan_out)
    return received
def reducer(connect, PR, in_link, N, damping=0.8):
    """Recompute every node's rank from the shares it received.

    For each node in ``in_link`` the new rank is
    ``(1 - damping) / N + damping * sum(in_link[node])``.

    Args:
        connect: adjacency mapping; accepted for call compatibility but not
            read by this function.
        PR: mapping of node -> rank. It is updated in place.
        in_link: mapping of node -> list of rank shares received from the
            nodes linking to it.
        N: total number of nodes, used for the teleport term.
        damping: damping factor; defaults to 0.8, the value that was
            previously hard-coded.

    Returns:
        The same ``PR`` dict that was passed in, after being updated.
    """
    teleport = (1 - damping) / N if in_link else 0.0
    for node, shares in in_link.items():
        PR[node] = teleport
        if len(shares) > 0:
            PR[node] += damping * sum(shares)
    return PR
#print(pagerank_calculater(connect_sheet,PR,out_count,N))
def sort_by_value(dict):
    """Return the keys of ``dict`` ordered from largest to smallest value.

    Keys with equal values keep their original iteration order relative to
    one another.
    """
    ordered_keys = list(dict)
    ordered_keys.sort(key=lambda k: dict.get(k), reverse=True)
    return ordered_keys
# ---------------------------------------------------------------------------
# Iterate the rank update until the largest rank value changes by less than
# 1e-10 between two consecutive passes, then report the results.
# NOTE(review): convergence is judged on the top value only, not on the whole
# rank vector -- kept as-is so results match the existing behaviour.
# ---------------------------------------------------------------------------
count = 0  # number of passes performed
while True:
    count += 1
    # Only the largest rank is needed for the stopping test, so take the
    # maximum directly instead of sorting every node twice per pass.
    # ``default`` keeps an empty graph from raising.
    top1_value = max(PR.values(), default=0.0)
    link_in = get_outlinks(connect_sheet, PR)
    PR = reducer(connect_sheet, PR, link_in, N)
    updated_top1 = max(PR.values(), default=0.0)
    if abs(top1_value - updated_top1) < 1e-10:
        break

# A single full sort is enough once the ranks have settled.
updated_keylist = sort_by_value(PR)

print("Top ten largest page rank: ")
print("Node id: ", '\t', "page rank value")
for i in updated_keylist[0:10]:
    print(i, '\t', PR[i])

# Dump every node's final rank, one "id<TAB>rank" line per node.
with open("pagerank_result.txt", "w") as f:
    f.write("Node id: page rank value: \n")
    f.writelines(f"{i}\t{PR[i]}\n" for i in PR)