-
Notifications
You must be signed in to change notification settings - Fork 0
/
searchdata.py
85 lines (74 loc) · 2.55 KB
/
searchdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import json
def get_outgoing_links(URL):
    """Look up the outgoing-links entry stored for ``URL``.

    ``outgoinglinks.json`` is opened relative to the current working
    directory and parsed on every call.

    Args:
        URL: Key to look up in the parsed JSON object.

    Returns:
        The value stored under ``URL``, or ``None`` when the key is absent.

    Raises:
        FileNotFoundError: If ``outgoinglinks.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The context manager closes the file even when json.load raises.
    with open("outgoinglinks.json", "r") as links_file:
        outgoing_links = json.load(links_file)
    return outgoing_links.get(URL)
def get_incoming_links(URL):
    """Look up the incoming-links entry stored for ``URL``.

    ``incominglinks.json`` is opened relative to the current working
    directory and parsed on every call.

    Args:
        URL: Key to look up in the parsed JSON object.

    Returns:
        The value stored under ``URL``, or ``None`` when the key is absent.

    Raises:
        FileNotFoundError: If ``incominglinks.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The context manager closes the file even when json.load raises.
    with open("incominglinks.json", "r") as links_file:
        incoming_links = json.load(links_file)
    return incoming_links.get(URL)
def get_page_rank(URL):
    """Return the page-rank value stored for ``URL``.

    ``pagerank.json`` is opened relative to the current working directory
    and parsed on every call.

    Args:
        URL: Key to look up in the parsed JSON object.

    Returns:
        The value stored under ``URL``, or ``-1`` when the key is absent.

    Raises:
        FileNotFoundError: If ``pagerank.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The context manager closes the file even when json.load raises.
    with open("pagerank.json", "r") as rank_file:
        page_ranks = json.load(rank_file)
    # -1 is the "unknown URL" sentinel that callers already receive.
    return page_ranks.get(URL, -1)
def get_idf(word: str) -> float:
    """Return the IDF value stored on disk for ``word``.

    The value is read from the first line of ``IDF Values/<word>_idf.txt``,
    resolved relative to the current working directory.

    Args:
        word: Term whose IDF file should be read.

    Returns:
        The first line of the file parsed as a float, or ``0`` when no such
        file exists.

    Raises:
        ValueError: If the first line of the file is not a valid float.
    """
    idf_path = os.path.join("IDF Values", word + "_idf.txt")
    # isfile (rather than exists) keeps a stray directory at this path from
    # crashing open(); it falls through to the default like a missing file.
    if not os.path.isfile(idf_path):
        return 0
    # The context manager closes the file even when float() raises.
    with open(idf_path, "r") as idf_file:
        # NOTE(review): an earlier author comment here said the log function
        # had not been applied to this value -- confirm whether the stored
        # numbers are already log-scaled.
        return float(idf_file.readline())
def get_tf(URL: str, word: str) -> float:
    """Return the term-frequency value stored for ``word`` on page ``URL``.

    The value is read from ``crawling/<page>/tf/<word>_tf.txt`` relative to
    the current working directory. ``<page>`` is the text after the last
    ``/`` in ``URL`` with its final five characters removed (presumably a
    ``.html`` suffix -- confirm against the crawler that writes these files).

    Args:
        URL: Page address used to derive the per-page directory name.
        word: Term whose TF file should be read.

    Returns:
        The file contents parsed as a float, or ``0`` when the directory or
        the file does not exist.

    Raises:
        ValueError: If the file contents are not a valid float.
    """
    page_name = URL[URL.rfind("/") + 1:len(URL) - 5]
    tf_path = os.path.join("crawling", page_name, "tf", word + "_tf.txt")
    # isfile is already False when the parent directory is missing, so no
    # separate directory check is needed.
    if not os.path.isfile(tf_path):
        return 0
    # The context manager closes the file even when float() raises.
    with open(tf_path, "r") as tf_file:
        return float(tf_file.read())
def get_tf_idf(URL: str, word: str) -> float:
    """Return the TF-IDF value stored for ``word`` on page ``URL``.

    The value is read from the first line of
    ``crawling/<page>/tfidf/<word>_tfidf.txt`` relative to the current
    working directory. ``<page>`` is the text after the last ``/`` in
    ``URL`` with its final five characters removed (presumably a ``.html``
    suffix -- confirm against the crawler that writes these files).

    Args:
        URL: Page address used to derive the per-page directory name.
        word: Term whose TF-IDF file should be read.

    Returns:
        The first line of the file parsed as a float, or ``0`` when the
        directory or the file does not exist.

    Raises:
        ValueError: If the first line of the file is not a valid float.
    """
    page_name = URL[URL.rfind("/") + 1:len(URL) - 5]
    tfidf_path = os.path.join(
        "crawling", page_name, "tfidf", word + "_tfidf.txt"
    )
    # isfile is already False when the parent directory is missing, so no
    # separate directory check is needed.
    if not os.path.isfile(tfidf_path):
        return 0
    # The context manager closes the file even when float() raises.
    with open(tfidf_path, "r") as tfidf_file:
        return float(tfidf_file.readline().strip())