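# Core.py -- a small multi-threaded crawler (Python 2): it fetches pages,
# scans <p> elements for a user-supplied word or phrase, follows the links
# it finds, and writes the matching paragraphs to matches.txt.
#
# Example invocation (hypothetical values; any argument left out is
# prompted for interactively):
#   python Core.py http://example.com/ cats 50
# where argv[1] is the start URL, argv[2] the search word, argv[3] the page limit.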
import thread
import Queue
import re
import urllib
import urlparse
import time
import sys
import random
processedURLsCounter = 0
errorCounter = 0
matchCounter = 0
dupcheck = set()
queue = None  # Queue of links waiting to be fetched
textToLookFor = None
searchingLimit = None
matchTextRegex = None
matchesList = []
proxies = [  # Place your proxies here as host plus port, e.g. exampleproxy.com:8080
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
]  # Either delete the entries in this list or fill them all in, leaving no empty entry
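# A filled entry would look like this (hypothetical proxy; Python 2's
# urllib.urlopen accepts a proxies mapping of this shape):
#   {'http': 'http://exampleproxy.com:8080'}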
currentProxy = None
def processInitialUserInputAndInitiateVariables():
    global queue, textToLookFor, searchingLimit, matchTextRegex
    if len(sys.argv) >= 4:
        searchingLimit = sys.argv[3]
    else:
        searchingLimit = raw_input("Type in the maximum number of pages you want to search through:\n")
    queue = Queue.Queue(int(searchingLimit))  # Cap the queue at the search limit as a safeguard
    if len(sys.argv) < 2:  # Command-line arguments are optional; any missing ones are prompted for
        queue.put('http://' + raw_input("Type in a link without http://: \n"))
    else:
        queue.put(sys.argv[1])
    if len(sys.argv) >= 3:
        textToLookFor = sys.argv[2]
    else:
        textToLookFor = raw_input("Type in a word that you want to look for: \n")
    logFile = open("errorLog.txt", "w")  # Truncate the error log left over from any previous run
    logFile.close()
    # The search word is spliced between these two halves to build the full pattern:
    # an opening <p ...> tag, lazily matched text around the word, and a closing tag
    matchTextRegex = [r"((<p([A-Za-z-0-9=\"'_ \\-]|\s){0,}?[>])){1}(.{0,}?",
                      r".{0,}?)(<[\/|\\]?([A-Za-z-0-9=\"'_ \-]|\s){0,}?p[>]){1}"]
def processAndTellResult():
    print("Searching done, writing results to matches.txt ...")
    resultFiles = open("matches.txt", "w")  # Overwrite any results from a previous run
    resultFiles.write("Word matched: " + textToLookFor + "\n\n")
    for match in matchesList:
        resultFiles.write(match)
    resultFiles.close()
    print("Done! Matches have been written into matches.txt \nURLs processed: " + str(processedURLsCounter)
          + "\nURLs with positive matches: " + str(matchCounter) + "\nErrors while processing URLs: "
          + str(errorCounter))
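# For reference, matches.txt then looks roughly like this (hypothetical run):
#   Word matched: cats
#
#   Link:
#    http://example.com/
#   Matched paragraph:
#    I like cats very much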
def queueURLs(html, origLink):  # Scans HTML code for further URLs inside the domain
    for url in re.findall(r'<a[^>]+href=["\'](.[^"\']+)["\']', html, re.I):  # Pull every href out of the page
        # Absolute URLs are kept as-is; anything else gets the original page's
        # scheme and host prepended; fragments (#...) are stripped either way
        link = url.split("#", 1)[0] if url.startswith("http") else '{uri.scheme}://{uri.netloc}'.format(uri=urlparse.urlparse(origLink)) + url.split("#", 1)[0]
        if link in dupcheck:  # Skip links that have already been processed
            continue
        dupcheck.add(link)
        if len(dupcheck) > 99999:  # Cap the set's size to avoid unbounded memory growth
            dupcheck.clear()
        if processedURLsCounter >= int(searchingLimit):
            with queue.mutex:  # Clears the queue in a thread-safe way
                queue.queue.clear()
        else:
            queue.put(link)
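# Example of the resolution above (hypothetical values): with
# origLink = 'http://example.com/blog/post' and url = '/about#team',
# urlparse yields scheme 'http' and netloc 'example.com', so the queued
# link becomes 'http://example.com/about'. Only absolute and root-relative
# URLs resolve correctly here; a bare 'about.html' would be mis-joined.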
def getHTML(link):
    try:
        global processedURLsCounter, matchCounter, errorCounter, proxies, currentProxy
        # Rotate to a random proxy every 20 pages, if any proxies are configured
        if (currentProxy is None or processedURLsCounter % 20 == 0) and len(proxies) > 0:
            selectedProxy = random.choice(proxies)
            if selectedProxy['http'] != '':
                currentProxy = selectedProxy
            else:
                currentProxy = None  # An empty entry means connect directly
        processedURLsCounter += 1
        if currentProxy is not None:
            html = urllib.urlopen(link, proxies=currentProxy).read()  # Fetch the page's HTML through the proxy
        else:
            html = urllib.urlopen(link).read()
        print("Processing link: " + link + "\n")
        # Below, scan the HTML for text inside <p> elements that contains the chosen word or phrase
        for match in re.findall(matchTextRegex[0] + textToLookFor + matchTextRegex[1], html, re.I):
            # Each iteration of this loop is one matching paragraph
            matchCounter += 1
            print("Match found:\n")
            result = str(match[3])
            result = re.sub(r'<.{0,}?>', '.', result)  # Replace any tags left inside the text with a period
            print(str(result) + "\n\n")  # Show the matched paragraph to the user
            # Collect matches in memory and write them all out at the end;
            # writing to disk while processing HTML would only slow things down
            matchesList.append("Link:\n " + link + "\nMatched paragraph:\n " + result + "\n\n")
        queueURLs(html, link)  # Scan this page for further links to enqueue
    except (KeyboardInterrupt, SystemExit):  # The user can interrupt the process anytime by pressing CTRL+C
        dupcheck.add(link)
        raise
    except Exception as e:
        errorCounter += 1
        dupcheck.add(link)  # If the link raised an exception, make sure it won't be processed again
        print("Error while processing: " + link + " check errorLog.txt for more details \n")
        logFile = open("errorLog.txt", "a")
        logFile.write("Error processing " + link + " : " + str(e) + "\n")  # Append the error to the log file
        logFile.close()
        processAndTellResult()  # Checkpoint the results so the user won't lose what was collected so far
processInitialUserInputAndInitiateVariables()
# Core loop of the script: pull links off the queue and hand each one to a worker
# thread; the dupcheck test keeps the loop alive until the first page is processed
while not queue.empty() or len(dupcheck) < 1:
    thread.start_new_thread(getHTML, (queue.get(),))
    time.sleep(0.5)  # Without this pause the loop cycles too fast and mistakes a briefly empty queue for a finished crawl
processAndTellResult()