-
Notifications
You must be signed in to change notification settings - Fork 0
/
rag_crawler.py
123 lines (107 loc) · 4.92 KB
/
rag_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from lxml import html
import requests
import time
from Mail_module import Mail_module
# Minimum discount, in percent, at which an item is reported by mail
# (passed to Rag_crawler as `report`).
THREADHOLD = 70
# Crawling continues page by page while the last parsed discount is above
# this percentage (passed to Rag_crawler as `stop`); watch-list items are
# therefore only looked for down to this discount.
STOP = 15
# Text file of item names that are never reported, one name per line.
IGNORE_LIST = "rag_ignore_list.txt"
# Text file of item names that are reported regardless of THREADHOLD,
# one name per line.
WATCH_LIST = "rag_watch_list.txt"
# Server string handed to Mail_module when the mailer is created.
MAIL_SERVER = 'smtp.gmail.com:587'
# Age in seconds (2 hours) after which clean_report_list() drops an entry
# from the report list.
REFRESH_TIME = 7200
# Number of get_rag() runs between two clean_report_list() runs.
CLEAN_COUNTER = 12
# An expired entry that was re-seen more than this many times without a
# lower price is appended to IGNORE_LIST.
IGNORE_COUNTER = 200
def text(elt):
    """Return the text content of *elt* with non-breaking spaces replaced.

    Args:
        elt: Any object exposing a ``text_content()`` method that returns a
            string (the crawler passes parsed HTML elements).

    Returns:
        The string from ``elt.text_content()`` with every U+00A0
        (non-breaking space) replaced by a plain space.
    """
    return elt.text_content().replace('\xa0', ' ')
class item:
    """Record of one item that has been reported to the recipient.

    Attributes are plain and mutable; the crawler updates them in place.
    """

    def __init__(self, name, price, discount, time):
        """Store the offer details and start the sighting counter at zero.

        Args:
            name: Item name as shown in the listing.
            price: Price of the offer that was reported.
            discount: Discount of that offer, in percent.
            time: Timestamp of the report. Note that inside this method the
                parameter shadows the ``time`` module.
        """
        self.name = name
        self.price = price
        self.discount = discount
        self.time = time
        # Incremented by the crawler each time the item is seen again
        # without a lower price.
        self.counter = 0

    def __repr__(self):
        """Return a debugging representation listing every field."""
        return "item(name=%r, price=%r, discount=%r, time=%r, counter=%r)" % (
            self.name, self.price, self.discount, self.time, self.counter)
class Rag_crawler:
    """Crawl the discount listing and mail newly found or cheaper offers.

    Attributes:
        report: Discount (percent) at or above which an item is reported.
        stop: Crawling continues while the last parsed discount is above
            this value.
        mm: Mail_module instance, or None until init_mail_module() is called.
        report_list: Dict mapping item name to the `item` already reported.
        watch_list: List of item names reported regardless of `report`.
        ignore_list: List of item names that are never reported.
    """

    def __init__(self, report, stop):
        """Store the thresholds and load the watch and ignore lists.

        Args:
            report: Minimum discount (percent) for an item to be reported.
            stop: Discount (percent) at or below which crawling stops.

        Raises:
            OSError: If WATCH_LIST or IGNORE_LIST cannot be opened.
        """
        self.report = report
        self.stop = stop
        self.mm = None
        self.report_list = {}  # item name -> item already reported to recipient
        self.watch_list = self._read_name_list(WATCH_LIST)
        self.ignore_list = self._read_name_list(IGNORE_LIST)

    @staticmethod
    def _read_name_list(path):
        """Return the lines of the file at *path*, each stripped of whitespace."""
        # The with-statement closes the file; no explicit close() is needed.
        with open(path) as f:
            return [line.strip() for line in f]

    def init_mail_module(self, username, pw, to_addr):
        """Create the mailer used by get_rag().

        Args:
            username: Account name passed to Mail_module.
            pw: Password passed to Mail_module.
            to_addr: Recipient address stored on the mailer as `to_addr`.
        """
        self.mm = Mail_module(username, pw, MAIL_SERVER)
        self.mm.set_from("RAG Crawler Bot")
        self.mm.to_addr = to_addr

    def _record_offer(self, name, price, discount):
        """Update report_list for one offer; return True if it must be mailed.

        A new item is inserted and reported. A known item is reported again
        only when the new price is lower than the stored one; otherwise its
        sighting counter is incremented.
        """
        known = self.report_list.get(name)
        if known is None:
            self.report_list[name] = item(name, price, discount, time.time())
            return True
        if known.price > price:
            known.price = price
            known.discount = discount
            known.time = time.time()
            known.counter = 0
            return True
        known.counter += 1
        return False

    def get_rag(self):
        """Crawl the listing pages and mail every new or cheaper offer.

        Pages are requested from page 1 upwards while the discount of the
        last parsed row is above `self.stop`. Items in `ignore_list` are
        skipped; items whose discount reaches `self.report`, or whose name is
        in `watch_list`, are recorded and, when new or cheaper, added to the
        mail. One mail is sent per call if anything is to be reported.

        init_mail_module() must have been called before a report is sent,
        because the mail goes through `self.mm`.
        """
        base_url = 'http://ragi.al/cheap/iRO-Renewal'
        page_no = 0
        discount = self.stop + 1  # guarantees the first page is fetched
        lines = []
        while discount > self.stop:
            page_no += 1
            # NOTE(review): no timeout is passed, so a stalled connection
            # blocks this call indefinitely - consider adding one.
            page = requests.get(base_url + '/' + str(page_no))
            tree = html.fromstring(page.content)
            rows_parsed = 0
            for table in tree.xpath('//div[@class="ilist"]/table'):
                # './/tr' is relative to this table; a leading '//' would
                # select every row in the whole document for each table.
                rows = [[text(td) for td in tr.xpath('td')]
                        for tr in table.xpath('.//tr')]
                # The first row is skipped (header row).
                for row in rows[1:]:
                    if len(row) < 4:
                        # Not a data row (name, ?, price, discount).
                        continue
                    rows_parsed += 1
                    item_name = row[0].lstrip()
                    item_price = int(row[2].replace('z', '').replace(',', ''))
                    discount = int(row[3].replace('-', '').replace('%', ''))
                    if item_name in self.ignore_list:
                        continue
                    if discount >= self.report or item_name in self.watch_list:
                        if self._record_offer(item_name, item_price, discount):
                            lines.append("%s, %d, %d\n"
                                         % (item_name, item_price, discount))
            if rows_parsed == 0:
                # An empty page leaves `discount` unchanged, which would
                # otherwise keep the loop requesting pages forever.
                break
        if lines:
            msg = "".join(lines)
            self.mm.sendmail(self.mm.to_addr, "Rag Discount Item", msg)
            print("mail content: \n%s" % (msg))

    def clean_report_list(self):
        """Drop report_list entries older than REFRESH_TIME seconds.

        An expired entry whose counter exceeds IGNORE_COUNTER is first
        appended to the IGNORE_LIST file and to `self.ignore_list`.
        """
        print("Running clean_report_list")
        now = time.time()
        # Iterate over a snapshot of the keys because entries are deleted.
        for name in list(self.report_list):
            entry = self.report_list[name]
            if REFRESH_TIME < int(now - entry.time):
                if entry.counter > IGNORE_COUNTER:
                    # Persist the name and update the in-memory list; the
                    # with-statement makes sure the file is flushed and closed.
                    with open(IGNORE_LIST, "a") as f:
                        f.write("%s\n" % (entry.name))
                    self.ignore_list.append(entry.name)
                # NOTE(review): every expired entry is removed, not only the
                # ones moved to the ignore list - confirm this matches the
                # intended nesting.
                del self.report_list[name]
if __name__ == "__main__":
    # Interactive entry point: ask for the mail credentials, then crawl
    # forever, cleaning the report list every CLEAN_COUNTER crawls.

    # Imported here because only the interactive entry point needs it.
    import getpass

    username = input("Enter bot mail username: ")
    # getpass keeps the password from being echoed to the terminal.
    pw = getpass.getpass("Enter pw: ")
    to_addr = input("Enter recipient email addr: ")

    rc = Rag_crawler(THREADHOLD, STOP)
    rc.init_mail_module(username, pw, to_addr)

    clean_counter = CLEAN_COUNTER
    while True:
        rc.get_rag()
        clean_counter -= 1
        if clean_counter <= 0:
            clean_counter = CLEAN_COUNTER
            rc.clean_report_list()
        # Pause between two crawls.
        time.sleep(10)