-
Notifications
You must be signed in to change notification settings - Fork 1
/
searchengines.py
executable file
·105 lines (86 loc) · 2.94 KB
/
searchengines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
#-*- coding: UTF-8 -*-
__author__ = "helljump"
import logging
import time
import random
import settings
from grab import Grab, error
import captcha
# Module-level logger named after this module; its level is forced to DEBUG
# as soon as the module is imported.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
# NOTE(review): the line below looks like a leftover fragment of a
# commented-out logging argument; it has no effect at runtime.
#, threads %i', threading.active_count()
def get_antigate():
    """Build the captcha solver selected in ``settings.STORAGE['captcha']``.

    The ``captcha`` settings mapping is scanned for the service flags
    ``antigate``, ``captchabot`` and ``ripcaptcha``.  A service is used only
    when its flag is truthy and the matching ``<service>_key`` entry is
    present and non-empty.  If several services qualify, the one visited
    last while iterating the mapping wins.

    Returns:
        A ``captcha.Antigate``, ``captcha.CaptchaBot`` or
        ``captcha.RipCaptcha`` instance built from the configured key, or a
        plain ``captcha.BaseCaptcha`` when no service qualifies.
    """
    ag = captcha.BaseCaptcha()
    # ``or {}`` keeps the loop safe when the entry is stored as None/empty.
    params = settings.STORAGE.get('captcha', {}) or {}
    for k, v in params.items():
        if not v:
            continue
        # .get() rather than []: an enabled flag whose ``*_key`` entry is
        # missing falls back to the default solver instead of raising
        # KeyError.
        if k == 'antigate' and params.get('antigate_key'):
            ag = captcha.Antigate(params['antigate_key'])
        elif k == 'captchabot' and params.get('captchabot_key'):
            ag = captcha.CaptchaBot(params['captchabot_key'])
        elif k == 'ripcaptcha' and params.get('ripcaptcha_key'):
            ag = captcha.RipCaptcha(params['ripcaptcha_key'])
    return ag
def googlecom(q, baseurl="http://www.google.com"):
    """Yield result URLs for query ``q`` from a Google front page.

    Opens ``baseurl`` with a randomly chosen header set from
    ``settings.CONFIG['headers']``, fills the ``q`` input and submits the
    form.  Each result page is scanned with the ``//h3[@class="r"]/a/@href``
    XPath and every match is yielded as text.  The generator then follows
    the ``pnnext`` link, sleeping one second after each page load.

    Iteration ends when a page yields no result links, or when looking up
    the next-page link raises ``error.DataNotFound``.
    """
    browser = Grab(headers=random.choice(settings.CONFIG['headers']))
    # Debugging aid kept from the original (disabled):
    #   if __name__ == '__main__':
    #       browser.setup(log_dir='dumps')
    browser.go(baseurl)
    browser.set_input('q', q)
    browser.submit()

    while True:
        hrefs = browser.doc.select('//h3[@class="r"]/a/@href')
        if hrefs:
            for href in hrefs:
                yield href.text()
        else:
            log.debug('no links on page')
            return

        try:
            next_page = browser.doc.select('//a[@id="pnnext"]/@href').text()
        except error.DataNotFound:
            log.debug('no next button')
            return
        else:
            browser.go(next_page)
            time.sleep(1)
def googleru(q):
    """Return the ``googlecom`` generator for ``q`` pointed at google.ru."""
    results = googlecom(q, baseurl="http://www.google.ru")
    return results
def yandex(q):
    """Yield result URLs for query ``q`` from Yandex.

    Opens ``http://ya.ru`` with a randomly chosen header set from
    ``settings.CONFIG['headers']``, fills the ``text`` input and submits the
    form.  On every page it first looks for a captcha image whose ``src``
    starts with ``http://yandex.ru/captchaimg``; when one is found, the image
    is fetched through a cloned session, passed to the solver returned by
    ``get_antigate()`` and the answer is submitted in the ``rep`` input.

    Result links are taken from anchors whose class contains
    ``serp-item__title-link``; paging follows the anchor whose class contains
    ``pager__button_kind_next``, sleeping one second after each page load.

    Iteration ends when a page yields no result links, or when looking up
    the next-page link raises ``error.DataNotFound``.
    """
    solver = get_antigate()
    browser = Grab(headers=random.choice(settings.CONFIG['headers']))
    browser.go('http://ya.ru')
    browser.set_input('text', q)
    # Alternative direct entry point kept from the original (disabled):
    #   browser.go('http://yandex.ru/yandsearch?text=free+proxy+list&lr=9')
    browser.submit()

    while True:
        try:
            captcha_src = browser.doc.select(
                '//img[starts-with(@src,"http://yandex.ru/captchaimg")]/@src'
            ).text()
            # Fetch the image with a clone so the main session stays on the
            # captcha page whose form is submitted below.
            image = browser.clone().go(captcha_src)
            answer = solver.get_text(
                image.body_as_bytes(), 'captcha.gif', is_russian=1
            )
            browser.set_input('rep', answer)
            browser.submit()
        except error.DataNotFound:
            # DataNotFound anywhere in the captcha step is deliberately
            # ignored; the page is then parsed as a normal result page.
            pass

        hrefs = browser.doc.select(
            '//a[contains(@class,"serp-item__title-link")]/@href'
        )
        if hrefs:
            for href in hrefs:
                yield href.text()
        else:
            log.debug('no links on page')
            return

        try:
            next_page = browser.doc.select(
                '//a[contains(@class,"pager__button_kind_next")]/@href'
            ).text()
        except error.DataNotFound:
            log.debug('no next button')
            return
        else:
            browser.go(next_page)
            time.sleep(1)
# Registry of the search engines defined in this module, one list per engine:
#   [name, boolean flag, generator function taking the query string]
# NOTE(review): the boolean looks like an "enabled" switch -- confirm against
# the code that reads ENGINES.
ENGINES = (
['yandex', True, yandex],
['google.com', True, googlecom],
['google.ru', True, googleru]
)
if __name__ == "__main__":
    # Manual smoke test: print every URL Yandex returns for a sample query.
    # print() with a single argument behaves the same on Python 2 and 3; the
    # former ``print l`` statement was a SyntaxError on Python 3 and kept the
    # whole module from being imported there.
    for url in yandex("free proxy list"):
        print(url)