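"""Scrapers for Acestream channel lists.

scraper() downloads the 'elcano' page over Tor with torpy, extracts
acestream:// links, and caches them through write_cache(). A read_cache()
helper and a second scraper, electroperra(), are kept below as
commented-out reference code.
"""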
import sys

import requests  # used only by the commented-out electroperra() scraper below
from bs4 import BeautifulSoup
from torpy.http.requests import TorRequests

def scraper():
    lista = ""
    print('scraper : INFO : requesting elcano...', flush=True)
    # Read the target URL from a local cache file.
    with open('caches/site.txt', 'r') as f:
        link = f.read().strip()
    try:
        with TorRequests() as tor_requests:
            with tor_requests.get_session() as sess:
                grab = sess.get(link)
                print(grab)  # logs the Response object, e.g. <Response [200]>
    except Exception:
        # The torpy library error is unimportant and does not affect future runs.
        print("scraper : INFO : torpy line 22 could not access elcano")
        sys.exit(0)
    soup = BeautifulSoup(grab.text, 'html.parser')
    for enlace in soup.find_all('a'):
        acelink = enlace.get('href')
        canal = enlace.text
        # Keep only acestream:// links; "aquÃ" skips an anchor whose label is
        # a mis-encoded "aquí" rather than a channel name.
        if str(acelink).startswith("acestream://") and canal != "aquÃ":
            ace_id = str(acelink).replace("acestream://", "")
            lista += canal + "\n" + ace_id + "\n"
    contenido = lista.replace('\xa0', ' ').strip()
    if contenido != "":
        print("scraper : OK : channels retrieved")
        write_cache(contenido)
    else:
        print("scraper : INFO : could not access elcano")

def write_cache(contenido):
    """
    Legacy mojibake fixups, kept commented out for reference; the mis-decoded
    source sequences were likely 'Ã§', 'Ã±' and 'Ã³':
    contenido1 = contenido.replace('Ã§', 'ç', 1)
    contenido2 = contenido1.replace('Ã±', 'ñ')
    contenido_lat = contenido2.replace('Ã³', 'ó')
    """
    with open("caches/cachedList.txt", "wb") as cachedlist:
        cachedlist.write(contenido.encode('latin1'))
    print("scraper : INFO : elcano cached")
"""
def read_cache():
with open('caches/cachedList.txt', 'r') as cachedlist:
contenido = cachedlist.read()
cachedlist.close()
print("scrap: INFO: returning elcano cached List")
#return (contenido)
"""

'''
def electroperra():
    # Note: str.removeprefix()/removesuffix() below require Python 3.9+.
    grab = requests.get('https://hackmd.io/@cicvNR77Qoem0wkzPOdC8w/B1CkAt6so')
    soup = BeautifulSoup(grab.text, 'html.parser')
    lista = ""
    longLinealText = ""
    contenido1 = ""
    contenido2 = ""
    contenido3 = ""
    for webText in soup.find_all('div', attrs={'class': 'container-fluid markdown-body'}):
        longLinealText = str(webText).split('|')
        # print(longLinealText)
    for contenido in longLinealText:
        if contenido == "\x0a":
            pass
        else:
            contenido3 = contenido2
            contenido2 = contenido1
            contenido1 = contenido.strip()
            if "acestream" not in contenido2:
                pass
            else:
                if contenido3 == "":
                    contenido3 = "Evento del momento"
                if "acestream" in contenido2:
                    acelink = contenido2.removeprefix("[:link:](acestream://").removesuffix(")")
                    lista += contenido3.strip() + "\n" + acelink + "\n"
    if lista != "":
        print("electroperra : OK : channels retrieved")
    else:
        print("electroperra : INFO : could not access electroperra")
    with open("caches/ep.txt", "w") as f:
        f.write(lista)
    print("electroperra : INFO : electroperra cached")

electroperra()
'''

if __name__ == "__main__":
    scraper()