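"""Scrapers for Acestream channel lists.

scraper() downloads the 'elcano' page over Tor with torpy, extracts
acestream:// links, and caches them through write_cache(). A read_cache()
helper and a second scraper, electroperra(), are kept below as
commented-out reference code.
"""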
import sys

import requests  # used only by the commented-out electroperra() scraper below
from bs4 import BeautifulSoup
from torpy.http.requests import TorRequests

def scraper():
    lista = ""
    print('scraper : INFO : requesting elcano...', flush=True)
    # Read the target URL from a local cache file.
    with open('caches/site.txt', 'r') as f:
        link = f.read().strip()
    try:
        with TorRequests() as tor_requests:
            with tor_requests.get_session() as sess:
                grab = sess.get(link)
                print(grab)  # logs the Response object, e.g. <Response [200]>
    except Exception:
        # The torpy library error is unimportant and does not affect future runs.
        print("scraper : INFO : torpy line 22 could not access elcano")
        sys.exit(0)
    soup = BeautifulSoup(grab.text, 'html.parser')
    for enlace in soup.find_all('a'):
        acelink = enlace.get('href')
        canal = enlace.text
        # Keep only acestream:// links; "aquÃ" skips an anchor whose label is
        # a mis-encoded "aquí" rather than a channel name.
        if str(acelink).startswith("acestream://") and canal != "aquÃ":
            ace_id = str(acelink).replace("acestream://", "")
            lista += canal + "\n" + ace_id + "\n"
    contenido = lista.replace('\xa0', ' ').strip()
    if contenido != "":
        print("scraper : OK : channels retrieved")
        write_cache(contenido)
    else:
        print("scraper : INFO : could not access elcano")

def write_cache(contenido):
    """
    Legacy mojibake fixups, kept commented out for reference; the mis-decoded
    source sequences were likely 'Ã§', 'Ã±' and 'Ã³':
    contenido1 = contenido.replace('Ã§', 'ç', 1)
    contenido2 = contenido1.replace('Ã±', 'ñ')
    contenido_lat = contenido2.replace('Ã³', 'ó')
    """
    with open("caches/cachedList.txt", "wb") as cachedlist:
        cachedlist.write(contenido.encode('latin1'))
    print("scraper : INFO : elcano cached")
"""
def read_cache():
with open('caches/cachedList.txt', 'r') as cachedlist:
contenido = cachedlist.read()
cachedlist.close()
print("scrap: INFO: returning elcano cached List")
#return (contenido)
"""

'''
def electroperra():
    # Note: str.removeprefix()/removesuffix() below require Python 3.9+.
    grab = requests.get('https://hackmd.io/@cicvNR77Qoem0wkzPOdC8w/B1CkAt6so')
    soup = BeautifulSoup(grab.text, 'html.parser')
    lista = ""
    longLinealText = ""
    contenido1 = ""
    contenido2 = ""
    contenido3 = ""
    for webText in soup.find_all('div', attrs={'class': 'container-fluid markdown-body'}):
        longLinealText = str(webText).split('|')
        # print(longLinealText)
    for contenido in longLinealText:
        if contenido == "\x0a":
            pass
        else:
            contenido3 = contenido2
            contenido2 = contenido1
            contenido1 = contenido.strip()
            if "acestream" not in contenido2:
                pass
            else:
                if contenido3 == "":
                    contenido3 = "Evento del momento"
                if "acestream" in contenido2:
                    acelink = contenido2.removeprefix("[:link:](acestream://").removesuffix(")")
                    lista += contenido3.strip() + "\n" + acelink + "\n"
    if lista != "":
        print("electroperra : OK : channels retrieved")
    else:
        print("electroperra : INFO : could not access electroperra")
    with open("caches/ep.txt", "w") as f:
        f.write(lista)
    print("electroperra : INFO : electroperra cached")

electroperra()
'''

if __name__ == "__main__":
    scraper()