-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathscrapper.py
56 lines (49 loc) · 2.02 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import requests
from bs4 import BeautifulSoup
# for debug
# import traceback
_ANNOUNCEMENTS_URL = "https://ktu.edu.in/eu/core/announcements.htm"
_SITE_ROOT = "https://ktu.edu.in"
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the caller forever.
_REQUEST_TIMEOUT = 30
# Text nodes at or below this length are treated as hyperlink captions
# (e.g. "notification", "timetable") rather than announcement body text.
# The threshold is arbitrary, carried over from the original implementation.
_MIN_CONTENT_LENGTH = 25


def _extract_links(tr):
    """Return the hyperlinks found in a table row as ``{'url', 'text'}`` dicts.

    Site-relative hrefs (starting with ``/``) are prefixed with the site
    root. Link extraction is best-effort: if anything goes wrong the row
    simply gets no links instead of being dropped.
    """
    links = []
    try:
        for anchor in tr.find_all("a"):
            caption = anchor.find(string=True)
            href = str(anchor.get('href'))
            if href.startswith('/'):
                href = _SITE_ROOT + href
            links.append({'url': href, 'text': caption})
    except Exception:
        # Links are optional extras; never lose an announcement over them.
        links = []
    return links


def _parse_row(tr):
    """Convert one announcement ``<tr>`` into a dict, or ``None`` to skip it.

    A row is skipped when it has fewer than two ``<b>`` elements, because
    the first is read as the date and the second as the title.
    """
    bold = tr.find_all("b")
    if len(bold) < 2:
        # Rows without both a date and a title cannot be represented;
        # skipping them keeps the remaining announcements usable.
        return None

    # Keep the text before the first ':' and drop its last three characters.
    # NOTE(review): presumably this trims the hour off a timestamp such as
    # "Mon Sep 11 00:00:00 ..." -- confirm against the live page.
    date = bold[0].text.split(':')[0][:-3]
    title = bold[1].text

    item = tr.find("li")
    texts = item.find_all(string=True) if item is not None else []
    pieces = [
        text.replace('\n', '').replace('\r', '')
        for text in texts
        if len(text) > _MIN_CONTENT_LENGTH and text != title
    ]
    content = '\n'.join(pieces).strip()

    return {
        'date': date,
        'title': title,
        'link': _extract_links(tr),
        'content': content,
    }


def scrape():
    """Fetch the KTU announcements page and return the parsed announcements.

    Returns:
        A list of dicts, one per announcement row, with the keys ``date``,
        ``title``, ``link`` (a list of ``{'url', 'text'}`` dicts) and
        ``content``. Rows that cannot be parsed are skipped.

    This function never raises: the site is unreliable, so any failure
    (network error, timeout, HTTP error status, missing table) is printed
    and an empty list is returned instead.
    """
    try:
        response = requests.get(_ANNOUNCEMENTS_URL, timeout=_REQUEST_TIMEOUT)
        # Do not try to parse an error page as if it were the announcements.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html5lib')
        table = soup.find("table", {"class": "ktu-news"})
        if table is None:
            print("announcements table (class 'ktu-news') not found")
            return []

        data = []
        for tr in table.find_all("tr"):
            row = _parse_row(tr)
            if row is not None:
                data.append(row)
    except Exception as e:
        # Top-level boundary: callers rely on getting a list back.
        # for debug
        # traceback.print_exc()
        print(str(e))
        return []
    return data