-
Notifications
You must be signed in to change notification settings - Fork 0
/
zipfy.py
50 lines (39 loc) · 1.5 KB
/
zipfy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
from collections import Counter
from HTMLParser import HTMLParser

from BeautifulSoup import BeautifulSoup
from requests import get
class Corpus:
    """Word-frequency view over a block of text.

    Attributes set by ``__init__``:
        words: the input text, lower-cased.
        word_list: ``words`` split on whitespace, in original order.
        word_set: the distinct entries of ``word_list``.
        freq_list: ``(word, count)`` tuples ordered by descending count;
            words with equal counts are ordered alphabetically.
    """

    def __init__(self, words):
        """Build the word list, word set and frequency list from *words*.

        Args:
            words: a string of whitespace-separated words. Case is
                ignored: the text is lower-cased before counting.
        """
        self.words = words.lower()
        self.word_list = self.words.split()
        self.word_set = set(self.word_list)
        # Count every word in a single pass instead of calling
        # list.count() once per distinct word.
        counts = Counter(self.word_list)
        # Negating the count gives a descending sort while the word itself
        # breaks ties, so the result does not depend on set/dict ordering.
        self.freq_list = sorted(
            counts.items(),
            key=lambda pair: (-pair[1], pair[0]),
        )
class WebCorpus(Corpus):
    """Corpus built from the text served at a URL.

    Attributes set by ``__init__`` (in addition to those of ``Corpus``):
        url: the URL that was requested.
        is_html: True when the response content type contains 'text/html'.
        is_plaintext: True when it contains 'text/plain'.
    """

    def __init__(self, url):
        """Fetch *url*, extract its text and initialise the corpus from it.

        Args:
            url: address passed unchanged to ``requests.get``.

        Raises:
            Exception: with message 'No such website' when the request
                itself fails, or 'Unsupported data type' when the response
                is neither 'text/html' nor 'text/plain' (including when no
                content-type header is present).
        """
        self.url = url
        try:
            site = get(url)
        except Exception:
            # Not a bare ``except`` so that KeyboardInterrupt/SystemExit
            # still propagate instead of being reported as a bad URL.
            raise Exception('No such website')
        self.is_html = False
        self.is_plaintext = False
        # A missing header is treated like any other unsupported type
        # rather than surfacing as a KeyError.
        content_type = site.headers.get('content-type', '')
        if 'text/html' in content_type:
            self.is_html = True
            page = BeautifulSoup(site.content, convertEntities=True)
            # Documents without a <body> yield None here; fall back to the
            # whole parse tree instead of failing on None.findAll.
            root = page.body if page.body is not None else page
            text = ' '.join(root.findAll(text=True)).strip()
            # Decode any entities that survived parsing, then collapse
            # runs of whitespace to single spaces.
            text = ' '.join(HTMLParser().unescape(text).split())
        elif 'text/plain' in content_type:
            self.is_plaintext = True
            text = ' '.join(site.content.strip().split()).lower()
        else:
            raise Exception('Unsupported data type')
        # Keep only ASCII letters, apostrophes, whitespace and hyphens.
        self.words = str(re.sub(r"[^A-Za-z'\s\-]+", '', text))
        Corpus.__init__(self, self.words)