forked from amitKr85/project_student_allocation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dict1.py
129 lines (61 loc) · 2.09 KB
/
dict1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Scrape the Wikipedia article for each term in ``v_set`` and print its
"content words": unique tokens that are alphabetic, longer than 3 chars,
and not English stopwords.

Side effects: performs one HTTP GET per term (rate-limited with a 0.5 s
sleep) and prints results to stdout as the module runs.
"""
import re
import time
import urllib
import urllib.request as url
from urllib.error import URLError
from urllib.parse import quote
from urllib.request import urlopen

import bleach
import requests
from bs4 import BeautifulSoup as bs
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from dict import v_set
from dict import *

# English stopwords used to filter out common function words.
stop_words = set(stopwords.words('english'))
v2_set = set()


def find_bad_qn(a):
    """Best-effort probe for whether the Wikipedia page *a* exists.

    Silently ignores network/HTTP failures; returns None either way.
    (Original version ignored *a* and read a loop-global instead — fixed.)
    """
    page = "http://en.wikipedia.org/wiki/" + a
    try:
        urlopen(page)
    except URLError:
        # Best-effort probe only; a failure here is deliberately ignored.
        pass


for v in v_set:
    # v_set entries are hyphenated; Wikipedia titles use spaces.
    removed = v.replace("-", " ")
    time.sleep(.5)  # be polite to the server between requests
    print('------------------------------------------', v, '-----------------------')
    # Percent-encode the title so it is safe to embed in the URL path.
    article = quote(removed)
    sauce = urllib.request.urlopen("http://en.wikipedia.org/wiki/" + article).read()
    soup = bs(sauce, 'lxml')
    # The article body lives under the 'mw-parser-output' container.
    w1 = soup.find(attrs={'class': 'mw-parser-output'})
    # Keep unique alphabetic tokens longer than 3 chars that are not stopwords.
    vo_set = {
        word
        for word in word_tokenize(w1.text)
        if word not in stop_words and len(word) > 3 and word.isalpha()
    }
    for word in vo_set:
        print(word)