forked from amitKr85/project_student_allocation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filterurldict.py
52 lines (46 loc) · 1.45 KB
/
filterurldict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from urllib.request import urlopen
import pickle
import urllib
import urllib.request as url
from urllib.parse import quote
from bs4 import BeautifulSoup as bs
from dict import v_set
import bleach
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from test1 import skilldict
#import urllib2
from dict import *
import time
#from synoskill import sksyno
# Exception types needed only by the lookup below; imported here so this
# section stays self-contained.
from http.client import HTTPException
from urllib.error import HTTPError

# Skill names accepted as valid: either they already mention "skill", or a
# Wikipedia article with that title could be fetched.
corrskdic = set()
baseURL = "http://en.wikipedia.org/wiki/"
# Seconds to wait on a single request; without a timeout one stalled
# connection would block the whole run.
_REQUEST_TIMEOUT = 10


def _wikipedia_page_exists(title):
    """Return True if the Wikipedia article for *title* can be fetched.

    The outcome is also reported on stdout. A 404 response, any other HTTP
    error, and connection-level failures all return False.
    """
    fullURL = baseURL + quote(title)
    try:
        # The context manager closes the response so sockets are not leaked
        # across the many lookups performed by the loop below.
        with urlopen(fullURL, timeout=_REQUEST_TIMEOUT) as resp:
            status = resp.getcode()
    except HTTPError as err:
        # urlopen raises HTTPError for 4xx/5xx responses instead of
        # returning them, so a missing article is detected here.
        if err.code == 404:
            print("404 Found!")
        else:
            print("Could not connect to URL: {0} ".format(fullURL))
        return False
    except (OSError, HTTPException):
        # URLError, timeouts and socket errors are all OSError subclasses;
        # HTTPException covers malformed responses. KeyboardInterrupt is
        # deliberately not caught so the run can be aborted.
        print("Could not connect to URL: {0} ".format(fullURL))
        return False
    print("URL: {0} Response: {1}".format(fullURL, status))
    return True


for v in skilldict:
    # Names that already contain the word "skill" are kept without a lookup.
    if " skill" in v.lower():
        corrskdic.add(v)
        continue
    # Pause between requests to avoid hammering Wikipedia.
    time.sleep(.5)
    print('------------------------------------------', v, '-----------------------')
    # Hyphens are replaced with spaces for the article title only; the
    # original spelling is what gets stored.
    if _wikipedia_page_exists(v.replace("-", " ")):
        corrskdic.add(v)

for i in corrskdic:
    print(i)