-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_bleacherreport_com.py
143 lines (119 loc) · 3.14 KB
/
scrape_bleacherreport_com.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#Last tested 14-11-11
import re
from bs4 import BeautifulSoup
from urllib import request
# Short site identifier reported by the scraper.
NAME = "bleacherreport.com"

# Landing page that is requested first. The power-rankings article itself is
# buried behind a link on this page, so two urllib requests are made.
# The rankings link on the main page is usually outdated; the manual
# workaround was to hard-code the article URL every time, for example:
# URL = 'http://bleacherreport.com/articles/1992194-nba-power-rankings-lay-of-the-land-heading-down-the-stretch'
# 14-11-11: trying to scrape the article URL automatically instead.
URL = "http://bleacherreport.com/nba/"

# Team abbreviations, listed in alphabetical order.
abr_list = [
    "ATL", "BKN", "BOS", "CHA", "CHI", "CLE",
    "DAL", "DEN", "DET", "GSW", "HOU", "IND",
    "LAC", "LAL", "MEM", "MIA", "MIL", "MIN",
    "NOH", "NYK", "OKC", "ORL", "PHI", "PHX",
    "POR", "SAC", "SAS", "TOR", "UTA", "WAS",
]
class ScrapeBleacherReport:
    """Scrape the NBA power rankings article linked from bleacherreport.com.

    Construction performs two HTTP requests: one for the landing page and
    one for the "NBA Power Rankings" article linked from it. The parsed
    article is kept in ``self.soup`` and its raw bytes in ``self.nba_pr``.
    """

    # Month abbreviation -> month number (see monthToNum).
    _MONTH_NUMBERS = {
        'Jan': 1,
        'Feb': 2,
        'Mar': 3,
        'Apr': 4,
        'May': 5,
        'Jun': 6,
        'Jul': 7,
        'Aug': 8,
        'Sep': 9,
        'Oct': 10,
        'Nov': 11,
        'Dec': 12,
    }

    # Full team name -> abbreviation (see nameToShortName).
    # New Orleans stays 'NOH' so the values match the module's abr_list.
    _TEAM_ABBREVIATIONS = {
        'Oklahoma City Thunder': 'OKC',
        'Indiana Pacers': 'IND',
        'Houston Rockets': 'HOU',
        'Miami Heat': 'MIA',
        'Los Angeles Clippers': 'LAC',
        'San Antonio Spurs': 'SAS',
        'Dallas Mavericks': 'DAL',
        'Phoenix Suns': 'PHX',
        'Memphis Grizzlies': 'MEM',
        'Portland Trail Blazers': 'POR',
        'Toronto Raptors': 'TOR',
        'Golden State Warriors': 'GSW',
        'Chicago Bulls': 'CHI',
        'Washington Wizards': 'WAS',
        'Brooklyn Nets': 'BKN',
        'Atlanta Hawks': 'ATL',
        'Minnesota Timberwolves': 'MIN',
        'Charlotte Hornets': 'CHA',
        'New Orleans Pelicans': 'NOH',
        'Detroit Pistons': 'DET',
        'New York Knicks': 'NYK',
        'Denver Nuggets': 'DEN',
        'Utah Jazz': 'UTA',
        'Orlando Magic': 'ORL',
        'Cleveland Cavaliers': 'CLE',
        'Boston Celtics': 'BOS',
        'Sacramento Kings': 'SAC',
        'Los Angeles Lakers': 'LAL',
        'Milwaukee Bucks': 'MIL',
        'Philadelphia 76ers': 'PHI',
    }

    # Raw strings: the original non-raw patterns relied on invalid escape
    # sequences (\s, \d, \., \,), which newer Pythons warn about.
    # Matches e.g. "Nov 11, 2014" -> ('Nov', '11', '2014').
    _DATE_RE = re.compile(
        r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?:\s)(\d+)(?:,\s)(\d+)'
    )
    # Matches "<rank>. <three or two words>"; three words are tried first.
    _RANK_RE = re.compile(r'(\d+)(?:\.\s)(\w+\s\w+\s\w+|\w+\s\w+)')

    def __init__(self, URL=URL):
        """Fetch the landing page, follow the rankings link and parse it.

        Args:
            URL: Landing page containing an ``<a>`` element whose text is
                exactly "NBA Power Rankings".

        Raises:
            ValueError: If no such link (with an ``href``) is on the page.
            urllib.error.URLError: If either HTTP request fails.
        """
        # Local import so this class does not depend on a new file-level import.
        from urllib.parse import urljoin

        self.url = URL
        # Context managers close the HTTP responses instead of leaking them.
        with request.urlopen(URL) as response:
            nba_pre_pr = response.read()
        # An explicit parser keeps results independent of which optional
        # parsers (lxml, html5lib) happen to be installed.
        self.soup = BeautifulSoup(nba_pre_pr, 'html.parser')
        # NOTE(review): newer bs4 releases prefer `string=` over `text=`;
        # `text=` is kept for compatibility with older bs4 -- confirm.
        pr_url = self.soup.find("a", text="NBA Power Rankings")
        if pr_url is None or not pr_url.get('href'):
            raise ValueError(
                'No "NBA Power Rankings" link found on {}'.format(URL)
            )
        # urljoin leaves absolute links untouched and resolves relative ones.
        with request.urlopen(urljoin(URL, pr_url['href'])) as response:
            self.nba_pr = response.read()
        self.soup = BeautifulSoup(self.nba_pr, 'html.parser')
        print('----------BleacherReport.com Initialized-------------')

    def getSite(self):
        """Return the site name constant."""
        return NAME

    def getUrl(self):
        """Return the landing-page URL this scraper was constructed with."""
        return self.url

    def getDatePublished(self):
        """Return the article date as 'YYYY-MM-DD'.

        The date is read from the first ``<time>`` element of the article,
        whose text must contain something like "Nov 11, 2014".

        Raises:
            ValueError: If there is no ``<time>`` element or its text holds
                no date in the expected format.
        """
        date_raw = self.soup.find('time')
        match = None
        if date_raw is not None:
            match = self._DATE_RE.search(date_raw.text)
        if match is None:
            raise ValueError('Could not find a publication date in the article')
        month, day, year = match.groups()
        return '-'.join(
            (year, str(self.monthToNum(month)).zfill(2), day.zfill(2))
        )

    def monthToNum(self, date):
        """Return the month number for a three-letter abbreviation.

        Raises:
            KeyError: If ``date`` is not one of 'Jan' .. 'Dec'.
        """
        return self._MONTH_NUMBERS[date]

    def nameToShortName(self, name):
        """Return the abbreviation for a full team name.

        Raises:
            KeyError: If ``name`` is not a known team name.
        """
        return self._TEAM_ABBREVIATIONS[name]

    def _shortNameFromHeading(self, raw_name):
        """Map the team text captured from a heading to its abbreviation."""
        # Collapse any whitespace (e.g. non-breaking spaces) to single spaces.
        name = ' '.join(raw_name.split())
        if name not in self._TEAM_ABBREVIATIONS:
            # The pattern tries three words first, so a two-word team name
            # followed by another word captures one word too many.
            shorter = name.rsplit(' ', 1)[0]
            if shorter in self._TEAM_ABBREVIATIONS:
                name = shorter
        return self.nameToShortName(name)

    def getRanks(self):
        """Return the teams' ranks ordered alphabetically by abbreviation.

        Every ``<h2 class="article_page-title">`` heading of the form
        "<rank>. <team name>" contributes one entry; headings that do not
        match that form are skipped.

        Returns:
            list[int]: Ranks sorted by team abbreviation.

        Raises:
            KeyError: If a heading names a team that is not in the table.
        """
        teamList = []
        for heading in self.soup.find_all('h2', 'article_page-title'):
            match = self._RANK_RE.search(heading.text)
            if match is None:
                continue
            rank = int(match.group(1))
            teamList.append((self._shortNameFromHeading(match.group(2)), rank))
            # Stop after rank 1; presumably the article counts down to 1 and
            # later headings are unrelated -- TODO confirm.
            if rank == 1:
                break
        # Sorting the (abbreviation, rank) tuples orders by abbreviation.
        teamList.sort()
        return [rank for _abbr, rank in teamList]