-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
139 lines (103 loc) · 3.59 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import urllib.request
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
def scrape_data():
url1="https://www.tapology.com/search/mma-event-figures/ppv-pay-per-view-buys-buyrate"
ufc_link="https://www.ufc.com/athletes/all"
row1=[]
source1=urllib.request.urlopen(url1)
soup1=BeautifulSoup(source1,"lxml")
table1=soup1.select('table')[0]
table1_row=table1.find_all('tr')
for tr in table1_row:
td=tr.find_all('td')
temp=[i.text for i in td]
row1.append(temp)
row1=row1[1:]
dic=dict()
for r in row1:
temp=''
#To remove commas from amount
for s in r[6]:
if s!=',':
temp=temp+s
if int(temp)>1000000:
temp1=r[2].split()
for word in temp1:
if word=='The' or re.match(r'[0-9vs]',word): #remove unnecessary numbers and words from list
continue
elif word.isspace()==False:
if word in dic:
dic[word]=dic[word]+1
else:
dic[word]=1
#Store name of fighter with highest key
key_name=''
value_num=0
for key in dic:
if dic[key]>value_num:
key_name=key
value_num=dic[key]
def find(driver):
element = driver.find_element_by_xpath("//*[@id='block-mainpagecontent']/div/div/div[1]/div/div/ul/li/a")
if element:
return element
else:
return False
#To click on load more button on the page
driver=webdriver.Chrome(executable_path="/home/akhil/crawler/chromedriver_linux64 (2)/chromedriver")
driver.get('https://www.ufc.com/athletes/all')
delay = 7 #seconds
while True:
try:
myElem = WebDriverWait(driver, delay).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')))
print("found the btn")
except TimeoutException:
print('-------------------------------------------')
break
driver.execute_script("arguments[0].click()", myElem)
print('click')
driver.implicitly_wait(delay)
time.sleep(delay)
print('sleep')
print("out")
source = driver.page_source #To store or save html page after clicking load more button
#name_tree=urllib.request.urlopen(ufc_link)
name_tree_request=BeautifulSoup(source,"lxml")
link=''
im=''
fighters_info={} #To store information of all fighters
for i in name_tree_request.find_all('div',class_='c-listing-athlete-flipcard'):
full_name=i.find('span',class_='c-listing-athlete__name').text
first=full_name.split()
print(first)
sec = first[1] if len(first)>1 else first[0]
im=i.find('span',class_='c-listing-athlete__name')
im=i.img['src']
link=i.find('div',class_='c-listing-athlete-flipcard__action').a['href']
print('-------',sec)
fighters_info[sec]={}
fighters_info[sec]['full_name']=full_name
fighters_info[sec]['img']=im
fighters_info[sec]['link']=link
link_to_fighters_page=fighters_info[key_name]['link']
fighters_image=fighters_info[key_name]['img']
record=[]
next_url="https://www.ufc.com"+link_to_fighters_page
#print(next_url)
source2=urllib.request.urlopen(next_url)
soup2=BeautifulSoup(source2,"lxml")
record_tree=soup2.find_all('div',class_='c-hero__headline-suffix')
record=list(record_tree[0].text.split())
fighters_name=fighters_info[key_name]['full_name']
fighters_name=' '.join(fighters_name.split())
fighters_record=' '.join(record)
return fighters_name,fighters_record,dic
if __name__=='__main__':
fighters_name,fighters_record,dic=scrape_data()