Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Google Competitions Scraper #39

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 28 additions & 10 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import datetime
from datetime import datetime, timedelta
import pickle
import os.path
from googleapiclient.discovery import build
Expand All @@ -9,7 +9,7 @@
sys.path.append( # Add absolute path of utils to sys.path
os.path.join( os.path.dirname( os.path.realpath( __file__ )),
'../student-ratings' ))
from scrapers import codechef, hackerearth
from scrapers import codechef, hackerearth, google
from database import db_tools as tools
from ratings import processor
from pathlib import Path
Expand All @@ -23,9 +23,10 @@

# Objects of this class are made for each contest; This allows the association of name, website, etc to that particular contest
class contest_details():
def __init__(self, url):
self.website = str(url[0].split('.')[1]) # Fetch the platform name
def __init__(self, url, event_name):
    """Build per-contest metadata from a calendar event's URL and summary.

    url: the event's location URL split on '/', with 'https://' and 'www.'
         already stripped by the caller (e.g. ['codechef.com', 'JAN21B']).
    event_name: the calendar event's summary string (e.g. 'Round A 2020').
    """
    self.website = str(url[0].split('.')[0]) # Fetch the platform name
    # Last URL segment identifies the contest (e.g. 'JAN21B').
    self.contest_code = str(url[-1])
    # Tokenised summary normalised to [code, round, year]; NOTE(review):
    # presumably only meaningful for Google events — confirm with callers.
    self.event_name = self.make_event_name(event_name.split())
    self.file_name = self.make_file_name()

def make_file_name(self):
Expand All @@ -43,7 +44,19 @@ def make_file_name(self):
return f'codechef-{month}-long-{self.contest_code[-2:]}.in'
elif self.website == 'hackerearth':
return f'hackerearth-{self.contest_code}.in'
elif self.website == 'g':
contest_round = self.event_name[1].lower()
year = self.event_name[-1][-2:]
return f'google-{self.contest_code}-{contest_round}-{year}'

def make_event_name(self, name_original):
    """Normalise a tokenised event summary into [code, round, year].

    name_original: the calendar summary split into words,
                   e.g. ['Kick', 'Start', 'Round', 'A', '2020'].
    Returns [contest_code, round_identifier, year_token] where the round is
    'qualification', 'finals', or the token following 'Round'.
    Raises ValueError for summaries matching none of the known formats —
    previously this fell through and implicitly returned None, which later
    crashed in make_file_name with an opaque TypeError.
    """
    # 'Qualification' is checked first: a 'Qualification Round' summary
    # contains both keywords and must map to 'qualification'.
    if 'Qualification' in name_original:
        return [self.contest_code, 'qualification', name_original[-1]]
    if 'Finals' in name_original:
        return [self.contest_code, 'finals', name_original[-1]]
    if 'Round' in name_original:
        # The token right after 'Round' is the round letter/number.
        return [self.contest_code, name_original[name_original.index('Round') + 1], name_original[-1]]
    raise ValueError(f'Unrecognised contest name: {" ".join(name_original)}')

def set_leaderboard(self, leaderboard):
    """Attach the scraped leaderboard to this contest (set by the scraper after the fact)."""
    self.leaderboard = leaderboard

Expand Down Expand Up @@ -78,7 +91,6 @@ def get_calendar_events(DAYS):
calendar_response = response.get('items', [])
return calendar_response


def get_all_contests(DAYS):
calendar_response = get_calendar_events(DAYS) # Gets all contest event in the last [DAYS] days
contests = defaultdict(list)
Expand All @@ -91,21 +103,22 @@ def get_all_contests(DAYS):
else:
for event in calendar_response:
try:
url = event['location'].replace('https://', '').split('/') # Remove the https and make the parts of the url a list
url = event['location'].replace('https://', '').replace('www.', '').split('/') # Remove the https and make the parts of the url a list

except:
log.error('The contest {} does not have an associated website and is hence ignored.'.format(event['summary']))
continue
try:
url.remove('') # To remove any unexpected blank items caused by a trailing slash
except:
pass

contest = contest_details(url) # Create a contest_details object for the contest
if contest.website not in ['codechef', 'hackerearth']: # Only codechef and hackerearth scrapers are compatible as of now
contest = contest_details(url, event['summary']) # Create a contest_details object for the contest
if contest.website not in ['codechef', 'hackerearth', 'g']: # Only google (future competitions), codechef, hackerearth scrapers are compatible as of now
continue
if contest.file_name not in existing_contests: # Checks whether the contest has already been scraped, if not writes it to scraped contests
contest_names_file.write(contest.file_name+'\n')
contests[contest.website].append(contest)
log.info(f'Writing {contest.file_name} to list of scraped contests.')
else:
log.warn(f'{contest.file_name} already exists, ignoring; To re-scrape, delete the file and remove this entry.')

Expand All @@ -128,6 +141,11 @@ def scrape(DAYS=30):
assert len(leaderboards) == len(contests['hackerearth']) # Make sure the number of leaderboards is the same as number of contests
for i in range(len(leaderboards)):
contests['hackerearth'][i].set_leaderboard(leaderboards[i])

leaderboards = google.scrape(list(contest.event_name for contest in contests['g']))
assert len(leaderboards) == len(contests['g']) # Make sure the number of leaderboards is the same as number of contests
for i in range(len(leaderboards)):
contests['g'][i].set_leaderboard(leaderboards[i])

else:
return
Expand Down Expand Up @@ -177,5 +195,5 @@ def execute(DAYS=30, map_USN=True, clean=False): #

""" Uncomment one of the two lines depending on requirement, or call your desired function yourself """

# execute(clean=True)
execute(DAYS=60, clean=True)
# make_scoreboard(map_USN=True, clean=True)
4 changes: 2 additions & 2 deletions scrapers/codechef.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def get_rankings(site, contest_code):
log.info(f'Initialised website: {site}')
total_pages = int(load_all(r'jump', 'class')[-1].text)
scraped_scoreboard = []
for page in range(total_pages):
for page in range(1, total_pages):
names = load_all(name_class, 'class')
scores = load_all(r"//td[@class='num']//div[not(@class) and (@title='' or not(@title))]", 'xpath')
if contest_code[0:4] == "COOK":
Expand All @@ -49,7 +49,7 @@ def get_rankings(site, contest_code):
[float(y.text.split()[0]) for y in scores]))
if page == total_pages-1: # Reached Last Page
break
driver.get(site + f'&page={page+2}') # go to next page
driver.get(site + f'&page={page+1}') # go to next page
return scraped_scoreboard

def scrape(contest_codes):
Expand Down
176 changes: 87 additions & 89 deletions scrapers/google.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,12 @@
from selenium import webdriver
from time import sleep
from csv import writer

IS_OUTPUT_CSV = False
scoreboard_url = "https://codingcompetitions.withgoogle.com/kickstart/round/0000000000051061"

chromeOptions = webdriver.ChromeOptions()
prefs = {'profile.managed_default_content_settings.images': 2, # does not load images on web page
'disk-cache-size': 1024} # use disk cache to reduce page load time

chromeOptions.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=chromeOptions)
driver.get(scoreboard_url)



'''
import sys
import os
from utils import selenium_utils, log
from selenium.common.exceptions import ElementClickInterceptedException
driver = selenium_utils.make_driver()
load_all = selenium_utils.load_all(driver)
load = selenium_utils.load(driver)

"""
<div class="ranking-table__row">
<div class="ranking-table__row-cell ranking-table__row-cell--left ranking-table__row-cell__rank">1</div>
<div class="ranking-table__row-cell ranking-table__row-cell--left ranking-table__row-cell__displayname">
Expand All @@ -34,74 +25,81 @@
Score is stored directly in the <span> with className="user-total-score"
UserName is stored in a <p> which is the child element of the <a> with className="ranking-table__row-cell__displayname"

'''


score_class = "user-total-score"
rank_class = "ranking-table__row-cell__rank"
name_class = "ranking-table__row-cell__displayname"
dropdown_class = "mdc-select__selected-text"
dropdown_css = "ul.mdc-list>li.mdc-list-item"
scraped_scoreboard = list()

# Wait for scoreboard to load
while not driver.find_elements_by_class_name(score_class):
sleep(1)


# Wait for scoreboard to reload after changing the number of rows to 50
number_of_rows = len(driver.find_elements_by_class_name(score_class))
total_pages = int(driver.find_element_by_class_name("ranking-table-page-number-total-pages").text.split()[1])
if number_of_rows < 50:
pass # Less than 30 people, do nothing
elif number_of_rows == 50 and total_pages == 1:
pass # Exactly 30 people, do nothing
else:
# More than 30 people, Changing the scoreboard to 50 rows
driver.find_elements_by_class_name(dropdown_class)[-1].click() # Click drop-down to change rows
sleep(0.5)
driver.find_elements_by_css_selector(dropdown_css)[-1].click() # Click on last option in the drop-down
while number_of_rows == 50:
sleep(0.5)
number_of_rows = len(driver.find_elements_by_class_name(score_class))

# Find number of pages in the scoreboard
total_pages = int(driver.find_element_by_class_name("ranking-table-page-number-total-pages").text.split()[1])
lowest_so_far="1"

for page in range(total_pages):
try:
score_elements = driver.find_elements_by_class_name(score_class)
rank_elements = driver.find_elements_by_class_name(rank_class)
name_elements = driver.find_elements_by_class_name(name_class)
scraped_scoreboard.extend(list(zip(
[x.find_element_by_tag_name("p").text for x in name_elements],
[y.text for y in rank_elements],
[z.text for z in score_elements])))
except:
score_elements = driver.find_elements_by_class_name(score_class)
rank_elements = driver.find_elements_by_class_name(rank_class)[1:]
name_elements = driver.find_elements_by_class_name(name_class)
scraped_scoreboard.extend(list(zip(
[x.find_element_by_tag_name("p").text for x in name_elements],
[y.text for y in rank_elements],
[z.text for z in score_elements])))
if page == total_pages-1: # Reached last_page
break
driver.find_elements_by_tag_name("button")[-1].click() # click to go to next page
last_name = scraped_scoreboard[-1][0]
while driver.find_elements_by_class_name(name_class)[-1].find_element_by_tag_name("p").text == last_name:
sleep(0.1) # Wait until next page has loaded

driver.close()

if IS_OUTPUT_CSV:
with open(f"{scoreboard_url.split('/')[-1]}.csv", "w") as fp:
csv_writer = writer(fp)
csv_writer.writerows(scraped_scoreboard)
else:
for row in scraped_scoreboard:
if not row[1]==lowest_so_far:
print()
lowest_so_far=row[1]
print(row[0],end=" ")
"""

base_url = r'https://codingcompetitions.withgoogle.com'
country_filter = r'?scoreboard_type=India'
score_class = r'user-total-score'
rank_class = r'ranking-table__row-cell__rank'
name_class = r'ranking-table__row-cell__displayname'
dropdown_xpath = r'//*[@id="scoreboard"]/div[2]/div/div[2]/div[2]/div/div'
dropdown_vals_class = r'mdc-list-item'
last_page_class = r'ranking-table-page-number-total-pages'
next_button_xpath = r'//*[@id="scoreboard"]/div[2]/div/div[2]/div[3]/button[2]'

def get_contest_scoreboard(contest_name):
    """Resolve the scoreboard URL for one Google contest from its archive page.

    contest_name: the [contest_type, round, ..., year] list built by
                  contest_details.make_event_name.
    Returns the scoreboard URL string.
    Raises ValueError when the round cannot be found in the archive —
    previously the function fell through and returned None, which made the
    subsequent driver.get(None) fail with a cryptic error.
    """
    contest_type = contest_name[0]
    contest_round = contest_name[1].upper()
    year = contest_name[-1]
    schedule_url = f'{base_url}/{contest_type}/archive/{year}'
    log.info(f'Getting {schedule_url}')
    driver.get(schedule_url)
    rows = load_all(r'//div[@role="cell"]', 'xpath')
    # NOTE(review): for 'qualification'/'finals' this builds e.g.
    # 'Round QUALIFICATION 2020', which may not match the archive's label —
    # TODO confirm against the live archive page.
    name = f'Round {contest_round} {year}'
    log.info(f'Finding {name}')
    for row in rows:
        if row.text == name:
            return str(row.find_element_by_tag_name('a').get_attribute('href'))
    raise ValueError(f'Could not find "{name}" in the {contest_type} archive for {year}')

def scrape(contest_names):
    """Scrape the Google coding-competition scoreboards for the given contests.

    contest_names: list of [contest_type, round, ..., year] lists produced by
                   contest_details.make_event_name.
    Returns one rank_list per contest, in input order; each rank_list entry is
    a single space-joined string of all display names sharing one rank.
    """
    leaderboards = []
    for contest_name in contest_names:
        driver.get(get_contest_scoreboard(contest_name))
        #driver.get(get_contest_scoreboard(contest_name) + country_filter)
        #load_all(r'mdc-button__label', 'class')
        #driver.refresh()

        # Switch the scoreboard to its largest page size via the drop-down.
        load(dropdown_xpath, 'xpath').click()
        load_all(dropdown_vals_class, 'class')[-1].click()

        # Find number of pages in the scoreboard
        last_page = int(load(last_page_class, 'class').text.split()[-1])
        last_rank = "1"

        # BUGFIX: this accumulator used to live outside the contest loop, so
        # every contest after the first also contained the previous contests'
        # rows, corrupting their leaderboards.
        scoreboard = []
        for page in range(last_page):
            try:
                score_elements = load_all(score_class, 'class')
                rank_elements = load_all(rank_class, 'class')
                name_elements = load_all(name_class, 'class')
                scoreboard.extend(list(zip(
                    [x.find_element_by_tag_name("p").text for x in name_elements],
                    [y.text for y in rank_elements],
                    [z.text for z in score_elements])))
            except Exception:  # narrowed from a bare except
                # Fallback: an extra leading rank cell sometimes appears;
                # drop it so the three column lists stay aligned.
                score_elements = load_all(score_class, 'class')
                rank_elements = load_all(rank_class, 'class')[1:]
                name_elements = load_all(name_class, 'class')
                scoreboard.extend(list(zip(
                    [x.find_element_by_tag_name("p").text for x in name_elements],
                    [y.text for y in rank_elements],
                    [z.text for z in score_elements])))
            try:
                load(next_button_xpath, 'xpath').click()
            except ElementClickInterceptedException: # When unable to click next button
                log.info(f'Last Page (page {last_page + 1}) reached')
                # BUGFIX: stop paging here — previously the loop kept running
                # and re-scraped the same page, duplicating its rows.
                break

        # Group users sharing a rank into one space-joined string each.
        shared_rank = []
        rank_list = []
        for index, (user_name, user_rank, _) in enumerate(scoreboard):
            # BUGFIX: enumerate replaces scoreboard.index(user), which was
            # O(n^2) and returned the first occurrence for duplicate rows.
            if user_rank != last_rank:
                # Rank changed: flush the finished group, start a new one.
                rank_list.append(' '.join(shared_rank))
                shared_rank = []
                last_rank = user_rank
            shared_rank.append(user_name)
            if index == len(scoreboard) - 1:
                # BUGFIX: always flush the final group — the original only
                # flushed when the last user tied the previous rank, silently
                # dropping a trailing group that started at the last row.
                rank_list.append(' '.join(shared_rank))
        leaderboards.append(rank_list)
    return leaderboards
2 changes: 1 addition & 1 deletion utils/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
from time import strftime

logging.basicConfig(format='%(message)s', level='DEBUG', datefmt=strftime("%d/%m/%Y, %H:%M:%S"))
logging.basicConfig(format='%(message)s', level='INFO', datefmt=strftime("%d/%m/%Y, %H:%M:%S"))

def info(message):
    """Log *message* at INFO level, prefixed with the current timestamp."""
    timestamp = datetime.now().strftime("%d/%m/%Y, %H:%M:%S")
    logging.info(f"{timestamp}: {message}")
Expand Down
6 changes: 5 additions & 1 deletion utils/selenium_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def _load(token, options='tag'):
return waiter.until(EC.presence_of_element_located((By.LINK_TEXT, token)))
if options == 'class':
return waiter.until(EC.presence_of_element_located((By.CLASS_NAME, token)))
if options == 'css':
return waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, token)))
return _load

def load_all(driver):
Expand All @@ -38,5 +40,7 @@ def _load_all(token, options='class'):
if options == 'xpath':
return waiter.until(EC.presence_of_all_elements_located((By.XPATH, token)))
if options == 'tag':
return waiter.until(EC.presence_of_all_elements_located((By.TAG_NAME, token)))
return waiter.until(EC.presence_of_all_elements_located((By.TAG_NAME, token)))
if options == 'css':
return waiter.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, token)))
return _load_all