# ycombinator_scraper.py

# -*- coding: utf-8 -*-
"""ycombinator_scraper.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ZEil8gXiThybKkZhrV2AoyiJD9PsmVb3

# Scraping Company Information for Companies Listed on Y Combinator

<br>
<br>
<br>

# Project Outline
1. To start with, I'm going to scrape about 1000 entries from https://ycombinator.com/companies, collecting for each one:
- The listed company name
- The company's Y Combinator page URL
- The company location (taken from the listing page because it includes the country there, unlike the location shown on the company's own page)
- The company description head/slogan


![Untitled-2.png](attachment:Untitled-2.png)

<br>
<br>
<br>

2. Then I'll go through each scraped Y Combinator company page URL and grab further information (the company's description, year founded, team size, company website URL, social media URLs and management details) as it appears on the page.

3. At the end, I will create a CSV file with one row per company, in the following format:

```
'Company_Name'| 'Company_Page_URL'| 'Company_Location'| 'Description_Head'| 'Website'| 'Description'| 'Founded'| 'Team_Size'| 'Linkedin_Profile'| 'Twitter_Profile'| 'Facebook_Profile'| 'Crunchbase_Profile'| 'Active_Founder1'| 'Active_Founder2'| 'Active_Founder3'
Airbnb|	https://www.ycombinator.com/companies/airbnb|	San Francisco, CA, US,|	Book accommodations around the world.|  http://airbnb.com | Founded in August of 2008 and based in San Fra... | 2008 | 5000 | https://www.linkedin.com/company/airbnb/ | https://twitter.com/Airbnb | https://www.facebook.com/airbnb/ | https://www.crunchbase.com/organization/airbnb | Nathan Blecharczyk\nNone\nhttps://twitter.com/... | Brian Chesky\nNone\nhttps://twitter.com/bchesky\n | Joe Gebbia\nNone\nhttps://twitter.com/jgebbia\n,```

## Import necessary libraries

- use **selenium** to download the page
- use **BS4** to parse and extract information
- convert to a Pandas dataframe

Let's import the necessary packages.
"""

# Shell command, not Python: run `pip install selenium` in a terminal
# (or `!pip install selenium` in a notebook cell) before running this script.

# import necessary modules
import time
from datetime import datetime as dt

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

import requests

import pandas as pd
import numpy as np


def chrome(mode='h'):
    """Create a Chrome WebDriver in headless or visible-browser mode.

        :arguments:
            mode - 'h' for a headless browser (preferred, the default) or
                   'b' for a visible browser window.
        :returns:
            driver - the driver object instantiated, or None (after printing
                     a message) when mode is neither 'h' nor 'b'.
    """
    # Reject unknown modes up front so the happy paths below stay flat.
    if mode not in ('h', 'b'):
        print("Mode is invalid")
        return None

    # Browser mode: a plain Chrome instance with default options.
    if mode == 'b':
        return webdriver.Chrome()

    # Headless mode; "--log-level=3" disables the unwanted messages printed
    # while running with a headless browser.
    headless_options = Options()
    for flag in ("--headless", "--log-level=3"):
        headless_options.add_argument(flag)
    return webdriver.Chrome(options=headless_options)


def _scroll_to_page_bottom(browser, scroll_pause=1):
    """Scroll ``browser`` down until the document height stops growing."""
    last_height = browser.execute_script('return document.body.scrollHeight')

    scroll_count = 0
    while True:
        scroll_count += 1
        print("scroll: ", scroll_count)

        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')

        # give the page time to append the next batch of content
        time.sleep(scroll_pause)

        height = browser.execute_script('return document.body.scrollHeight')
        if height == last_height:
            print("End of page reached")
            return

        last_height = height


def get_ycombinator_page_source(page_url=None, browser=None, load_wait=20, scroll_pause=1):
    """
    Load a page in the given webdriver, scroll it to the bottom and parse it.

        :arguments:
            page_url - the url of the page
            browser - the webdriver object; it is NOT closed here, the caller
                      that created it stays responsible for closing it
            load_wait - seconds to wait after the initial page load
            scroll_pause - seconds to wait after every scroll step
        :returns:
            page_dom - a beautiful soup object of the page contents. When
                       scrolling fails part-way, a message is printed and the
                       content loaded up to that point is returned.
    """
    # Imported locally so the file-level import block does not have to change.
    from selenium.common.exceptions import WebDriverException

    time.sleep(5)
    browser.get(page_url)
    time.sleep(load_wait)  # allow the page to load

    try:
        _scroll_to_page_bottom(browser, scroll_pause)
    except WebDriverException as error:
        # Keep the browser open: page_source is read right below, and reading
        # it from a closed window would raise instead of returning the page.
        print("Scrolling stopped early: %s" % error)

    time.sleep(2)

    return BeautifulSoup(browser.page_source, "html.parser")


def _span_text(card, css_class):
    """Return the text of the first <span> with ``css_class`` in ``card``, or NaN if absent."""
    span = card.find('span', {"class": css_class})
    return np.nan if span is None else span.text


def get_company_page_info(doc):
    """
    A function to get the names of startup companies, location and url of company pages on ycombinator

        :arguments:
            doc - the bs4 object of the ycombinator page
        :returns:
            a tuple (DataFrame, url_list): the DataFrame holds one row per
            company card with the columns Company_Name, Company_Page_URL,
            Company_Location and Description_Head; url_list holds the same
            page urls in the same order. A card that lacks one of the text
            spans gets NaN in that column instead of aborting the scrape.
    """
    page_dict = {'Company_Name': [], 'Company_Page_URL': [], 'Company_Location': [], 'Description_Head': []}
    url_list = []

    # every <a> with this class combination points to an individual company segment
    cards = doc.find_all("a", {"class": "styles-module__company___1UVnl no-hovercard"})

    for card in cards:
        # the href is site-relative, so prefix the site root once and reuse it
        page_url = "https://www.ycombinator.com" + card['href']
        url_list.append(page_url)

        page_dict["Company_Name"].append(_span_text(card, "styles-module__coName___3zz21"))
        page_dict["Company_Page_URL"].append(page_url)
        page_dict["Company_Location"].append(_span_text(card, "styles-module__coLocation___yhKam"))
        page_dict["Description_Head"].append(_span_text(card, "styles-module__coDescription___1b_yd"))

    return pd.DataFrame(page_dict), url_list

"""
## Now, I will write functions to :


1. Browse each Company_Page_URL 
2. Get actual company website address
3. Get the company description as appeared on ycombinator
4. Get the company year founded and team size,
5. Get the company social media urls
6. Get the company founders' information, such as their names, positions and social media urls
7. Then, after collecting the above information for every Company_Page_URL, create a CSV by concatenating the earlier scraped df with the dataframe of the newly collected information

"""

def get_company_website(doc):
    """Return the text of the first ``<a target="_blank">`` element in ``doc``.

        :arguments:
            doc - the bs4 object of a company page
        :returns:
            the anchor's text; None when the page has no such anchor; NaN when
            ``doc`` cannot be searched at all (for example when it is None).

    NOTE(review): this takes the first target="_blank" anchor to be the
    company website link - confirm against the current page markup.
    """
    try:
        link = doc.find("a", {"target": "_blank"})
    except Exception:
        # Only ordinary errors are absorbed; KeyboardInterrupt/SystemExit
        # still propagate so a long scrape can be interrupted.
        return np.nan

    # A missing anchor (None) has no .text and is returned unchanged.
    return getattr(link, "text", link)


def get_company_description(doc):
    """Return the text of the first ``<p class="whitespace-pre-line">`` in ``doc``.

        :arguments:
            doc - the bs4 object of a company page
        :returns:
            the paragraph's text; None when the page has no such paragraph;
            NaN when ``doc`` cannot be searched at all (for example when it
            is None).
    """
    try:
        paragraph = doc.find("p", {"class": "whitespace-pre-line"})
    except Exception:
        # Only ordinary errors are absorbed; KeyboardInterrupt/SystemExit
        # still propagate so a long scrape can be interrupted.
        return np.nan

    # A missing paragraph (None) has no .text and is returned unchanged.
    return getattr(paragraph, "text", paragraph)


def get_company_year_founded_and_team_size(doc):
    """Read the founding year and the team size from a company page.

    The values come from the first two ``flex flex-row justify-between`` rows
    inside the ``space-y-0.5`` container; each row's text is split on ":" and
    the part after the first colon is returned as-is (not stripped or
    converted to a number).

        :arguments:
            doc - the bs4 object of a company page
        :returns:
            (company_year, company_size) as strings, or (NaN, NaN) when the
            container, either row, or a ":" separator is missing.
    """
    try:
        rows = doc.find("div", {"class": "space-y-0.5"}).find_all(
            "div", {"class": "flex flex-row justify-between"}
        )
        company_year = rows[0].text.split(":")[1]
        company_size = rows[1].text.split(":")[1]
    except Exception:
        # Both values are dropped together: the rows are identified only by
        # position, so a partial match cannot be labelled reliably.
        return np.nan, np.nan

    return company_year, company_size


def _href_or_nan(container, title):
    """Return the href of the ``<a title=...>`` inside ``container``, or NaN if unavailable."""
    try:
        return container.find("a", {"title": title})["href"]
    except Exception:
        # covers a missing container (None), a missing anchor and a missing href
        return np.nan


def get_company_social_media_urls(doc):
    """Collect the company's social network urls from a company page.

    The links are looked up by their ``title`` attribute inside the first
    ``space-x-2`` div of the page.

        :arguments:
            doc - the bs4 object of a company page
        :returns:
            (linkedin_url, twitter_url, facebook_url, crunchbase_url); every
            link that cannot be found is NaN.
    """
    try:
        sm_info = doc.find("div", {"class": "space-x-2"})
    except Exception:
        # an unsearchable doc makes every lookup below fall back to NaN
        sm_info = None

    # np.nan is used throughout: the np.NaN alias no longer exists in NumPy 2.
    linkedin_url = _href_or_nan(sm_info, "LinkedIn profile")
    twitter_url = _href_or_nan(sm_info, "Twitter account")
    facebook_url = _href_or_nan(sm_info, "Facebook profile")
    crunchbase_url = _href_or_nan(sm_info, "Crunchbase profile")

    return linkedin_url, twitter_url, facebook_url, crunchbase_url

        
def get_founder_info(doc):
    """Collect the founders listed on a company page.

    Every ``leading-snug`` div is treated as one founder card. Each founder
    is stored as a single string: name, position and social media links
    separated by newlines, with every link followed by a newline.

        :arguments:
            doc - the bs4 object of a company page
        :returns:
            dict mapping "Active_Founder1", "Active_Founder2", ... to the
            founder string; an empty dict when no founder card is found.
            A missing name or position appears as the text "None", and so
            does the links part when the links container is missing.
    """
    founder_dict = {}

    try:
        cards = doc.find_all("div", {"class": "leading-snug"})
    except Exception:
        return founder_dict

    for position, card in enumerate(cards, start=1):
        name_tag = card.find("div", {"class": "font-bold"})
        post_tag = card.find("div", {"class": ""})

        # A missing div is None; getattr keeps it so str() renders "None"
        # instead of raising and losing the whole company.
        founder_name = getattr(name_tag, "text", name_tag)
        founder_post = getattr(post_tag, "text", post_tag)

        try:
            anchors = card.find("div", {"class": "mt-1 space-x-2"}).find_all("a")
            founder_sm_links = "".join(str(anchor["href"]) + "\n" for anchor in anchors)
        except Exception:
            founder_sm_links = "None"

        founder_dict["Active_Founder" + str(position)] = (
            str(founder_name) + "\n" + str(founder_post) + "\n" + founder_sm_links
        )

    return founder_dict

        
def scrape_all(url=None, timeout=30):
    """Download one company page and extract every detail field from it.

        :arguments:
            url - the company's ycombinator page url
            timeout - seconds to wait for the server before requests raises,
                      so one stalled connection cannot hang the whole run
        :returns:
            dict with the keys Website, Description, Founded, Team_Size,
            Linkedin_Profile, Twitter_Profile, Facebook_Profile and
            Crunchbase_Profile, plus one Active_FounderN key per founder
            found on the page.

    NOTE: the HTTP status code is not checked, so an error page is parsed like
    any other page and its fields come back as missing values.
    """
    response = requests.get(url, timeout=timeout)
    soup_other = BeautifulSoup(response.text, "html.parser")

    company_year, company_size = get_company_year_founded_and_team_size(soup_other)
    linkedin_url, twitter_url, facebook_url, crunchbase_url = get_company_social_media_urls(soup_other)

    d = {
        'Website': get_company_website(soup_other),
        'Description': get_company_description(soup_other),
        'Founded': company_year,
        'Team_Size': company_size,
        'Linkedin_Profile': linkedin_url,
        'Twitter_Profile': twitter_url,
        'Facebook_Profile': facebook_url,
        'Crunchbase_Profile': crunchbase_url,
    }

    # founder columns come last and vary in number from company to company
    d.update(get_founder_info(soup_other))

    return d
        

def make_dataframe_and_save(df1=None, l=None, filename="ycombinator_data.csv"):
    """Join the listing dataframe with the per-company details and save as CSV.

        :arguments:
            df1 - dataframe of the listing-page data, one row per company
            l - list of dicts of per-company details, in the same row order
                as df1
            filename - path of the CSV file to write
        :returns:
            final_df - df1 and the details placed side by side (joined on
                       the row index); it is also written to ``filename``
                       without the index column.
    """
    # Use the dataframe that was passed in rather than a module-level global.
    final_df = pd.concat([df1, pd.DataFrame(l)], axis=1)

    final_df.to_csv(filename, index=False)
    return final_df

"""# Scrape the list of company links, names, locations and short descriptions (heads) from ycombinator

`chrome`: used to instantiate the webdriver as either a headless browser or not

`get_ycombinator_page_source`: Used to handle the dynamic scraping of the project. It scrolls the website till it reaches the end of the page. Afterward, beautifulsoup is used to parse the page source which is used as input to:

`get_company_page_info`: extracts the company name, link, short description/description head and location of each company. Returns a dataframe of the scraped information together with url_list.


**Note:** _Together they take `~2 minutes` to run_, and the length of url_list should be 1000
"""

# No surrounding whitespace: the string is handed to browser.get() verbatim.
root_url = "https://ycombinator.com/companies"

driver = chrome("h")  # instantiate the webdriver

time_start = time.time()

try:
    # load the listing page, scroll to the end and parse what was rendered
    doc = get_ycombinator_page_source(page_url=root_url, browser=driver)

    # df: one row per company card; url_list: the company page urls, same order
    df, url_list = get_company_page_info(doc)

    time_used = time.time() - time_start
finally:
    # quit() ends the whole WebDriver session, not just the current window,
    # and runs even when the scraping above raises.
    driver.quit()

print("The time used is %s seconds"%(time_used))

url_list_len = len(url_list)
print("The length of url_list is %s "%(url_list_len))

"""# B1. Scraping Companies' information  without multi-threading"""

# Upper bound on passes over the failing links, so a permanently broken
# page cannot keep the loop running forever.
max_passes = 3

# One slot per company, filled by position so the rows stay aligned with df
# even when a link only succeeds on a later pass. A link that never succeeds
# keeps its empty dict, which becomes a row of missing values.
l = [{} for _ in url_list]
retries = []
count = 0

time_start = time.time()

# url_list itself is left untouched; only this work list shrinks.
pending = list(enumerate(url_list))

for _ in range(max_passes):
    retries = []
    for position, link in pending:
        try:
            l[position] = scrape_all(url=link)
        except Exception as e:
            print(e)
            retries.append((position, link))
        else:
            count += 1
            print(f"{count}/{url_list_len}", end='\r')

    if not retries:
        break
    pending = retries

if retries:
    print(f"Giving up on {len(retries)} link(s) after {max_passes} passes")

time_used = time.time() - time_start
print("The time used is %s seconds"%(time_used))

"""# B2. Scraping Companies' information  WITH multi-threading"""

import concurrent.futures as cf


def _scrape_or_empty(link):
    """Scrape one company page; on failure report it and return an empty row."""
    try:
        return scrape_all(link)
    except Exception as e:
        # One failing page must not abort the whole map() and discard every
        # other result; an empty dict becomes a row of missing values.
        print(f"Failed to scrape {link}: {e}")
        return {}


start_thread = dt.now()

with cf.ThreadPoolExecutor() as exc:
    # map() yields results in the order of url_list, keeping rows aligned with df
    l1 = list(exc.map(_scrape_or_empty, url_list))

runtime_thread = (dt.now() - start_thread).total_seconds()
print(f'Total runtime - {runtime_thread}')

"""# View and save the company information DataFrame"""

# Threaded results first: build the combined frame, save it, then report
# the number of missing values per column.
df_final_threaded = make_dataframe_and_save(l=l1, df1=df)
print(df_final_threaded.isna().sum())

# Same for the sequential results. NOTE: both calls write
# "ycombinator_data.csv", so this second save overwrites the first one.
df_final = make_dataframe_and_save(l=l, df1=df)
print(df_final.isna().sum())

"""# References and Future Work

### Summary of what I did / Issues
- I have successfully completed all the procedures outlined at the beginning of the project. However, I made a serious mistake when splitting my code into modules/functions: I did not account for network failures, so the scraper does not return any dataframe when a network failure occurs.
- I hope that, with guidance and a little more time, I will be able to correct this error.

### References

- How to make infinite scrolling with selenium [https://www.youtube.com/watch?v=qhJ_gMB772U]
 
### Ideas for future work

- Make the code robust to network failure
"""