An exploratory data analysis conducted on Hollywood movies released 2000 - 2018. Scraped, cleaned, and visualized.

import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
# Accumulators for the scraped fields. The loop below appends to movie_names,
# imdb_ratings, meta_ratings, movie_genres and runtimes once per kept movie.
# NOTE(review): nothing visible in this cell appends to movie_years or
# movie_earnings — those statements appear to be missing from this copy.
movie_names = []
movie_years = []
imdb_ratings = []
meta_ratings = []
movie_earnings = [] 
movie_genres = []
runtimes = []

pages = 10  # 10 pages per year.
start_year = 2000
end_year = 2018

# Outer loop: one iteration per release year, wrapped in a notebook progress bar.
# NOTE(review): range(start_year, end_year+1) yields end_year-start_year+1 years,
# but `total` is end_year-start_year, so the bar's total is one short.
for i in tqdm_notebook(range(start_year, end_year+1), total = end_year-start_year, unit = 'year'):
  # k is the value sent as the `start=` query parameter; it advances by 50 per page.
  k = 1
  for j in range(1, pages + 1):
    # NOTE(review): the leading URL literal is empty in this copy, so the string
    # built here has no scheme/host and is not a usable URL — restore the
    # search endpoint before running.
    imdb_url = '' + str(i) + '-01-01,' + str(i) + '-12-31&sort=num_votes,desc&start=' + str(k) + '&ref_=adv_nxt'
    page_unparsed = urllib.request.urlopen(imdb_url)
    page_parsed = BeautifulSoup(page_unparsed, 'html.parser')

    k += 50

    # Each matching <div> is treated as one movie entry.
    movie_divs = page_parsed.find_all('div', class_ = 'lister-item mode-advanced')

    for movie in movie_divs:
      # NOTE(review): skip_movie is assigned but never read in the visible code.
      skip_movie = False 

      meta_rating_unparsed = movie.find('div', class_ = 'inline-block ratings-metascore')
      # NOTE(review): find_all returns a (possibly empty) list rather than None,
      # so the `gross_unparsed is None` test below can never be true.
      gross_unparsed = movie.find_all('span', attrs = {'name' : 'nv'})

      # NOTE(review): this `if` has no body in this copy (a syntax error as
      # written) — presumably it skipped the movie with `continue`; confirm
      # against the original notebook.
      if meta_rating_unparsed is None or gross_unparsed is None:
      temp = movie.find_all('span', attrs = {'name' : 'nv'})
      # NOTE(review): `is not 2` tests object identity, not value; `!= 2` is the
      # value comparison. The body of this `if` is also missing in this copy —
      # presumably it skipped movies without exactly two 'nv' spans and
      # otherwise recorded the gross; confirm.
      if len(temp) is not 2:

      # Metascore: remove spaces, split on newlines, and parse the second piece as an int.
      meta_ratings.append(int(meta_rating_unparsed.text.replace(" ", "").split("\n")[1]))
      # Genre: keep only the first entry of the comma-separated genre text.
      movie_genres.append(movie.find('span', attrs = {'class' : 'genre'}).text.strip().split(",")[0])
      imdb_ratings.append(float(movie.find('div', class_ = 'inline-block ratings-imdb-rating').text))
      movie_names.append(movie.find('h3', class_ = 'lister-item-header').find('a').text)

      # The year text is split on spaces.
      # NOTE(review): the branch bodies that would use `year` are missing in this
      # copy, so nothing is appended to movie_years; confirm against the original.
      year = movie.find('span', class_ = 'lister-item-year text-muted unbold').text.split(" ")
      if len(year) == 1:

      # strip('min') removes leading/trailing 'm', 'i' and 'n' characters before int().
      runtimes.append(int(movie.find('span', class_ = 'runtime').text.strip('min')))
# Building the frame from the scraped lists is disabled (commented out); the
# analysis loads a previously saved CSV instead.
# movies = pd.DataFrame(list(zip(movie_names, movie_years, movie_genres, imdb_ratings, meta_ratings, movie_earnings, runtimes)), columns =['name', 'year', 'genre', 'imdb', 'meta', 'gross', 'runtime']) 
movies = pd.read_csv("movies.csv")
# Drop the "Unnamed: 0" column — presumably the index column written by the
# commented-out to_csv call below; confirm.
movies.drop("Unnamed: 0", axis = 1, inplace = True)
# Remove the rows labelled 170 and 1001; the reason is not recorded in this file.
movies.drop([170, 1001], axis = 0, inplace = True)
# movies.to_csv("movies.csv")
imdb meta gross runtime
count 4877.000000 4877.000000 4877.000000 4877.000000
mean 6.517572 56.017839 37.897115 107.052696
std 0.971941 17.860327 68.181217 18.182487
min 1.500000 1.000000 0.000000 61.000000
25% 6.000000 43.000000 0.720000 95.000000
50% 6.600000 57.000000 11.440000 104.000000
75% 7.200000 69.000000 45.170000 116.000000
max 9.000000 100.000000 936.660000 366.000000
name year genre imdb meta gross runtime
0 Gladiator 2000 Action 8.5 67 187.71 155
1 Memento 2000 Mystery 8.5 80 25.54 113
2 Snatch 2000 Comedy 8.3 55 30.33 102
3 Requiem for a Dream 2000 Drama 8.3 68 3.64 102
4 X-Men 2000 Action 7.4 64 157.30 104


# Bring matplotlib's style module into scope as `style`; a bare
# `import as style` names no module and is a syntax error.
from matplotlib import style
style.use('seaborn-poster')  # sets the size of the charts

Palettes :

Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r


sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10})
<seaborn.axisgrid.FacetGrid at 0x24a65eb9080>


sns.kdeplot(movies['imdb'], movies['meta'], cmap = sns.cubehelix_palette(light = 1, as_cmap = True), shade = True)
<matplotlib.axes._subplots.AxesSubplot at 0x24a67f75cc0>


There is a high positive correlation between both the scoring metrics.

**Inference 1:** People and critics tend to have the same view on movies. Both have the same opinions on movies with a rating of 6 (or 60).

fig, ax = plt.subplots()
sns.kdeplot(movies['imdb'], ax = ax)
sns.kdeplot(movies['meta']/10, ax = ax)
<matplotlib.axes._subplots.AxesSubplot at 0x24a6800f080>


**Inference 2 : ** Meta ratings follow a normal distribution while IMDb ratings tend to favour the 5 - 8 range.

# Map each distinct genre to the IMDb ratings of the movies in that genre,
# then assemble the per-genre Series into a frame with one column per genre.
dicts = {
  genre: movies.loc[movies['genre'] == genre, 'imdb']
  for genre in movies['genre'].unique()
}
temp = pd.DataFrame(dicts)

# Very wide, short canvas for the per-genre chart.
fig, ax = plt.subplots(figsize=(100, 5))


**Inference 3:** Biography movies tend to have high ratings.

fig, ax = plt.subplots(figsize=(21,8))
ax = sns.boxplot(x = 'genre', y = 'imdb', data = movies)


**Inference 4: ** In the world of movies, there are a lot of outliers. I don't think a machine learning model could work well on datasets like this (to predict the likeability of a movie)- unless we have more attributes.

# movie with lowest imdb rating
name       Beyond the Lights
year                    2014
genre                  Drama
imdb                     6.9
meta                      73
gross                  14.62
runtime                  116
Name: 3993, dtype: object
# movie with highest imdb rating
name       Iron Man
year           2008
genre        Action
imdb            7.9
meta             79
gross        318.41
runtime         126
Name: 2174, dtype: object
# movie with lowest meta rating
name       Welcome to Marwen
year                    2018
genre              Biography
imdb                       6
meta                      40
gross                  10.76
runtime                  116
Name: 4871, dtype: object
# movie with highest meta rating
name       Maleficent
year             2014
genre          Action
imdb                7
meta               56
gross          241.41
runtime            97
Name: 3797, dtype: object

**Inference 5 - 8: **

  • IMDb:
    • Highest rated: The Dark Knight (2008). Score: 9
    • Lowest rated: Saving Christmas (2014). Score: 1.5
  • Metacritic:
    • Highest rated: Boyhood(2014). Score: 100
    • Lowest rated: Death of a Nation (2018). Score: 1


Each movie had anywhere from 1 to 3 genres. To simplify the process, I figured the first genre for each movie would be most accurate. Looking at the dataset, only the 9 most common genres account for nearly all of the movies.

# sns.pairplot(movies, hue = 'genre')
# sns.lmplot(x = 'imdb', y = 'meta', data = movies, hue = 'genre')
sns.lmplot(x = 'imdb', y = 'gross', data = movies.sample(1500), scatter_kws={"s": 5})
<seaborn.axisgrid.FacetGrid at 0x24a6828e4a8>


**Inference 9: ** No correlation between IMDb scores and movie box office. Same can be inferred for meta scores.

# By default, barplot shows the mean. The graph below shows that family movies had the highest gross per movie.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('mean gross per genre')


Hmmm. Notice the black line (the error bar) for family movies. It tells us that there is a high chance of it being erroneous.

Comedy         1297
Drama          1041
Action          962
Crime           351
Biography       321
Animation       256
Adventure       242
Horror          180
Documentary     176
Mystery          15
Fantasy          14
Romance           8
Thriller          6
Musical           2
Family            2
Music             2
Sci-Fi            1
War               1
Name: genre, dtype: int64
movies[movies['genre'] == 'Family']
name year genre imdb meta gross runtime
1201 Raise Your Voice 2004 Family 5.9 33 10.41 103
4502 Beauty and the Beast 2017 Family 7.2 65 504.01 129
# Total every numeric column per genre. numeric_only=True keeps text columns
# such as 'name' out of the aggregation, so they are neither concatenated nor
# a source of errors on pandas versions that no longer drop them silently.
temp = movies.groupby('genre').sum(numeric_only=True)
# 'ppm': the genre's total gross divided by its total runtime.
temp['ppm'] = temp['gross'] / temp['runtime']
# Show the five genres with the highest gross per unit of runtime.
temp.sort_values('ppm', ascending=False).head()
imdb meta gross runtime ppm
Family 13.1 98 514.42 232 2.217328
Animation 1720.9 15449 24947.05 23922 1.042850
Action 6054.4 47835 67386.31 108042 0.623705
Adventure 1570.0 13489 16524.08 26933 0.613525
Mystery 98.8 832 949.80 1638 0.579853

**Inference 12:** Family movies had the best profit-per-movie value (514.42 million for 2 movies!) and earned a whopping 2.2 million dollars per minute of screentime. This amount was largely due to Beauty and the Beast.

Inference 13: The most profitable movie genres are Family, Animation, and Action.

fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, estimator = sum, palette=("Blues_d"))
plt.title('sum of grosses per genre')


**Inference 14: ** Action movies had the most profits (67k million), followed by comedy movies (34k million) and then animation movies (25k million).


fig, ax = plt.subplots(figsize=(18,4))
ax = sns.countplot(x = 'genre', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels() ,rotation = 30)
plt.title('number of movies per genre')


Inference 14:: Comedy movies were the most frequently released, followed by drama and action movies.

fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'imdb', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average imdb rating per genre')


**Inference 15:** For IMDb ratings, on average, horror movies got the lowest ratings while war, documentary, and musical movies seem to get the highest.

Inference 16: IMDb movies tend to approximately get the same ratings regardless of genre.

fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'meta', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average meta rating per genre')


Inference 17:: For meta ratings, documentary movies get the highest average rating.

Inference 18: Mean Metacritic ratings tend to vary from genre to genre, unlike IMDb ratings.

Inference 19: Horror movies are the most disliked.

Inference 20: Documentary movies are the most liked.

fig, ax = plt.subplots()
sns.kdeplot(movies['imdb'], ax=ax)
sns.kdeplot(movies['meta']/10, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x24a6a8c40b8>



fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'runtime', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average runtime per genre')


imdb        7.20
meta       57.50
gross       0.08
runtime    90.50
Name: Musical, dtype: float64
imdb         7.036449
meta        63.781931
gross       25.985452
runtime    117.529595
Name: Biography, dtype: float64

Inference 21: Biography movies had the longest average duration (117 minutes). Musical movies had the shortest (90 minutes).

name       Ghosts of the Abyss
year                      2003
genre              Documentary
imdb                       6.9
meta                        67
gross                    17.09
runtime                     61
Name: 1020, dtype: object
name       The Best of Youth
year                    2003
genre                  Drama
imdb                     8.5
meta                      89
gross                   0.25
runtime                  366
Name: 946, dtype: object

Inference 22: The longest movie was 'The Best of Youth', which ran for 366 minutes (about 6 hours). The shortest movie was 'Ghosts of the Abyss', which ran for 61 minutes (about 1 hour).

sns.jointplot(x = 'runtime', y = 'gross', data = movies,  kind = 'reg', scatter_kws={"s": 3})


Inference 23: Runtime and box office are slightly correlated, meaning that longer movies tend to earn more (the correlation coefficient is about 0.3).


imdb meta gross runtime
2000 1644.9 13595 7413.40 27113
2001 1728.3 14157 7956.77 28360
2002 1774.5 15066 9072.36 28557
2003 1705.4 14524 8771.08 28026
2004 1753.3 14558 9186.20 28528
2005 1748.4 14947 8807.64 28870
2006 1897.1 16516 9157.57 31077
2007 1846.1 15657 9321.09 29943
2008 1732.3 14605 9663.57 28448
2009 1734.7 14517 10388.39 28545
2010 1718.8 14939 10085.55 27970
2011 1757.8 15134 9947.34 28678
2012 1633.6 14207 10321.71 26685
2013 1794.9 15628 10685.90 29671
2014 1689.4 14648 10544.93 27719
2015 1480.1 13127 10497.37 24504
2016 1546.5 13733 11253.12 25662
2017 1363.3 12606 10532.35 22917
2018 1228.9 10953 11213.83 20714
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'year', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average gross per year')


Hmm. The black bars (the error bars) indicate that there's a high chance of error. Guess this is what happens with seemingly unpredictable data like movies.

Inference 24: Average earning for movies increased as time went on.

sns.countplot(x = 'year', data = movies, palette=("Blues_d"))
plt.xticks(rotation = 45)
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18]), <a list of 19 Text xticklabel objects>)


sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10}, hue = 'year')
<seaborn.axisgrid.FacetGrid at 0x24a6aab2b70>


imdb meta gross runtime
2000 6.475984 53.523622 29.186614 106.744094
2001 6.497368 53.221805 29.912669 106.616541
2002 6.476277 54.985401 33.110803 104.222628
2003 6.534100 55.647510 33.605670 107.379310
2004 6.566667 54.524345 34.405243 106.846442
2005 6.451661 55.154982 32.500517 106.531365
2006 6.474744 56.368601 31.254505 106.064846
2007 6.523322 55.325088 32.936714 105.805654
2008 6.463806 54.496269 36.058097 106.149254
2009 6.472761 54.167910 38.762649 106.511194
2010 6.535361 56.802281 38.348099 106.349810
2011 6.486347 55.845018 36.706052 105.822878
2012 6.482540 56.376984 40.959167 105.892857
2013 6.550730 57.036496 38.999635 108.288321
2014 6.573541 56.996109 41.030856 107.856031
2015 6.578222 58.342222 46.654978 108.906667
2016 6.580851 58.438298 47.885617 109.200000
2017 6.585990 60.898551 50.880918 110.710145
2018 6.571658 58.572193 59.967005 110.770053


