Skip to content

An exploratory data analysis conducted on Hollywood movies released 2000 - 2018. Scraped, cleaned, and visualized.

Notifications You must be signed in to change notification settings

blazyy/data-analysis-imdb

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

8 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Scraper

# --- Notebook setup: scraping, wrangling, and plotting imports ---
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook  # Jupyter-friendly progress bars
import seaborn as sns
sns.set_style("whitegrid")  # default theme for every seaborn plot below
from matplotlib import pyplot as plt
import numpy as np
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# NOTE(review): the scraper below is disabled by wrapping it in a bare
# triple-quoted string (its value is discarded; the cleaned data is loaded
# from movies.csv instead). If it is ever re-enabled, review these points:
#   - `len(temp) is not 2` compares by identity, not equality; it should be
#     `len(temp) != 2` (emits a SyntaxWarning on Python 3.8+).
#   - `find_all()` returns a (possibly empty) list, never None, so the
#     `gross_unparsed is None` guard can never trigger.
#   - `.strip('$').strip('M')` and `.strip('min')` strip *characters*, not
#     prefixes/suffixes — fragile if IMDb's text format changes.
#   - `skip_movie` is assigned but never used.
'''
movie_names = []
movie_years = []
imdb_ratings = []
meta_ratings = []
movie_earnings = [] 
movie_genres = []
runtimes = []

pages = 10  # 10 pages per year.
start_year = 2000
end_year = 2018

for i in tqdm_notebook(range(start_year, end_year+1), total = end_year-start_year, unit = 'year'):
  k = 1
  for j in range(1, pages + 1):
    imdb_url = 'https://www.imdb.com/search/title?release_date=' + str(i) + '-01-01,' + str(i) + '-12-31&sort=num_votes,desc&start=' + str(k) + '&ref_=adv_nxt'
    page_unparsed = urllib.request.urlopen(imdb_url)
    page_parsed = BeautifulSoup(page_unparsed, 'html.parser')

    k += 50

    movie_divs = page_parsed.find_all('div', class_ = 'lister-item mode-advanced')

    for movie in movie_divs:
      skip_movie = False 

      meta_rating_unparsed = movie.find('div', class_ = 'inline-block ratings-metascore')
      gross_unparsed = movie.find_all('span', attrs = {'name' : 'nv'})

      if meta_rating_unparsed is None or gross_unparsed is None:
        continue
        
      temp = movie.find_all('span', attrs = {'name' : 'nv'})
      if len(temp) is not 2:
        continue
      else:
        movie_earnings.append(float(temp[1].string.strip('$').strip('M')))

      meta_ratings.append(int(meta_rating_unparsed.text.replace(" ", "").split("\n")[1]))
      movie_genres.append(movie.find('span', attrs = {'class' : 'genre'}).text.strip().split(",")[0])
      imdb_ratings.append(float(movie.find('div', class_ = 'inline-block ratings-imdb-rating').text))
      movie_names.append(movie.find('h3', class_ = 'lister-item-header').find('a').text)


      year = movie.find('span', class_ = 'lister-item-year text-muted unbold').text.split(" ")
      if len(year) == 1:
        movie_years.append(year[0][1:5])
      else:
        movie_years.append(year[1][1:5])

        
      runtimes.append(int(movie.find('span', class_ = 'runtime').text.strip('min')))
'''
'\nmovie_names = []\nmovie_years = []\nimdb_ratings = []\nmeta_ratings = []\nmovie_earnings = [] \nmovie_genres = []\nruntimes = []\n\npages = 10  # 10 pages per year.\nstart_year = 2000\nend_year = 2018\n\nfor i in tqdm_notebook(range(start_year, end_year+1), total = end_year-start_year, unit = \'year\'):\n  k = 1\n  for j in range(1, pages + 1):\n    imdb_url = \'https://www.imdb.com/search/title?release_date=\' + str(i) + \'-01-01,\' + str(i) + \'-12-31&sort=num_votes,desc&start=\' + str(k) + \'&ref_=adv_nxt\'\n    page_unparsed = urllib.request.urlopen(imdb_url)\n    page_parsed = BeautifulSoup(page_unparsed, \'html.parser\')\n\n    k += 50\n\n    movie_divs = page_parsed.find_all(\'div\', class_ = \'lister-item mode-advanced\')\n\n    for movie in movie_divs:\n      skip_movie = False \n\n      meta_rating_unparsed = movie.find(\'div\', class_ = \'inline-block ratings-metascore\')\n      gross_unparsed = movie.find_all(\'span\', attrs = {\'name\' : \'nv\'})\n\n      if meta_rating_unparsed is None or gross_unparsed is None:\n        continue\n        \n      temp = movie.find_all(\'span\', attrs = {\'name\' : \'nv\'})\n      if len(temp) is not 2:\n        continue\n      else:\n        movie_earnings.append(float(temp[1].string.strip(\'$\').strip(\'M\')))\n\n      meta_ratings.append(int(meta_rating_unparsed.text.replace(" ", "").split("\n")[1]))\n      movie_genres.append(movie.find(\'span\', attrs = {\'class\' : \'genre\'}).text.strip().split(",")[0])\n      imdb_ratings.append(float(movie.find(\'div\', class_ = \'inline-block ratings-imdb-rating\').text))\n      movie_names.append(movie.find(\'h3\', class_ = \'lister-item-header\').find(\'a\').text)\n\n\n      year = movie.find(\'span\', class_ = \'lister-item-year text-muted unbold\').text.split(" ")\n      if len(year) == 1:\n        movie_years.append(year[0][1:5])\n      else:\n        movie_years.append(year[1][1:5])\n\n        \n      runtimes.append(int(movie.find(\'span\', class_ = 
\'runtime\').text.strip(\'min\')))\n'
# The frame was originally built from the scraper's lists; kept for reference:
# movies = pd.DataFrame(list(zip(movie_names, movie_years, movie_genres, imdb_ratings, meta_ratings, movie_earnings, runtimes)), columns =['name', 'year', 'genre', 'imdb', 'meta', 'gross', 'runtime']) 
# Load the already-scraped-and-cleaned dataset instead of re-scraping.
movies = pd.read_csv("movies.csv")
# Drop the stray CSV index column and two known-bad rows (labels 170 and 1001).
movies = movies.drop(columns="Unnamed: 0")
movies = movies.drop(index=[170, 1001])
# movies.to_csv("movies.csv")
# files.download('movies.csv') 
movies.describe()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime
count 4877.000000 4877.000000 4877.000000 4877.000000
mean 6.517572 56.017839 37.897115 107.052696
std 0.971941 17.860327 68.181217 18.182487
min 1.500000 1.000000 0.000000 61.000000
25% 6.000000 43.000000 0.720000 95.000000
50% 6.600000 57.000000 11.440000 104.000000
75% 7.200000 69.000000 45.170000 116.000000
max 9.000000 100.000000 936.660000 366.000000
movies.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
name year genre imdb meta gross runtime
0 Gladiator 2000 Action 8.5 67 187.71 155
1 Memento 2000 Mystery 8.5 80 25.54 113
2 Snatch 2000 Comedy 8.3 55 30.33 102
3 Requiem for a Dream 2000 Drama 8.3 68 3.64 102
4 X-Men 2000 Action 7.4 64 157.30 104

Theme

# Enumerate the matplotlib style sheets available in this environment.
import matplotlib.style as style
style.available
['bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark-palette',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'seaborn',
 'Solarize_Light2',
 'tableau-colorblind10',
 '_classic_test']
# 'seaborn-poster' enlarges figures/fonts; matplotlib 3.6 renamed the
# seaborn styles to 'seaborn-v0_8-*', so fall back to the new name to keep
# the notebook working on current matplotlib releases.
try:
    style.use('seaborn-poster')  # sets the size of the charts
except OSError:
    style.use('seaborn-v0_8-poster')
style.use('ggplot')  # ggplot colors/grid layered on top

Palettes :

Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r

Rating

sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10})
<seaborn.axisgrid.FacetGrid at 0x24a65eb9080>

png

sns.kdeplot(movies['imdb'], movies['meta'], cmap = sns.cubehelix_palette(light = 1, as_cmap = True), shade = True)
<matplotlib.axes._subplots.AxesSubplot at 0x24a67f75cc0>

png

There is a high positive correlation between both the scoring metrics.

**Inference 1:** People and critics tend to have the same view on movies. Both have the same opinions on movies with a rating of 6 (or 60).

# Overlay both rating distributions on one axis; meta is divided by 10 so it
# shares IMDb's 0-10 scale.
fig, ax = plt.subplots()
sns.kdeplot(movies['imdb'], ax = ax)
sns.kdeplot(movies['meta']/10, ax = ax)
<matplotlib.axes._subplots.AxesSubplot at 0x24a6800f080>

png

**Inference 2:** Meta ratings follow a normal distribution while IMDb ratings tend to favour the 5 - 8 range.

# One Series of IMDb ratings per primary genre, assembled into a wide frame
# so each genre becomes its own violin.
genre_ratings = {genre: movies[movies['genre'] == genre]['imdb']
                 for genre in movies['genre'].unique()}

temp = pd.DataFrame(genre_ratings)

# Very wide figure so all genre labels stay readable.
fig, ax = plt.subplots(figsize=(100, 5))
sns.violinplot(temp)
plt.show()

png

**Inference 3:** Biography movies tend to have the highest ratings.

# Rating spread and outliers per genre.
fig, ax = plt.subplots(figsize=(21,8))
ax = sns.boxplot(x = 'genre', y = 'imdb', data = movies)
plt.show()

png

**Inference 4:** In the world of movies, there are a lot of outliers. I don't think a machine learning model could work well on datasets like this (to predict the likeability of a movie) — unless we have more attributes.

# Movie with the lowest IMDb rating.
# BUG FIX: idxmin() returns an index *label*, and rows were dropped above so
# labels no longer match positions — .iloc silently selected the wrong movie.
# Use .loc (as the runtime min/max cells below already do).
movies.loc[movies['imdb'].idxmin()]
name       Beyond the Lights
year                    2014
genre                  Drama
imdb                     6.9
meta                      73
gross                  14.62
runtime                  116
Name: 3993, dtype: object
# Movie with the highest IMDb rating.
# BUG FIX: idxmax() returns an index *label*, not a position; .iloc picked the
# wrong row once rows were dropped. Use label-based .loc.
movies.loc[movies['imdb'].idxmax()]
name       Iron Man
year           2008
genre        Action
imdb            7.9
meta             79
gross        318.41
runtime         126
Name: 2174, dtype: object
# Movie with the lowest meta rating.
# BUG FIX: idxmin() yields a label — pair it with .loc, not .iloc.
movies.loc[movies['meta'].idxmin()]
name       Welcome to Marwen
year                    2018
genre              Biography
imdb                       6
meta                      40
gross                  10.76
runtime                  116
Name: 4871, dtype: object
# Movie with the highest meta rating (comment fixed: it previously said "imdb").
# BUG FIX: idxmax() yields a label — pair it with .loc, not .iloc.
movies.loc[movies['meta'].idxmax()]
name       Maleficent
year             2014
genre          Action
imdb                7
meta               56
gross          241.41
runtime            97
Name: 3797, dtype: object

**Inference 5 - 8:**

  • IMDb:
    • Highest rated: The Dark Knight (2008). Score: 9
    • Lowest rated: Saving Christmas (2014). Score: 1.5
  • Metacritic:
    • Highest rated: Boyhood (2014). Score: 100
    • Lowest rated: Death of a Nation (2018). Score: 1

Gross

Each movie had anywhere from 1 to 3 genres. To simplify the process, I figured the first genre for each movie would be most accurate. Looking at the dataset, only the first 9 genres are the most abundant.

# Earlier exploratory alternatives, kept for reference:
# sns.pairplot(movies, hue = 'genre')
# sns.lmplot(x = 'imdb', y = 'meta', data = movies, hue = 'genre')
# Sample 1500 movies to keep the scatter readable; fix the seed so the same
# points are drawn on every notebook run (the plot was irreproducible before).
sns.lmplot(x = 'imdb', y = 'gross', data = movies.sample(1500, random_state = 0), scatter_kws={"s": 5})
<seaborn.axisgrid.FacetGrid at 0x24a6828e4a8>

png

**Inference 9:** No correlation between IMDb scores and movie box office. The same can be inferred for meta scores.

# By default, barplot shows the mean. The below graph shows that family
# movies had the highest mean gross per movie.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('mean gross per genre')
plt.show()

png

Hmmm. Notice the black line for family movies. It tells us that there is a high chance of it being erroneous.

movies['genre'].value_counts()
Comedy         1297
Drama          1041
Action          962
Crime           351
Biography       321
Animation       256
Adventure       242
Horror          180
Documentary     176
Mystery          15
Fantasy          14
Romance           8
Thriller          6
Musical           2
Family            2
Music             2
Sci-Fi            1
War               1
Name: genre, dtype: int64
movies[movies['genre'] == 'Family']
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
name year genre imdb meta gross runtime
1201 Raise Your Voice 2004 Family 5.9 33 10.41 103
4502 Beauty and the Beast 2017 Family 7.2 65 504.01 129
# Per-genre totals, plus gross earned per minute of total runtime ("ppm"),
# shown for the top five genres.
temp = movies.groupby('genre').sum().assign(ppm = lambda d: d['gross'] / d['runtime'])
temp.sort_values('ppm', ascending = False).head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime ppm
genre
Family 13.1 98 514.42 232 2.217328
Animation 1720.9 15449 24947.05 23922 1.042850
Action 6054.4 47835 67386.31 108042 0.623705
Adventure 1570.0 13489 16524.08 26933 0.613525
Mystery 98.8 832 949.80 1638 0.579853

**Inference 12:** Family movies had the best profit-per-movie value (514.42 million for 2 movies!) and earned a whopping 2.2 million dollars per minute of screentime. This amount was largely due to Beauty and the Beast.

Inference 13: The most profitable movie genres are Family, Animation, and Action.

# Total gross per genre (estimator=sum overrides barplot's default mean).
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, estimator = sum, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
plt.title('sum of grosses per genre')
plt.show()

png

**Inference 14:** Action movies had the most profits (67k million), followed by comedy movies (34k million) and then animation movies (25k million).

Genre

# Number of releases per genre.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.countplot(x = 'genre', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels() ,rotation = 30)
plt.title('number of movies per genre')
plt.show()

png

Inference 14: Comedy movies were the most frequently released, followed by drama and action movies.

# Mean IMDb rating per genre (barplot aggregates with mean by default).
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'imdb', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average imdb rating per genre')
plt.show()

png

**Inference 15:** For IMDb ratings, on average, horror movies got the lowest ratings while war, documentary, and musical movies seem to get the highest.

Inference 16: IMDb movies tend to approximately get the same ratings regardless of genre.

# Mean Metacritic rating per genre.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'meta', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average meta rating per genre')
plt.show()

png

Inference 17: For meta ratings, documentary movies get the highest average rating.

Inference 18: Mean ratings per genre on Metacritic tend to vary from genre to genre, unlike IMDb ratings.

Inference 19: Horror movies are the most disliked.

Inference 20: Documentary movies are the most liked.

# NOTE(review): duplicate of the IMDb-vs-meta KDE overlay plotted earlier.
fig, ax = plt.subplots()
sns.kdeplot(movies['imdb'], ax=ax)
sns.kdeplot(movies['meta']/10, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x24a6a8c40b8>

png

Runtime

# Mean runtime per genre.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'runtime', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average runtime per genre')
plt.show()

png

movies.groupby(['genre']).mean().sort_values('runtime').iloc[0]
imdb        7.20
meta       57.50
gross       0.08
runtime    90.50
Name: Musical, dtype: float64
movies.groupby(['genre']).mean().sort_values('runtime').iloc[-1]
imdb         7.036449
meta        63.781931
gross       25.985452
runtime    117.529595
Name: Biography, dtype: float64

Inference 21: Biography movies had the longest average duration. (117 mins). Musical movies had the lowest. (90 minutes)

movies.loc[movies['runtime'].idxmin()]
name       Ghosts of the Abyss
year                      2003
genre              Documentary
imdb                       6.9
meta                        67
gross                    17.09
runtime                     61
Name: 1020, dtype: object
movies.loc[movies['runtime'].idxmax()]
name       The Best of Youth
year                    2003
genre                  Drama
imdb                     8.5
meta                      89
gross                   0.25
runtime                  366
Name: 946, dtype: object

Inference 22: The longest movie was 'The Best of Youth', which ran for 366 minutes (about 6 hours). The shortest movie was 'Ghosts of the Abyss', which ran for 61 minutes (about 1 hour).

# Joint scatter with regression line for runtime vs. gross, plus marginals.
sns.jointplot(x = 'runtime', y = 'gross', data = movies,  kind = 'reg', scatter_kws={"s": 3})
plt.show()

png

Inference 23: Runtime and box office are slightly correlated (a weak correlation of roughly 0.3) — the longer a movie's duration, the more it tends to earn.

Year

movies.groupby('year').sum()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime
year
2000 1644.9 13595 7413.40 27113
2001 1728.3 14157 7956.77 28360
2002 1774.5 15066 9072.36 28557
2003 1705.4 14524 8771.08 28026
2004 1753.3 14558 9186.20 28528
2005 1748.4 14947 8807.64 28870
2006 1897.1 16516 9157.57 31077
2007 1846.1 15657 9321.09 29943
2008 1732.3 14605 9663.57 28448
2009 1734.7 14517 10388.39 28545
2010 1718.8 14939 10085.55 27970
2011 1757.8 15134 9947.34 28678
2012 1633.6 14207 10321.71 26685
2013 1794.9 15628 10685.90 29671
2014 1689.4 14648 10544.93 27719
2015 1480.1 13127 10497.37 24504
2016 1546.5 13733 11253.12 25662
2017 1363.3 12606 10532.35 22917
2018 1228.9 10953 11213.83 20714
# Mean gross per release year (the black lines are seaborn's confidence
# intervals around each bar's mean).
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'year', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average gross per year')
plt.show()

png

Hmm. The black bars indicate that there's a high chance of error. Guess this is what happens with seemingly unpredictable data like movies.

Inference 24: Average earning for movies increased as time went on.

# Number of movies per release year.
sns.countplot(x = 'year', data = movies, palette=("Blues_d"))
plt.xticks(rotation = 45)
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18]), <a list of 19 Text xticklabel objects>)

png

sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10}, hue = 'year')
<seaborn.axisgrid.FacetGrid at 0x24a6aab2b70>

png

movies.groupby('year').mean()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime
year
2000 6.475984 53.523622 29.186614 106.744094
2001 6.497368 53.221805 29.912669 106.616541
2002 6.476277 54.985401 33.110803 104.222628
2003 6.534100 55.647510 33.605670 107.379310
2004 6.566667 54.524345 34.405243 106.846442
2005 6.451661 55.154982 32.500517 106.531365
2006 6.474744 56.368601 31.254505 106.064846
2007 6.523322 55.325088 32.936714 105.805654
2008 6.463806 54.496269 36.058097 106.149254
2009 6.472761 54.167910 38.762649 106.511194
2010 6.535361 56.802281 38.348099 106.349810
2011 6.486347 55.845018 36.706052 105.822878
2012 6.482540 56.376984 40.959167 105.892857
2013 6.550730 57.036496 38.999635 108.288321
2014 6.573541 56.996109 41.030856 107.856031
2015 6.578222 58.342222 46.654978 108.906667
2016 6.580851 58.438298 47.885617 109.200000
2017 6.585990 60.898551 50.880918 110.710145
2018 6.571658 58.572193 59.967005 110.770053

About

An exploratory data analysis conducted on Hollywood movies released 2000 - 2018. Scraped, cleaned, and visualized.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published