Skip to content

An exploratory data analysis conducted on Hollywood movies released 2000 - 2018. Scraped, cleaned, and visualized.

Notifications You must be signed in to change notification settings

blazyy/data-analysis-imdb

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

8 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Scraper

# --- Notebook setup: scraping, wrangling, and plotting imports ---
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook  # Jupyter-friendly progress bars
import seaborn as sns
sns.set_style("whitegrid")  # default theme for every seaborn plot below
from matplotlib import pyplot as plt
import numpy as np
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# NOTE(review): the scraper below is disabled by wrapping it in a bare
# triple-quoted string (its value is discarded; the cleaned data is loaded
# from movies.csv instead). If it is ever re-enabled, review these points:
#   - `len(temp) is not 2` compares by identity, not equality; it should be
#     `len(temp) != 2` (emits a SyntaxWarning on Python 3.8+).
#   - `find_all()` returns a (possibly empty) list, never None, so the
#     `gross_unparsed is None` guard can never trigger.
#   - `.strip('$').strip('M')` and `.strip('min')` strip *characters*, not
#     prefixes/suffixes — fragile if IMDb's text format changes.
#   - `skip_movie` is assigned but never used.
'''
movie_names = []
movie_years = []
imdb_ratings = []
meta_ratings = []
movie_earnings = [] 
movie_genres = []
runtimes = []

pages = 10  # 10 pages per year.
start_year = 2000
end_year = 2018

for i in tqdm_notebook(range(start_year, end_year+1), total = end_year-start_year, unit = 'year'):
  k = 1
  for j in range(1, pages + 1):
    imdb_url = 'https://www.imdb.com/search/title?release_date=' + str(i) + '-01-01,' + str(i) + '-12-31&sort=num_votes,desc&start=' + str(k) + '&ref_=adv_nxt'
    page_unparsed = urllib.request.urlopen(imdb_url)
    page_parsed = BeautifulSoup(page_unparsed, 'html.parser')

    k += 50

    movie_divs = page_parsed.find_all('div', class_ = 'lister-item mode-advanced')

    for movie in movie_divs:
      skip_movie = False 

      meta_rating_unparsed = movie.find('div', class_ = 'inline-block ratings-metascore')
      gross_unparsed = movie.find_all('span', attrs = {'name' : 'nv'})

      if meta_rating_unparsed is None or gross_unparsed is None:
        continue
        
      temp = movie.find_all('span', attrs = {'name' : 'nv'})
      if len(temp) is not 2:
        continue
      else:
        movie_earnings.append(float(temp[1].string.strip('$').strip('M')))

      meta_ratings.append(int(meta_rating_unparsed.text.replace(" ", "").split("\n")[1]))
      movie_genres.append(movie.find('span', attrs = {'class' : 'genre'}).text.strip().split(",")[0])
      imdb_ratings.append(float(movie.find('div', class_ = 'inline-block ratings-imdb-rating').text))
      movie_names.append(movie.find('h3', class_ = 'lister-item-header').find('a').text)


      year = movie.find('span', class_ = 'lister-item-year text-muted unbold').text.split(" ")
      if len(year) == 1:
        movie_years.append(year[0][1:5])
      else:
        movie_years.append(year[1][1:5])

        
      runtimes.append(int(movie.find('span', class_ = 'runtime').text.strip('min')))
'''
'\nmovie_names = []\nmovie_years = []\nimdb_ratings = []\nmeta_ratings = []\nmovie_earnings = [] \nmovie_genres = []\nruntimes = []\n\npages = 10  # 10 pages per year.\nstart_year = 2000\nend_year = 2018\n\nfor i in tqdm_notebook(range(start_year, end_year+1), total = end_year-start_year, unit = \'year\'):\n  k = 1\n  for j in range(1, pages + 1):\n    imdb_url = \'https://www.imdb.com/search/title?release_date=\' + str(i) + \'-01-01,\' + str(i) + \'-12-31&sort=num_votes,desc&start=\' + str(k) + \'&ref_=adv_nxt\'\n    page_unparsed = urllib.request.urlopen(imdb_url)\n    page_parsed = BeautifulSoup(page_unparsed, \'html.parser\')\n\n    k += 50\n\n    movie_divs = page_parsed.find_all(\'div\', class_ = \'lister-item mode-advanced\')\n\n    for movie in movie_divs:\n      skip_movie = False \n\n      meta_rating_unparsed = movie.find(\'div\', class_ = \'inline-block ratings-metascore\')\n      gross_unparsed = movie.find_all(\'span\', attrs = {\'name\' : \'nv\'})\n\n      if meta_rating_unparsed is None or gross_unparsed is None:\n        continue\n        \n      temp = movie.find_all(\'span\', attrs = {\'name\' : \'nv\'})\n      if len(temp) is not 2:\n        continue\n      else:\n        movie_earnings.append(float(temp[1].string.strip(\'$\').strip(\'M\')))\n\n      meta_ratings.append(int(meta_rating_unparsed.text.replace(" ", "").split("\n")[1]))\n      movie_genres.append(movie.find(\'span\', attrs = {\'class\' : \'genre\'}).text.strip().split(",")[0])\n      imdb_ratings.append(float(movie.find(\'div\', class_ = \'inline-block ratings-imdb-rating\').text))\n      movie_names.append(movie.find(\'h3\', class_ = \'lister-item-header\').find(\'a\').text)\n\n\n      year = movie.find(\'span\', class_ = \'lister-item-year text-muted unbold\').text.split(" ")\n      if len(year) == 1:\n        movie_years.append(year[0][1:5])\n      else:\n        movie_years.append(year[1][1:5])\n\n        \n      runtimes.append(int(movie.find(\'span\', class_ = 
\'runtime\').text.strip(\'min\')))\n'
# The frame was originally built from the scraper's lists; kept for reference:
# movies = pd.DataFrame(list(zip(movie_names, movie_years, movie_genres, imdb_ratings, meta_ratings, movie_earnings, runtimes)), columns =['name', 'year', 'genre', 'imdb', 'meta', 'gross', 'runtime']) 
# Load the already-scraped-and-cleaned dataset instead of re-scraping.
movies = pd.read_csv("movies.csv")
# Drop the stray CSV index column and two known-bad rows (labels 170 and 1001).
movies = movies.drop(columns="Unnamed: 0")
movies = movies.drop(index=[170, 1001])
# movies.to_csv("movies.csv")
# files.download('movies.csv') 
movies.describe()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime
count 4877.000000 4877.000000 4877.000000 4877.000000
mean 6.517572 56.017839 37.897115 107.052696
std 0.971941 17.860327 68.181217 18.182487
min 1.500000 1.000000 0.000000 61.000000
25% 6.000000 43.000000 0.720000 95.000000
50% 6.600000 57.000000 11.440000 104.000000
75% 7.200000 69.000000 45.170000 116.000000
max 9.000000 100.000000 936.660000 366.000000
movies.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
name year genre imdb meta gross runtime
0 Gladiator 2000 Action 8.5 67 187.71 155
1 Memento 2000 Mystery 8.5 80 25.54 113
2 Snatch 2000 Comedy 8.3 55 30.33 102
3 Requiem for a Dream 2000 Drama 8.3 68 3.64 102
4 X-Men 2000 Action 7.4 64 157.30 104

Theme

# Enumerate the matplotlib style sheets available in this environment.
import matplotlib.style as style
style.available
['bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark-palette',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'seaborn',
 'Solarize_Light2',
 'tableau-colorblind10',
 '_classic_test']
# 'seaborn-poster' enlarges figures/fonts; matplotlib 3.6 renamed the
# seaborn styles to 'seaborn-v0_8-*', so fall back to the new name to keep
# the notebook working on current matplotlib releases.
try:
    style.use('seaborn-poster')  # sets the size of the charts
except OSError:
    style.use('seaborn-v0_8-poster')
style.use('ggplot')  # ggplot colors/grid layered on top

Palettes :

Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r

Rating

sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10})
<seaborn.axisgrid.FacetGrid at 0x24a65eb9080>

png

sns.kdeplot(movies['imdb'], movies['meta'], cmap = sns.cubehelix_palette(light = 1, as_cmap = True), shade = True)
<matplotlib.axes._subplots.AxesSubplot at 0x24a67f75cc0>

png

There is a high positive correlation between both the scoring metrics.

**Inference 1:** People and critics tend to have the same view on movies. Both have the same opinions on movies with a rating of 6 (or 60).

# Overlay both rating distributions on one axis; meta is divided by 10 so it
# shares IMDb's 0-10 scale.
fig, ax = plt.subplots()
sns.kdeplot(movies['imdb'], ax = ax)
sns.kdeplot(movies['meta']/10, ax = ax)
<matplotlib.axes._subplots.AxesSubplot at 0x24a6800f080>

png

**Inference 2:** Meta ratings follow a normal distribution while IMDb ratings tend to favour the 5 - 8 range.

# One Series of IMDb ratings per primary genre, assembled into a wide frame
# so each genre becomes its own violin.
genre_ratings = {genre: movies[movies['genre'] == genre]['imdb']
                 for genre in movies['genre'].unique()}

temp = pd.DataFrame(genre_ratings)

# Very wide figure so all genre labels stay readable.
fig, ax = plt.subplots(figsize=(100, 5))
sns.violinplot(temp)
plt.show()

png

**Inference 3:** Biography movies tend to have the highest ratings.

# Rating spread and outliers per genre.
fig, ax = plt.subplots(figsize=(21,8))
ax = sns.boxplot(x = 'genre', y = 'imdb', data = movies)
plt.show()

png

**Inference 4:** In the world of movies, there are a lot of outliers. I don't think a machine learning model could work well on datasets like this (to predict the likeability of a movie) — unless we have more attributes.

# Movie with the lowest IMDb rating.
# BUG FIX: idxmin() returns an index *label*, and rows were dropped above so
# labels no longer match positions — .iloc silently selected the wrong movie.
# Use .loc (as the runtime min/max cells below already do).
movies.loc[movies['imdb'].idxmin()]
name       Beyond the Lights
year                    2014
genre                  Drama
imdb                     6.9
meta                      73
gross                  14.62
runtime                  116
Name: 3993, dtype: object
# Movie with the highest IMDb rating.
# BUG FIX: idxmax() returns an index *label*, not a position; .iloc picked the
# wrong row once rows were dropped. Use label-based .loc.
movies.loc[movies['imdb'].idxmax()]
name       Iron Man
year           2008
genre        Action
imdb            7.9
meta             79
gross        318.41
runtime         126
Name: 2174, dtype: object
# Movie with the lowest meta rating.
# BUG FIX: idxmin() yields a label — pair it with .loc, not .iloc.
movies.loc[movies['meta'].idxmin()]
name       Welcome to Marwen
year                    2018
genre              Biography
imdb                       6
meta                      40
gross                  10.76
runtime                  116
Name: 4871, dtype: object
# Movie with the highest meta rating (comment fixed: it previously said "imdb").
# BUG FIX: idxmax() yields a label — pair it with .loc, not .iloc.
movies.loc[movies['meta'].idxmax()]
name       Maleficent
year             2014
genre          Action
imdb                7
meta               56
gross          241.41
runtime            97
Name: 3797, dtype: object

**Inference 5 - 8:**

  • IMDb:
    • Highest rated: The Dark Knight (2008). Score: 9
    • Lowest rated: Saving Christmas (2014). Score: 1.5
  • Metacritic:
    • Highest rated: Boyhood (2014). Score: 100
    • Lowest rated: Death of a Nation (2018). Score: 1

Gross

Each movie had anywhere from 1 to 3 genres. To simplify the process, I figured the first genre for each movie would be most accurate. Looking at the dataset, only the first 9 genres are the most abundant.

# Earlier exploratory alternatives, kept for reference:
# sns.pairplot(movies, hue = 'genre')
# sns.lmplot(x = 'imdb', y = 'meta', data = movies, hue = 'genre')
# Sample 1500 movies to keep the scatter readable; fix the seed so the same
# points are drawn on every notebook run (the plot was irreproducible before).
sns.lmplot(x = 'imdb', y = 'gross', data = movies.sample(1500, random_state = 0), scatter_kws={"s": 5})
<seaborn.axisgrid.FacetGrid at 0x24a6828e4a8>

png

**Inference 9:** No correlation between IMDb scores and movie box office. The same can be inferred for meta scores.

# By default, barplot shows the mean. The below graph shows that family
# movies had the highest mean gross per movie.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('mean gross per genre')
plt.show()

png

Hmmm. Notice the black line for family movies. It tells us that there is a high chance of it being erroneous.

movies['genre'].value_counts()
Comedy         1297
Drama          1041
Action          962
Crime           351
Biography       321
Animation       256
Adventure       242
Horror          180
Documentary     176
Mystery          15
Fantasy          14
Romance           8
Thriller          6
Musical           2
Family            2
Music             2
Sci-Fi            1
War               1
Name: genre, dtype: int64
movies[movies['genre'] == 'Family']
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
name year genre imdb meta gross runtime
1201 Raise Your Voice 2004 Family 5.9 33 10.41 103
4502 Beauty and the Beast 2017 Family 7.2 65 504.01 129
# Per-genre totals, plus gross earned per minute of total runtime ("ppm"),
# shown for the top five genres.
temp = movies.groupby('genre').sum().assign(ppm = lambda d: d['gross'] / d['runtime'])
temp.sort_values('ppm', ascending = False).head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime ppm
genre
Family 13.1 98 514.42 232 2.217328
Animation 1720.9 15449 24947.05 23922 1.042850
Action 6054.4 47835 67386.31 108042 0.623705
Adventure 1570.0 13489 16524.08 26933 0.613525
Mystery 98.8 832 949.80 1638 0.579853

**Inference 12:** Family movies had the best profit-per-movie value (514.42 million for 2 movies!) and earned a whopping 2.2 million dollars per minute of screentime. This amount was largely due to Beauty and the Beast.

Inference 13: The most profitable movie genres are Family, Animation, and Action.

# Total gross per genre (estimator=sum overrides barplot's default mean).
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, estimator = sum, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
plt.title('sum of grosses per genre')
plt.show()

png

**Inference 14:** Action movies had the most profits (67k million), followed by comedy movies (34k million) and then animation movies (25k million).

Genre

# Number of releases per genre.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.countplot(x = 'genre', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels() ,rotation = 30)
plt.title('number of movies per genre')
plt.show()

png

Inference 14: Comedy movies were the most frequently released, followed by drama and action movies.

# Mean IMDb rating per genre (barplot aggregates with mean by default).
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'imdb', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average imdb rating per genre')
plt.show()

png

**Inference 15:** For IMDb ratings, on average, horror movies got the lowest ratings while war, documentary, and musical movies seem to get the highest.

Inference 16: IMDb movies tend to approximately get the same ratings regardless of genre.

# Mean Metacritic rating per genre.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'meta', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average meta rating per genre')
plt.show()

png

Inference 17: For meta ratings, documentary movies get the highest average rating.

Inference 18: Mean ratings per genre on Metacritic tend to vary from genre to genre, unlike IMDb ratings.

Inference 19: Horror movies are the most disliked.

Inference 20: Documentary movies are the most liked.

# NOTE(review): duplicate of the IMDb-vs-meta KDE overlay plotted earlier.
fig, ax = plt.subplots()
sns.kdeplot(movies['imdb'], ax=ax)
sns.kdeplot(movies['meta']/10, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x24a6a8c40b8>

png

Runtime

# Mean runtime per genre.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'runtime', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average runtime per genre')
plt.show()

png

movies.groupby(['genre']).mean().sort_values('runtime').iloc[0]
imdb        7.20
meta       57.50
gross       0.08
runtime    90.50
Name: Musical, dtype: float64
movies.groupby(['genre']).mean().sort_values('runtime').iloc[-1]
imdb         7.036449
meta        63.781931
gross       25.985452
runtime    117.529595
Name: Biography, dtype: float64

Inference 21: Biography movies had the longest average duration. (117 mins). Musical movies had the lowest. (90 minutes)

movies.loc[movies['runtime'].idxmin()]
name       Ghosts of the Abyss
year                      2003
genre              Documentary
imdb                       6.9
meta                        67
gross                    17.09
runtime                     61
Name: 1020, dtype: object
movies.loc[movies['runtime'].idxmax()]
name       The Best of Youth
year                    2003
genre                  Drama
imdb                     8.5
meta                      89
gross                   0.25
runtime                  366
Name: 946, dtype: object

Inference 22: The longest movie was 'The Best of Youth', which ran for 366 minutes (about 6 hours). The shortest movie was 'Ghosts of the Abyss', which ran for 61 minutes (about 1 hour).

# Joint scatter with regression line for runtime vs. gross, plus marginals.
sns.jointplot(x = 'runtime', y = 'gross', data = movies,  kind = 'reg', scatter_kws={"s": 3})
plt.show()

png

Inference 23: Runtime and box office are slightly correlated (a weak correlation of roughly 0.3) — the longer a movie's duration, the more it tends to earn.

Year

movies.groupby('year').sum()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime
year
2000 1644.9 13595 7413.40 27113
2001 1728.3 14157 7956.77 28360
2002 1774.5 15066 9072.36 28557
2003 1705.4 14524 8771.08 28026
2004 1753.3 14558 9186.20 28528
2005 1748.4 14947 8807.64 28870
2006 1897.1 16516 9157.57 31077
2007 1846.1 15657 9321.09 29943
2008 1732.3 14605 9663.57 28448
2009 1734.7 14517 10388.39 28545
2010 1718.8 14939 10085.55 27970
2011 1757.8 15134 9947.34 28678
2012 1633.6 14207 10321.71 26685
2013 1794.9 15628 10685.90 29671
2014 1689.4 14648 10544.93 27719
2015 1480.1 13127 10497.37 24504
2016 1546.5 13733 11253.12 25662
2017 1363.3 12606 10532.35 22917
2018 1228.9 10953 11213.83 20714
# Mean gross per release year (the black lines are seaborn's confidence
# intervals around each bar's mean).
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'year', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average gross per year')
plt.show()

png

Hmm. The black bars indicate that there's a high chance of error. Guess this is what happens with seemingly unpredictable data like movies.

Inference 24: Average earning for movies increased as time went on.

# Number of movies per release year.
sns.countplot(x = 'year', data = movies, palette=("Blues_d"))
plt.xticks(rotation = 45)
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18]), <a list of 19 Text xticklabel objects>)

png

sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10}, hue = 'year')
<seaborn.axisgrid.FacetGrid at 0x24a6aab2b70>

png

movies.groupby('year').mean()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
imdb meta gross runtime
year
2000 6.475984 53.523622 29.186614 106.744094
2001 6.497368 53.221805 29.912669 106.616541
2002 6.476277 54.985401 33.110803 104.222628
2003 6.534100 55.647510 33.605670 107.379310
2004 6.566667 54.524345 34.405243 106.846442
2005 6.451661 55.154982 32.500517 106.531365
2006 6.474744 56.368601 31.254505 106.064846
2007 6.523322 55.325088 32.936714 105.805654
2008 6.463806 54.496269 36.058097 106.149254
2009 6.472761 54.167910 38.762649 106.511194
2010 6.535361 56.802281 38.348099 106.349810
2011 6.486347 55.845018 36.706052 105.822878
2012 6.482540 56.376984 40.959167 105.892857
2013 6.550730 57.036496 38.999635 108.288321
2014 6.573541 56.996109 41.030856 107.856031
2015 6.578222 58.342222 46.654978 108.906667
2016 6.580851 58.438298 47.885617 109.200000
2017 6.585990 60.898551 50.880918 110.710145
2018 6.571658 58.572193 59.967005 110.770053

About

An exploratory data analysis conducted on Hollywood movies released 2000 - 2018. Scraped, cleaned, and visualized.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published