-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenre.py
173 lines (141 loc) · 9.33 KB
/
genre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# What's the distribution of movie genres in the Netflix library?
# Importing packages for manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
#import numpy as np
import seaborn as sns
# Load the Netflix catalogue and keep only the rows whose type is "Movie".
netflix_df = pd.read_csv("netflix_data.csv")
netflix_subset = netflix_df[netflix_df["type"] == "Movie"]

# Select the columns of interest into an explicit copy, so the assignments
# below modify an independent frame instead of a slice of netflix_df
# (this is what used to raise the SettingWithCopy warning).
movies_col_select = netflix_subset[['country', 'date_added', 'genre']].copy()

# Parse the date strings once, then derive the year each movie was added.
movies_col_select['date_added'] = pd.to_datetime(movies_col_select['date_added'])
movies_col_select['year_added'] = movies_col_select['date_added'].dt.year

# Narrower frames used by the per-year and per-country analyses.
movies_country_genre_year = movies_col_select[['country', 'genre', 'year_added']]
movies_genre_year = movies_col_select[['genre', 'year_added']]
movies_country_genre = movies_col_select[['country', 'genre']]

# Count movies per (genre, year added). size() counts the rows in each group;
# groups come out sorted by their keys, and rows with a missing key are
# excluded by groupby's default dropna behaviour.
movies_g_y_sum = (
    movies_genre_year
    .groupby(["genre", "year_added"])
    .size()
    .reset_index(name='movies_per_genreyear')
)
#print(movies_g_y_sum)

# Count movies per (country, genre) in the same way.
movies_g_c_sum = (
    movies_country_genre
    .groupby(["country", "genre"])
    .size()
    .reset_index(name='movies_per_countrygenre')
)
#print(movies_g_c_sum)
# Stats based on the year in which movies were added.
# Set the 'Uncategorized' rows aside and total how many movies they cover.
uncategorized = movies_g_y_sum[movies_g_y_sum['genre'] == 'Uncategorized']
uncategorized_s = uncategorized['movies_per_genreyear'].sum()
#print(len(uncategorized))

# Keep the raw (un-aggregated) 'International Movies' rows in their own frame.
international_movies_g = movies_genre_year[movies_genre_year['genre'] == 'International Movies']

# Remove both genres from the aggregated table. Rows are selected by genre
# label rather than by hard-coded row numbers, so the cleaning stays correct
# if the CSV (and therefore the row positions) changes.
excluded_genres = ['Uncategorized', 'International Movies']
movies_genre_year = (
    movies_g_y_sum[~movies_g_y_sum['genre'].isin(excluded_genres)]
    .reset_index(drop=True)
)
#print(movies_genre_year['genre'].unique()) # <- 18 distinct genres after cleaning

# Visualize movies_per_genreyear.
# sns.set_palette() returns None, so build the palette with color_palette()
# and size it to the number of genres actually plotted.
palette = sns.color_palette('bright', n_colors=movies_genre_year['genre'].nunique())
fig7 = sns.scatterplot(
    data=movies_genre_year,
    x='year_added',
    y='movies_per_genreyear',
    hue='genre',
    palette=palette,
)
fig7.set(xlabel="Year", ylabel="Total movie count per year")
fig7.set(title="Movies of different genres added per year")
fig7.tick_params(labelsize=9)
plt.show()
# Keep only the (genre, year) rows for movies added from 2014 through 2020.
# Earlier experiments that filtered on the count threshold of 8 (the median
# reported by describe()) were immediately overwritten by this year filter,
# so only the filter that actually takes effect is kept here.
#print(round(movies_genre_year['movies_per_genreyear'].describe(),2))
added_2014_to_2020 = (
    (movies_genre_year['year_added'] >= 2014)
    & (movies_genre_year['year_added'] < 2021)
)
filtered_genre_year = movies_genre_year[added_2014_to_2020]
#print(len(filtered_genre_year['genre'].unique())) # <- 17 distinct genres after cleaning

# Visualize number of movies added per genre each year.
# sns.set_palette() returns None, so build the palette with color_palette()
# and size it to the number of genres actually plotted.
palette1 = sns.color_palette('colorblind', n_colors=filtered_genre_year['genre'].nunique())
fig8 = sns.barplot(
    data=filtered_genre_year,
    x='year_added',
    y='movies_per_genreyear',
    hue='genre',
    palette=palette1,
)
fig8.set(xlabel="Year", ylabel="Total movie count per year")
fig8.set(title="Total movies added per year and genre")
fig8.tick_params(labelsize=8)
plt.show()
# Total movies per genre over the filtered years, largest first.
genres_sum = (
    filtered_genre_year
    .groupby('genre')['movies_per_genreyear']
    .sum()
    .reset_index(name='sum')
    .sort_values(by='sum', ascending=False)
)
#print(genres_sum)
#print(filtered_genre_year)

# Which genre had the most movies added each year?
# Taking .max() over both columns computes each column's maximum
# independently, which pairs the highest count with the alphabetically last
# genre of that year. idxmax() instead locates the row holding the highest
# count, so genre and count come from the same row (first row wins on ties).
top_row_per_year = filtered_genre_year.groupby('year_added')['movies_per_genreyear'].idxmax()
pop_genre_year = (
    filtered_genre_year
    .loc[top_row_per_year, ['year_added', 'genre', 'movies_per_genreyear']]
    .set_index('year_added')
)
#print(pop_genre_year)

# Visualize the count of the leading genre for each year (log-scaled y-axis).
# sns.set_palette() returns None, so build the palette with color_palette()
# and size it to the number of genres actually plotted.
palette2 = sns.color_palette('deep', n_colors=pop_genre_year['genre'].nunique())
fig9 = sns.barplot(
    data=pop_genre_year.reset_index(),
    x='year_added',
    y='movies_per_genreyear',
    hue='genre',
    palette=palette2,
)
fig9.set(yscale="log")
fig9.set(xlabel="Year", ylabel="Genre with most added movies per year")
fig9.set(title="Max movies added per year")
fig9.tick_params(labelsize=8)
plt.show()
# What share of each genre's total movies was added in each year?
# Attach every genre's total ('sum') to its per-year rows, then express the
# per-year count as a percentage of that total.
genre_year_sum = filtered_genre_year.merge(genres_sum, on='genre', how='outer')
genre_year_sum['proportion'] = (
    genre_year_sum['movies_per_genreyear'] / genre_year_sum['sum'] * 100
).round(decimals=2)
genre_year_prop = genre_year_sum.drop(columns=['sum', 'movies_per_genreyear'])
#print(genre_year_prop)
#print(len(genre_year_prop['genre'].unique()))

# sns.set_palette() returns None, so build the palette with color_palette()
# and size it to the number of genres actually plotted.
palette = sns.color_palette('deep', n_colors=genre_year_prop['genre'].nunique())
fig10 = sns.relplot(
    data=genre_year_prop,
    x='year_added',
    y='proportion',
    hue='genre',
    size='proportion',
    sizes=(15, 225),
    palette=palette,
)
# The plotted value is relative to the genre's own total, not to all movies
# added in that year, so the axis label says so.
fig10.set(xlabel="Year", ylabel="Share of the genre's total movies added that year (%)")
# Limit the y-axis to 0-50; any point above 50 falls outside the visible range.
plt.ylim(0, 50)
fig10.set(title="Genre representation per year")
fig10.tick_params(labelsize=9)
plt.show()
# Keep only the (genre, year) rows whose proportion is at least 25,
# i.e. drop everything below 25.
#print(genre_year_prop.describe()) # 2Q=20 3Q=29
genre_year_prop_f = genre_year_prop[genre_year_prop['proportion'] >= 25].reset_index(drop=True)
#print(len(genre_year_prop_f['genre'].unique()))
#print(genre_year_prop_f['year_added'].unique())
#print(genre_year_prop_f.head())

# sns.set_palette() returns None, so build the palette with color_palette()
# and pass it to the plot explicitly instead of relying on global state.
palette3 = sns.color_palette('colorblind', n_colors=genre_year_prop_f['genre'].nunique())
# One bar-chart panel per year, two panels per row.
fig11 = sns.catplot(
    data=genre_year_prop_f,
    x='proportion',
    y='genre',
    col='year_added',
    col_wrap=2,
    hue='genre',
    kind='bar',
    palette=palette3,
)
# 'proportion' is relative to the genre's own total, not to all movies added
# in that year, so the axis label says so.
fig11.set(xlabel="Share of the genre's total movies added that year (%)", ylabel="Genre")
fig11.tick_params(labelsize=8)
plt.show()
# Stats based on the country of origin of the movies.
# Set the 'Uncategorized' rows aside and total how many movies they cover.
is_uncategorized_c = movies_g_c_sum['genre'] == 'Uncategorized'
u_ncategorized = movies_g_c_sum[is_uncategorized_c]
uncategorized_c = u_ncategorized['movies_per_countrygenre'].sum()
# Rows are removed by genre label rather than by hard-coded row numbers, so
# the cleaning stays correct if the CSV (and the row positions) changes.
movies_genre_country = movies_g_c_sum[~is_uncategorized_c]
#print(movies_genre_country[movies_genre_country['genre']=='International Movies'].describe()) --> 31 rows from various countries

# Separate international movies into their own frame and remove them from the main one.
is_international_c = movies_genre_country['genre'] == 'International Movies'
international_movies_c = movies_genre_country[is_international_c]
movies_genre_country = movies_genre_country[~is_international_c]
#print(movies_genre_country) --> check n of rows is 31 down.

# Count the (country, genre) rows per genre, i.e. in how many countries each
# genre occurs, and sort from most to least widespread.
tot_per_genre = (
    movies_genre_country
    .groupby('genre')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
#print(tot_per_genre)

# Size the palette to the number of genres plotted instead of a fixed 17.
fig4 = sns.barplot(
    data=tot_per_genre,
    x='count',
    y='genre',
    hue='genre',
    palette=sns.color_palette('colorblind', n_colors=len(tot_per_genre)),
)
fig4.set(xlabel="Number of countries", ylabel="Genre")
plt.show()
# Keep the genres whose 'count' in tot_per_genre is at least 30 and plot them.
is_popular = tot_per_genre['count'] >= 30
pop_genres = tot_per_genre[is_popular]
#print(pop_genres)
fig5 = sns.barplot(
    data=pop_genres,
    x='count',
    y='genre',
    hue='genre',
    palette=sns.color_palette('colorblind', n_colors=6),
)
fig5.set_xlabel("Number of countries")
fig5.set_ylabel("Genre")
plt.show()

# Pull the per-country rows of those genres (right join on pop_genres), drop
# the helper 'count' column and order by genre, descending.
merged_pop = movies_genre_country.merge(pop_genres, on='genre', how='right')
pop_genre_country = merged_pop.drop(columns='count')
pop_genre_sort = pop_genre_country.sort_values(by='genre', ascending=False)
#print(pop_genre_sort)

# Discard the country/genre pairs with fewer than 7 movies.
below_threshold = pop_genre_sort['movies_per_countrygenre'] < 7
less_than_avg_movies = pop_genre_sort[below_threshold]
list_less_than_avg_movies = less_than_avg_movies.index
pop_genre_sort = pop_genre_sort[~below_threshold]
#print(list_less_than_avg_movies) #--> 176 country/genre pairs!

# Sum the remaining movies per genre, smallest total first, and plot.
totals_by_genre = pop_genre_sort.groupby(['genre'])[['movies_per_countrygenre']].sum()
pop_genre_sum = totals_by_genre.sort_values(by='movies_per_countrygenre', ascending=True)
fig6 = sns.barplot(
    data=pop_genre_sum,
    x='genre',
    y='movies_per_countrygenre',
    hue='genre',
    palette=sns.color_palette('colorblind', n_colors=6),
)
fig6.set_xlabel("Most popular genres")
fig6.set_ylabel("Total number of movies")
fig6.set_title("Top 5 genres with max added movies")
fig6.tick_params(labelsize=8)
plt.show()
# Find whether there's a correlation between genre and country.
# Find whether there's a correlation between genre and continent.