-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathduration.py
120 lines (96 loc) · 5.01 KB
/
duration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Are movies getting shorter?
# Import pandas, matplotlib and seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Netflix catalogue from its CSV file into a DataFrame.
netflix_df = pd.read_csv("netflix_data.csv")
# Keep only the rows whose "type" column is exactly "Movie".
is_movie = netflix_df["type"] == "Movie"
netflix_subset = netflix_df[is_movie]
# Plan for the analysis:
# - work on movies only and keep just the columns of interest
#   (title, country, genre, release year and duration);
# - filter out movies shorter than an hour;
# - assign a colour to every genre and inspect the result.
columns_of_interest = ["title", "country", "genre", "release_year", "duration"]
netflix_movies = netflix_subset[columns_of_interest]
# First look at the data: scatter plot of duration against release year,
# before any filtering is applied.
fig0 = sns.scatterplot(
    x="release_year",
    y="duration",
    data=netflix_movies,
    color="purple",
)
fig0.set(
    xlabel="Release year",
    ylabel="Duration (min)",
    title="Movie duration by year of release",
)
fig0.tick_params(labelsize=8)
plt.show()
# Movies shorter than 60 minutes, kept aside in their own frame for inspection.
short_movies = netflix_movies[netflix_movies["duration"] < 60]
# Keep movies of at least 60 and less than 250 minutes; the upper bound
# excludes the very long outliers.
netflix_movies = netflix_movies[
    (netflix_movies["duration"] >= 60) & (netflix_movies["duration"] < 250)
]
# Filter out uncategorized movies.
# The rows are located by their genre value instead of a hard-coded list of
# row labels, so the filter keeps working when the CSV gains, loses or
# reorders rows (a stale label would make drop() raise KeyError).
uncategorized = netflix_movies[netflix_movies["genre"] == "Uncategorized"]
netflix_movies = netflix_movies.drop(index=uncategorized.index)
# Identify the unique genres and build a genre -> colour dictionary that
# covers every one of them.
genres = netflix_movies['genre'].unique()
c_olors = ['purple','darkorange','lawngreen','tomato','magenta','lime','red','olive','maroon','royalblue','darkmagenta','brown','orange','yellow','gold','forestgreen','grey','turquoise']
# Cycle through the colour list: zip(genres, c_olors) would silently leave
# out every genre beyond the number of colours listed above.
genre_color = {
    genre: c_olors[position % len(c_olors)]
    for position, genre in enumerate(genres)
}
print(genre_color)  # Check dictionary was built appropriately.
# One colour per row, aligned with the rows of netflix_movies. A single
# vectorised lookup replaces the row-by-genre nested loop.
gen_re = netflix_movies[['genre']]
colors = gen_re['genre'].map(genre_color).tolist()
print(colors[:10])  # Inspect the first 10 values in the list
# Scatter plot of duration versus release_year, coloured by genre.
# The genre -> colour dictionary is used as the palette so each genre gets
# its own colour; "tab10" only has 10 distinct colours and would repeat them.
palette = genre_color
fig = sns.scatterplot(
    data=netflix_movies,
    x='release_year',
    y='duration',
    hue="genre",
    palette=palette,
    legend="brief",
)
fig.set(xlabel="Release year", ylabel="Duration (min)")
fig.set(title="Movie Duration by Release Year")
fig.tick_params(labelsize=7)
plt.show()
# Are movies getting shorter?
# Look only at the mean duration per release year.
yearly_mean = netflix_movies.groupby('release_year')['duration'].mean()
movies_duration = yearly_mean.reset_index(name='mean')
# Discard the first row of the yearly table.
first_label = movies_duration.index[0]
movies_duration = movies_duration.drop(index=first_label)
# Two decimals are enough for the means.
movies_duration['mean'] = movies_duration['mean'].round(2)
# Line plot of the mean duration per release year, computed by seaborn from
# the individual movies, with a bootstrapped 95% confidence interval.
fig11 = sns.lineplot(
    x='release_year',
    y='duration',
    data=netflix_movies,
    estimator='mean',
    errorbar=('ci', 95),
    n_boot=1000,
)
fig11.set(
    xlabel="Release year",
    ylabel="Mean duration (min)",
    title="Mean movie duration by year of release",
)
fig11.tick_params(labelsize=7)
plt.show()
# Restrict to movies lasting between 60 and 180 minutes (both inclusive) and
# keep only the genre and duration columns.
duration_genre = netflix_movies[
    (netflix_movies['duration'] >= 60) & (netflix_movies['duration'] <= 180)
]
duration_genres = duration_genre[['genre', 'duration']]
# Group by genre and compute the mean duration of each genre.
# mean() takes no column name: the column is already selected, and a
# positional argument would be interpreted as the numeric_only flag.
mean_dur_gen = (
    duration_genres.groupby(['genre'])['duration']
    .mean()
    .reset_index(name='mean_duration')
)
mean_dur_gen['mean_duration'] = mean_dur_gen['mean_duration'].round(2)
# Order by descending mean duration, i.e. from the genres with the longest
# movies to those with the shortest.
mean_dur_gen = mean_dur_gen.sort_values(by='mean_duration', ascending=False)
# Scatter plot of the yearly mean durations held in movies_duration.
fig_m = sns.scatterplot(x='release_year', y='mean', data=movies_duration)
fig_m.set_xlabel("Release year")
fig_m.set_ylabel("Mean duration (min)")
fig_m.set_title("Mean movie duration by year of release")
fig_m.tick_params(labelsize=7)
plt.show()
# Bar plot of the mean duration of each genre, one coloured bar per genre,
# to look for possible relations between genre and duration.
fig12 = sns.barplot(
    x='genre',
    y='mean_duration',
    hue='genre',
    data=mean_dur_gen,
)
fig12.set_xlabel("Genre")
fig12.set_ylabel("Mean duration (min)")
fig12.set_title("Mean movie duration by genre")
# Small, rotated tick labels.
fig12.tick_params(labelsize=8, rotation=40)
plt.show()
# Line plot of the mean duration versus the year of release.
fig13 = sns.lineplot(
    data=movies_duration,
    x='release_year',
    y='mean',
    color='mediumvioletred',
)
# The x axis shows release_year, so it is labelled as such (not "Genre").
fig13.set(xlabel="Release year", ylabel="Mean duration (min)")
fig13.set(title="Mean movie duration by year")
fig13.tick_params(labelsize=8, rotation=40)
plt.show()
#heatmap: duration vs decade it was released
#duration_genres=sns.heatmap()
#Select top ten genres according to mean duration.
#top_means=mean_dur_gen.iloc[0:10,:]
#print(top_means)