-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcountry.py
127 lines (106 loc) · 5.55 KB
/
country.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#Analyse total number of movies released per country and continent
import pandas as pd
import matplotlib.pyplot as plt
#import numpy as np
import seaborn as sns
# Import the AWOC package.
import awoc
# Create the AWOC helper used to look up the countries of each continent.
my_world = awoc.AWOC()

# Fetch the countries list of every continent used in this analysis
# (all six of them, not only Europe).
(
    countries_of_africa,
    countries_of_south_america,
    countries_of_north_america,
    countries_of_asia,
    countries_of_europe,
    countries_of_oceania,
) = (
    my_world.get_countries_list_of(continent_name)
    for continent_name in (
        "Africa",
        "South America",
        "North America",
        "Asia",
        "Europe",
        "Oceania",
    )
)
# Read in the Netflix CSV as a DataFrame.
netflix_df = pd.read_csv("netflix_data.csv")

# Keep only the rows whose type is "Movie".
netflix_subset = netflix_df[netflix_df["type"] == "Movie"]

# --- Movies per country ----------------------------------------------------
# Drop the movies that have no country. The rows are filtered directly rather
# than merging the country column back onto the movies: a merge on a
# non-unique key yields n * n rows for a country with n movies, which would
# square every per-country count.
netflix_nat_year = netflix_subset.dropna(subset=["country"])

# Keep the columns of interest and derive the year each movie was added.
# .copy() makes an independent frame, so adding columns below does not write
# into a view of netflix_subset.
movies_col_select = netflix_nat_year[["country", "date_added"]].copy()
movies_col_select["date_added"] = pd.to_datetime(movies_col_select["date_added"])
movies_col_select["year_added"] = movies_col_select["date_added"].dt.year
movies_country_year = movies_col_select[["country", "year_added"]]

# Count the movies per country, largest count first. After reset_index() the
# frame has the columns "country" and "movie_count" and a 0..n-1 index.
movies_country_tot = (
    movies_country_year.groupby("country")
    .size()
    .rename("movie_count")
    .sort_values(ascending=False)
    .reset_index()
)

# --- Continent of each country ---------------------------------------------
keys_set = ['Africa', 'Asia', 'Europe', 'N_America', 'S_America', 'Oceania']
continent_members = dict(
    zip(
        keys_set,
        (
            countries_of_africa,
            countries_of_asia,
            countries_of_europe,
            countries_of_north_america,
            countries_of_south_america,
            countries_of_oceania,
        ),
    )
)

# Build a country -> continent lookup. setdefault keeps the first continent
# (in keys_set order) for a country listed under several continents.
country_to_continent = {}
for continent_key, member_countries in continent_members.items():
    for member_country in member_countries:
        country_to_continent.setdefault(member_country, continent_key)

# Label every country in one vectorised pass; countries found in no continent
# list get a missing value. This works for any number of countries.
movies_country_tot["continent"] = movies_country_tot["country"].map(
    country_to_continent
)

# Dictionary of continent -> countries that actually have movies in the data.
continents_country = {
    key: movies_country_tot.loc[
        movies_country_tot["continent"] == key, "country"
    ].tolist()
    for key in keys_set
}

# Report, then drop, the countries that matched no continent.
# NOTE(review): this replaces a drop of the hard-coded index labels 28, 63 and
# 67, which presumably were exactly these unmatched rows - confirm.
unmatched_countries = movies_country_tot[movies_country_tot["continent"].isna()]
if not unmatched_countries.empty:
    print(unmatched_countries[["country"]])
movies_country_tot = movies_country_tot.dropna(subset=["continent"])

# --- Totals and means per continent ----------------------------------------
movies_counts = (
    movies_country_tot.groupby("continent")["movie_count"]
    .agg(["sum", "mean"])
    .reset_index()
)
movies_counts["mean"] = movies_counts["mean"].round()
movies_counts = movies_counts.sort_values(by="sum", ascending=True)
print(movies_counts.head(4))

# Total movies per continent (log scale). The y-limits are derived from the
# data so that no bar is cut off, whatever the actual counts are.
sns.set(font_scale=1)
palette1 = sns.color_palette('deep', n_colors=movies_counts["sum"].nunique())
fig2 = sns.barplot(
    data=movies_counts, x="continent", y="sum", hue='sum', palette=palette1
)
fig2.set(yscale="log")
plt.xlim(-1, len(movies_counts))
plt.ylim(movies_counts["sum"].min() / 2, movies_counts["sum"].max() * 2)
fig2.set(xlabel="Continent", ylabel="Total sum of movies added per continent")
fig2.tick_params(labelsize=8)
plt.show()

# Mean movies count per continent (log scale), same data-driven limits.
palette2 = sns.color_palette(
    'colorblind', n_colors=movies_counts["mean"].nunique()
)
fig3 = sns.barplot(
    data=movies_counts, x="continent", y="mean", hue='mean', palette=palette2
)
fig3.set(yscale="log")
plt.xlim(-1, len(movies_counts))
plt.ylim(movies_counts["mean"].min() / 2, movies_counts["mean"].max() * 2)
fig3.set(xlabel="Continent", ylabel="Mean movies added per continent")
fig3.tick_params(labelsize=8)
plt.show()
# Remove the row labelled 0 (the USA row, per the original note) so that the
# remaining countries are plotted on a readable scale.
movies_country_tot = movies_country_tot.drop(index=[0])

sns.set(font_scale=0.85)
palette3 = sns.color_palette("muted", n_colors=6)

# One bar-chart facet per continent, two facets per row, coloured by continent.
catplot_options = dict(
    kind="bar",
    errorbar="ci",
    col="continent",
    col_wrap=2,
    hue="continent",
    palette=palette3,
)
fig4 = sns.catplot(
    data=movies_country_tot,
    x="movie_count",
    y="country",
    **catplot_options,
)
fig4.set(xscale="log")
plt.ylim(-1, 75)
fig4.set(ylabel="Country", xlabel="Total movies added")
fig4.tick_params(labelsize=6, rotation=75)
plt.show()