# Goal: plot the number of movies as a function of the year.
# Load the movies CSV file into a pandas DataFrame:
import pandas as pd
# Read the movies table; low_memory=False makes pandas parse the file in one
# pass instead of chunk by chunk.
movies = pd.read_csv(
    "../archive/Movie_Movies.csv",
    sep=',',
    low_memory=False,
)
# Preview the first five rows.
print(movies.head())
Awards Country DVD Director Language \ 0 NaN USA NaN Rose Cummings English 1 NaN USA NaN James Byrne NaN 2 NaN USA NaN Dimitri Buchowetzki NaN 3 NaN USA NaN Julia Hechler English 4 NaN Sri Lanka NaN Udara Siriruwan Sinhalese Plot Poster Production Rated \ 0 Rachel constantly hears her baby cry from the ... NaN NaN NaN 1 The struggle against unfortunate circumstances... NaN NaN NaN 2 NaN NaN NaN NaN 3 A Gift introduces Samuel Green, Washington Sta... NaN NaN NaN 4 NaN NaN NaN NaN Released Runtime Title Type Website Year imdbID \ 0 26 Apr 2012 20 min Baby's Breath movie NaN 2012 tt2268369 1 NaN 9 min Winter Trees movie NaN 2008 tt1560760 2 27 Mar 1926 50 min The Crown of Lies movie NaN 1926 tt0016750 3 27 May 2013 2 min A Gift movie NaN 2013 tt3405286 4 20 Mar 2014 23 min Journey movie NaN 2014 tt3816698 imdbRating imdbVotes 0 NaN NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 NaN NaN
# Check whether every row of the Type column is a movie.
# A boolean mask is used (rather than DataFrame.where, which keeps the full
# shape and only blanks out non-matching rows) so that the printed shape is
# the actual number of movie rows.
is_movie = movies['Type'] == 'movie'
check_type = movies[is_movie]
a = check_type['Type'].count()
print(check_type.shape)
# Collect the rows whose Type is anything other than 'movie'
# (rows with a missing Type end up here as well).
check_nonmovie_type = movies[~is_movie]
b = check_nonmovie_type['Type'].count()
print(check_nonmovie_type.shape)
# Keep only the movie rows. The explicit copy makes `movies` an independent
# DataFrame, so later column assignments do not act on a slice of the
# original frame (which would trigger SettingWithCopyWarning).
print(movies.shape)
movies = movies[is_movie].copy()
# Type is now constant, so the column carries no information: drop it.
movies = movies.drop(columns='Type')
print(movies.shape)
(178687, 18) (22, 18) (178687, 18) (178665, 17)
# Discard the columns that the rest of the analysis does not use.
unused_columns = ['DVD', 'Production', 'Rated', 'Released', 'Runtime']
movies.drop(columns=unused_columns, inplace=True)
# Preview the first five rows of the slimmed-down table.
print(movies.head(5))
Awards Country Director Language \ 0 NaN USA Rose Cummings English 1 NaN USA James Byrne NaN 2 NaN USA Dimitri Buchowetzki NaN 3 NaN USA Julia Hechler English 4 NaN Sri Lanka Udara Siriruwan Sinhalese Plot Poster \ 0 Rachel constantly hears her baby cry from the ... NaN 1 The struggle against unfortunate circumstances... NaN 2 NaN NaN 3 A Gift introduces Samuel Green, Washington Sta... NaN 4 NaN NaN Title Website Year imdbID imdbRating imdbVotes 0 Baby's Breath NaN 2012 tt2268369 NaN NaN 1 Winter Trees NaN 2008 tt1560760 NaN NaN 2 The Crown of Lies NaN 1926 tt0016750 NaN NaN 3 A Gift NaN 2013 tt3405286 NaN NaN 4 Journey NaN 2014 tt3816698 NaN NaN
# Turn Year into a numeric column; entries that cannot be parsed become NaN.
movies['Year'] = pd.to_numeric(movies['Year'], errors='coerce')
# Report the range of years covered by the database (min/max skip NaN).
first_year = int(movies['Year'].min(axis=0))
print(f"The year of the earliest movie of the database is: {first_year}")
last_year = int(movies['Year'].max(axis=0))
print(f"The year of the latest movie of the database is: {last_year}")
The year of the earliest movie of the database is: 1889 The year of the latest movie of the database is: 2023
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default theme to the matplotlib figures that follow.
sns.set()
fig1, ax1 = plt.subplots(figsize=(15, 7.5))
# One data point per year: the number of imdbID entries carrying that year.
movies_per_year = movies.groupby("Year")["imdbID"].count()
ax1.plot(movies_per_year)
# A tick every five years, rotated so the labels do not overlap.
plt.xticks(ticks=range(1880, 2030, 5), rotation=45);
ax1.set_xlabel('time [years]')
ax1.set_ylabel('number of movies');
%matplotlib inline
# If a movie lists several languages, keep only the first one (everything
# from the first comma onwards is removed).
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would silently leave the
# column unchanged.
movies['Language'] = movies['Language'].str.replace(r',.*', '', regex=True)
# Number of movies per (first) language, most frequent language first.
mylanguages = movies.groupby("Language")["imdbID"].count().sort_values(ascending=False)
# Keep the most frequent languages as individual slices and pool every
# remaining language into a single "Others" slice. The same boundary is used
# for both parts so that no language falls between the two.
top_n = 9
mylanguages2 = mylanguages.iloc[:top_n].copy()  # copy: a new entry is added below
mylanguages2['Others'] = mylanguages.iloc[top_n:].sum()
mylanguages2 = mylanguages2.sort_values(ascending=True)
fig2, ax2 = plt.subplots(figsize=(10, 10));
ax2.pie(mylanguages2, autopct='%1.1f%%', labels=mylanguages2.index.values,
        shadow=False, startangle=0, rotatelabels=True, textprops={'fontsize': 14});
ax2.axis('equal');  # Equal aspect ratio ensures that pie is drawn as a circle.
# Reduce the Awards text to the number of wins: everything from " win"
# onwards is cut off.
# NOTE(review): assumes entries look like "<N> win(s) ..." - confirm against the data.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave the text
# untouched and turn every row into NaN in the conversion below.
movies['Awards'] = movies['Awards'].str.replace(r' win.*', '', regex=True)
# Whatever is not a bare number after the cut (e.g. nomination-only entries)
# is coerced to NaN.
movies['Awards'] = pd.to_numeric(movies['Awards'], errors='coerce')
# Sum up the number of wins for each director. sum() adds the wins (NaN is
# skipped); count() would only count how many rows per director have a value.
myawards = movies.groupby("Director")["Awards"].sum().reset_index()
myawards.sort_values(by=['Awards'], ascending=False, inplace=True)
print(myawards.head(20))
Director Awards 87759 Tyler Perry 10 82609 Steven Soderbergh 9 79681 Shawn Levy 8 69279 Peter Segal 8 81137 Spike Lee 8 89815 Wes Craven 8 44443 Jon Turteltaub 8 29577 Garry Marshall 8 22078 Dennis Dugan 7 42621 Joel Schumacher 7 70980 Raja Gosnell 7 43891 John Singleton 7 85617 Tim Story 7 912 Adam Shankman 7 68311 Paul W.S. Anderson 7 90574 Wim Wenders 7 54725 M. Night Shyamalan 7 27308 F. Gary Gray 7 6032 Anne Wheeler 6 47236 Justin Lin 6
# Bar chart of the 25 directors at the top of the awards ranking.
fig3, ax3 = plt.subplots(figsize=(15, 7.5))
leading = myawards.iloc[0:25]
ax3.bar(leading["Director"], leading["Awards"])
ax3.set_title('Number of awards per director')
ax3.set_xlabel('Directors')
ax3.set_ylabel('Number of awards')
# Rotate the director names so that they stay readable.
plt.xticks(rotation=45);
# Count how many movies each director has in the database. The result is a
# one-column table indexed by director, largest count first; it is used to
# look up the directors at the top of the awards ranking (the original note
# picks the first 18 because ranks 9 to 18 were tied in the recorded output).
number_of_movies = (
    movies.groupby("Director")["imdbID"]
    .count()
    .to_frame()
    .sort_values(by='imdbID', ascending=False)
    .rename(columns={'imdbID': 'total_number_of_movies'})
)
print(number_of_movies.head(10))
total_number_of_movies Director Jim Powers 151 D.W. Griffith 101 Lewin Fitzhamon 95 Dave Fleischer 94 Al Christie 90 Georges Méliès 86 Gilbert M. 'Broncho Billy' Anderson 85 Quasarman 84 Kevin Dunn 83 James H. White 78
# Directors occupying the first 18 positions of the awards ranking.
ind_list = myawards["Director"].iloc[0:18]
# Look up their movie totals and keep the first ten rows of that lookup.
top10directors = number_of_movies.loc[ind_list].iloc[:10]
# Show them ordered by total number of movies, largest first.
by_total = top10directors.sort_values(by="total_number_of_movies", ascending=False)
print(by_total)
total_number_of_movies Director Spike Lee 28 Tyler Perry 22 Steven Soderbergh 21 Joel Schumacher 19 Wes Craven 15 Dennis Dugan 15 Garry Marshall 13 Shawn Levy 12 Peter Segal 11 Jon Turteltaub 10
# Read the genre table (one row per movie/genre pair).
genres = pd.read_csv(
    "../archive/Movie_Genres.csv",
    sep=',',
    low_memory=False,
)
# The file carries a superfluous index column; discard it.
genres = genres.drop(columns='Unnamed: 0')
# Clean the data: strip every space out of the genre names.
genres["Genre"] = genres["Genre"].str.replace(" ", "")
# Turn the imdbID into an integer by removing its "tt" marker.
genres["imdbID"] = (
    genres["imdbID"]
    .str.replace("tt", "")
    .astype(int)
)
print(genres.head())
Genre imdbID 0 Short 2268369 1 Drama 2268369 2 Short 1560760 3 Drama 1560760 4 Drama 16750
# Count the rows per genre (a movie listed under several genres is counted
# once for each of them) and order the genres from most to least frequent.
mygenres = genres.groupby("Genre")["imdbID"].count().reset_index()
mygenres.rename(columns={'imdbID': 'total_number_of_movies'}, inplace=True)
mygenres.sort_values(by="total_number_of_movies", ascending=False, inplace=True)
# The `cutoff` most frequent genres get their own pie slice; all remaining
# genres are pooled into one "Others" slice.
cutoff = 20
mygenres2 = mygenres.iloc[0:cutoff].reset_index(drop=True)
# The remainder starts exactly at position `cutoff`; starting one later would
# leave that genre out of both the individual slices and "Others".
others = mygenres['total_number_of_movies'].iloc[cutoff:].sum()
mygenres2.loc[cutoff] = ['Others', others]
fig4, ax4 = plt.subplots(figsize=(10, 10));
ax4.pie(mygenres2["total_number_of_movies"], autopct='%1.1f%%', labels=mygenres2["Genre"],
        shadow=False, startangle=0, rotatelabels=True, textprops={'fontsize': 14});
ax4.axis('equal');  # Equal aspect ratio ensures that pie is drawn as a circle.