# importing packages and setting matplotlib to draw inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

 
%matplotlib inline


# Read the TMDB export into a DataFrame, using the 'id' column as the index.
DATA_FILE = 'tmdb-movies.csv'
df = pd.read_csv(DATA_FILE, index_col='id')
df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10866 entries, 135397 to 22293
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               10856 non-null  object 
 1   popularity            10866 non-null  float64
 2   budget                10866 non-null  int64  
 3   revenue               10866 non-null  int64  
 4   original_title        10866 non-null  object 
 5   cast                  10790 non-null  object 
 6   homepage              2936 non-null   object 
 7   director              10822 non-null  object 
 8   tagline               8042 non-null   object 
 9   keywords              9373 non-null   object 
 10  overview              10862 non-null  object 
 11  runtime               10866 non-null  int64  
 12  genres                10843 non-null  object 
 13  production_companies  9836 non-null   object 
 14  release_date          10866 non-null  object 
 15  vote_count            10866 non-null  int64  
 16  vote_average          10866 non-null  float64
 17  release_year          10866 non-null  int64  
 18  budget_adj            10866 non-null  float64
 19  revenue_adj           10866 non-null  float64
dtypes: float64(4), int64(5), object(11)
memory usage: 1.7+ MB


# Columns that are not used anywhere in the analysis below.
unused_columns = [
    'tagline', 'homepage', 'keywords', 'overview', 'director', 'cast',
    'production_companies', 'budget', 'revenue',
]
df.drop(columns=unused_columns, inplace=True)


# Discard every row that still has a null in one of the remaining columns.
df.dropna(inplace=True)


# Performing a final check with info to see how many entries we are left with, and verifying all null values have been removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10835 entries, 135397 to 22293
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   imdb_id         10835 non-null  object 
 1   popularity      10835 non-null  float64
 2   original_title  10835 non-null  object 
 3   runtime         10835 non-null  int64  
 4   genres          10835 non-null  object 
 5   release_date    10835 non-null  object 
 6   vote_count      10835 non-null  int64  
 7   vote_average    10835 non-null  float64
 8   release_year    10835 non-null  int64  
 9   budget_adj      10835 non-null  float64
 10  revenue_adj     10835 non-null  float64
dtypes: float64(4), int64(3), object(4)
memory usage: 1015.8+ KB


# Count the rows whose values in every column repeat an earlier row.
df.duplicated().sum()

1


# Only one row is duplicated across all columns; next, count rows whose movie title alone repeats an earlier one.
df.duplicated(subset=['original_title']).sum()

295


# Investigate title and year together: list every row whose title occurs more than once,
# ordered so that rows sharing a title sit next to each other.
df_title_year = df[['original_title', 'release_year']]
shares_a_title = df_title_year.duplicated(subset=['original_title'], keep=False)
df_title_year_duplicates = df_title_year[shares_a_title]

(df_title_year_duplicates
 .sort_values(by=['original_title', 'release_year'])
 .head(20))


# Count rows that repeat an earlier (title, release year) pair.
df.duplicated(subset=['original_title', 'release_year']).sum()

4


# First remove rows that are identical in every column, then rows that repeat
# an already-seen (title, release year) pair.
# NOTE(review): matching title and year alone may not mean the same film - the
# duplicate listing shows two different ids titled "1" from 2013. Confirm that
# dropping on this key is intended.
title_year_key = ['original_title', 'release_year']
df.drop_duplicates(inplace=True)
df.drop_duplicates(subset=title_year_key, inplace=True)


df.describe()


# Treat a value of 0 in these columns as missing, so that it is left out of
# means and histograms instead of dragging them towards zero.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for column in ('budget_adj', 'revenue_adj', 'runtime'):
    df[column] = df[column].replace(0, np.nan)


df.describe()


df['genres'].head()

id
135397    Action|Adventure|Science Fiction|Thriller
76341     Action|Adventure|Science Fiction|Thriller
262500           Adventure|Science Fiction|Thriller
140607     Action|Adventure|Science Fiction|Fantasy
168259                        Action|Crime|Thriller
Name: genres, dtype: object


# Get list of genres: split each pipe-separated 'genres' string, flatten the
# pieces to one genre per element and keep every distinct name once.
# The explicit sort makes the (alphabetical) plotting order deterministic
# instead of relying on how pandas orders a combined index.
genre_list = np.array(sorted(df['genres'].str.split('|').explode().unique()))

# Get list of years, sorted from oldest to newest
year_list = np.sort(df['release_year'].unique())


def figure_resize(f, w, h):
    ''' Resizes a figure in place to width w and height h.

        Parameters:
        f (matplotlib.pyplot.figure): figure whose size is changed
        w (float): new width, passed to f.set_figwidth
        h (float): new height, passed to f.set_figheight

        Returns:
        N/A
    '''
    # Width is applied first, then height.
    for apply_dimension, value in ((f.set_figwidth, w), (f.set_figheight, h)):
        apply_dimension(value)
    
def set_color_palette(palette, n_colors, axis):
    ''' Makes an axis cycle through the colors of a seaborn palette.

        Parameters:
        palette (string): name of the seaborn palette to build
        n_colors (int): number of colors requested from the palette
        axis (matplotlib.pyplot.axis): axis (from subplots) whose color cycle is replaced

        Returns:
        N/A
    '''
    axis.set_prop_cycle('color', sns.color_palette(palette, n_colors))
    

def next_line_style():
    ''' Returns the next line style, starting over after the last entry of
        line_styles. Call as line style argument in plt.plot.

        The position is kept in the module-level line_style_iterator, so the
        sequence continues across plots unless that variable is reset.

        Parameters:
        N/A

        Returns: next line style as string ( line_styles[line_style_iterator] )

    '''
    global line_style_iterator
    # Wrap on the real length of line_styles rather than a hard-coded index,
    # so styles can be added to or removed from the list without touching this function.
    line_style_iterator = (line_style_iterator + 1) % len(line_styles)
    return line_styles[line_style_iterator]

line_styles = ['-', '--', '-.', ':']  # List of line styles to use for plt.plot
line_style_iterator = -1              # Index of the style returned by the latest next_line_style call.
                                      # ^ Starts at -1 so first function call returns line_styles[0]


#Get figure and axes for plot
fig, ax = plt.subplots()

# plot formatting
figure_resize(fig, 12, 8)
set_color_palette('husl', len(genre_list), ax)
plt.ylabel("Titles Released")
plt.title('Annual Titles per Genre', fontsize=15)


# A title can list several genres, so grouping on the raw 'genres' string would
# put each title under its exact genre combination only. Instead, each genre
# gets its own boolean mask and the matching titles are counted per year, so a
# title is counted once for every genre it lists.
# The per-year counts come from one groupby per genre; building the table row
# by row with DataFrame.append no longer works on pandas 2.0 and copies the
# whole frame on every row.

# Populate plot
for genre in genre_list:
    # regex=False: the genre name is matched as plain text
    in_genre = df['genres'].str.contains(genre, regex=False)
    titles_per_year = (df.loc[in_genre]
                         .groupby('release_year')['original_title']
                         .count()
                         # years with no title in this genre still get a point at 0
                         .reindex(year_list, fill_value=0))

    plt.plot(titles_per_year.index,
             titles_per_year.values,
             next_line_style(),
             label=genre)

plt.legend(loc='upper left',
           ncol=2,
           handleheight=2.5,
           labelspacing=0.05,
           frameon=False)

plt.show()


#Get figure and axes for plot
fig, ax = plt.subplots()

#plot formatting
figure_resize(fig, 12, 8)
set_color_palette('husl', len(genre_list), ax)
plt.ylabel("% Total Title Releases")
plt.title('Annual Share of Releases by Genre', fontsize=15)

# Total number of titles released each year. It is the same for every genre,
# so it is computed once instead of once per genre and year.
titles_per_year = (df.groupby('release_year')['original_title']
                     .count()
                     .reindex(year_list))

#Populate plot
for genre in genre_list:
    # regex=False: the genre name is matched as plain text
    in_genre = df['genres'].str.contains(genre, regex=False)
    genre_per_year = (df.loc[in_genre]
                        .groupby('release_year')['original_title']
                        .count()
                        # years with no title in this genre still get a point at 0
                        .reindex(year_list, fill_value=0))

    # fraction (0-1) of the year's titles that list this genre
    share = genre_per_year / titles_per_year

    plt.plot(share.index,
             share.values,
             next_line_style(),
             label=genre)


#Set axis scale: keep the x range, fix y to the 0-1 fraction range
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, 0, 1))

#Format y-axis as percent: pin the current tick positions, then label each
#fraction as a whole-number percentage
tick_positions = ax.get_yticks()
ax.set_yticks(tick_positions)
ax.set_yticklabels(['{:.0f}%'.format(n*100) for n in tick_positions])


plt.legend(loc='upper left',
           ncol=3,
           handleheight=2.4,
           labelspacing=0.05,
           frameon=False)

plt.show()


# Bar chart: mean adjusted budget of the titles listing each genre

fig, ax = plt.subplots()

#plot formatting
figure_resize(fig, 15, 5)
set_color_palette('husl', len(genre_list), ax)

#y-axis tick labels as whole-dollar amounts with thousands separators
ax.yaxis.set_major_formatter('${x:1,.0f}')

plt.title('Mean adjusted budget by genre', fontsize=15)
plt.xticks(rotation=25)

#one bar per genre
for genre in genre_list:
    in_genre = df['genres'].str.contains(genre)
    mean_budget = df.loc[in_genre, 'budget_adj'].mean()
    plt.bar(genre, mean_budget)
plt.show()


fig, ax = plt.subplots()

#plot formatting
figure_resize(fig, 12, 8)
# The data below is grouped over the whole DataFrame with no genre filter, so
# the title says "All Titles" (it previously claimed "Drama Titles").
# To plot a single genre, group df.loc[df['genres'].str.contains(<genre>)] instead.
plt.title('Annual Mean Revenue for All Titles', fontsize=15)

#formatting y-axis as dollar figures
ax.yaxis.set_major_formatter('${x:1,.0f}')


# mean adjusted revenue per release year, drawn as a line
df.groupby('release_year', as_index = True)['revenue_adj'].mean().plot()

plt.show()


fig, ax = plt.subplots()

#plot formatting
figure_resize(fig, 12, 8)
# The data below is grouped over the whole DataFrame with no genre filter, so
# the title says "All Titles" (it previously claimed "Drama Titles").
# To plot a single genre, group df.loc[df['genres'].str.contains(<genre>)] instead.
plt.title('Total Annual Revenue for All Titles', fontsize=15)

#formatting y-axis as dollar figures
ax.yaxis.set_major_formatter('${x:1,.0f}')

# summed adjusted revenue per release year, drawn as a line
df.groupby('release_year', as_index = True)['revenue_adj'].sum().plot()

plt.show()


# Overlaid histograms of adjusted budget, one per genre
fig, ax = plt.subplots()

#x-axis tick labels as whole-dollar amounts with thousands separators
ax.xaxis.set_major_formatter('${x:1,.0f}')

#plot formatting
figure_resize(fig, 15, 5)
set_color_palette('husl', len(genre_list), ax)
plt.title('Adjusted budget by genre', fontsize=15)
plt.xticks(rotation=25)

#semi-transparent so that overlapping genres stay visible
for genre in genre_list:
    in_genre = df['genres'].str.contains(genre)
    plt.hist(df.loc[in_genre, 'budget_adj'], label=genre, alpha=0.25, bins=100)

plt.legend(loc='upper right',
           ncol=5,
           handleheight=2.4,
           labelspacing=0.5,
           frameon=False)

plt.locator_params(axis='x', nbins=15)

#Set axis scale: keep the automatic y range, limit x to 0 - 250,000,000
x1, x2, y1, y2 = plt.axis()
plt.axis((0, 250000000, y1, y2))

plt.show()


# Overlaid histograms of adjusted revenue, one per genre
# (companion to the adjusted budget histogram)
fig, ax = plt.subplots()

#x-axis tick labels as whole-dollar amounts with thousands separators
ax.xaxis.set_major_formatter('${x:1,.0f}')

#plot formatting
figure_resize(fig, 15, 5)
set_color_palette('husl', len(genre_list), ax)
plt.title('Adjusted revenue by genre', fontsize=15)
plt.xticks(rotation=25)

#semi-transparent so that overlapping genres stay visible
for genre in genre_list:
    in_genre = df['genres'].str.contains(genre)
    plt.hist(df.loc[in_genre, 'revenue_adj'], label=genre, alpha=0.25, bins=100)

plt.legend(loc='upper right',
           ncol=5,
           handleheight=2.4,
           labelspacing=0.5,
           frameon=False)

plt.locator_params(axis='x', nbins=15)

#Set axis scale: keep the automatic y range, limit x to 0 - 700,000,000
x1, x2, y1, y2 = plt.axis()
plt.axis((0, 700000000, y1, y2))

plt.show()


# Overlaid histograms of adjusted profit (adjusted revenue minus adjusted budget), one per genre
fig, ax = plt.subplots()

#x-axis tick labels as whole-dollar amounts with thousands separators
ax.xaxis.set_major_formatter('${x:1,.0f}')

#plot formatting
figure_resize(fig, 15, 5)
set_color_palette('husl', len(genre_list), ax)
plt.title('Adjusted profit by genre', fontsize=15)
plt.xticks(rotation=25)

for genre in genre_list:
    in_genre = df['genres'].str.contains(genre)
    #per-title profit for the titles listing this genre
    to_plot = df.loc[in_genre, 'revenue_adj'] - df.loc[in_genre, 'budget_adj']
    plt.hist(to_plot, label=genre, alpha=0.25, bins=150)

plt.legend(loc='upper right',
           ncol=5,
           handleheight=2.4,
           labelspacing=0.5,
           frameon=False)

plt.locator_params(axis='x', nbins=15)

#Set axis scale: keep the automatic y range, limit x to -150,000,000 - 700,000,000
x1, x2, y1, y2 = plt.axis()
plt.axis((-150000000, 700000000, y1, y2))

plt.show()


# Bar chart: mean adjusted revenue of the titles listing each genre
fig, ax = plt.subplots()

#y-axis tick labels as whole-dollar amounts with thousands separators
ax.yaxis.set_major_formatter('${x:1,.0f}')

#plot formatting
figure_resize(fig, 15, 5)
set_color_palette('husl', len(genre_list), ax)
plt.title('Mean adjusted revenue by genre', fontsize=15)
plt.xticks(rotation=25)

#one bar per genre
for genre in genre_list:
    in_genre = df['genres'].str.contains(genre)
    mean_revenue = df.loc[in_genre, 'revenue_adj'].mean()
    plt.bar(genre, mean_revenue)
plt.show()


# Bar chart: mean adjusted profit (adjusted revenue minus adjusted budget) per genre
fig, ax = plt.subplots()
ax.yaxis.set_major_formatter('${x:1,.0f}')


#plot formatting
figure_resize(fig, 15, 5)
set_color_palette('husl', len(genre_list), ax)
plt.title('Mean adjusted profit by genre', fontsize=15)
plt.xticks(rotation = 25)

for genre in genre_list:
    in_genre = df['genres'].str.contains(genre)
    # Profit is computed per title and then averaged. A title with a missing
    # revenue or budget yields NaN and is skipped by mean(), so both figures
    # come from the same set of titles. Subtracting the two column means
    # instead would mix titles that report only one of the two figures.
    profit = df.loc[in_genre, 'revenue_adj'] - df.loc[in_genre, 'budget_adj']
    plt.bar(genre, profit.mean())
plt.show()


# Bar chart: profit margin per genre, as a percentage of revenue
fig, ax = plt.subplots()
ax.yaxis.set_major_formatter('{x:1,.0f}%')


#plot formatting
figure_resize(fig, 15, 5)
set_color_palette('husl', len(genre_list), ax)
plt.title('Mean margin by genre', fontsize=15)
plt.xticks(rotation = 25)

for genre in genre_list:
    in_genre = df['genres'].str.contains(genre)
    # Keep only titles with both revenue and budget known, so the two means
    # describe the same set of titles.
    finances = df.loc[in_genre, ['revenue_adj', 'budget_adj']].dropna()
    revenue = finances['revenue_adj'].mean()
    cost = finances['budget_adj'].mean()
    # share of revenue left after subtracting the budget, in percent
    margin = 100 * ((revenue - cost) / revenue)
    plt.bar(genre, margin)
plt.show()


# Line plot: adjusted revenue summed over all titles sharing a vote average
fig, ax = plt.subplots()

#y-axis tick labels as whole-dollar amounts with thousands separators
ax.yaxis.set_major_formatter('${x:1,.0f}')

#plot formatting
figure_resize(fig, 12, 8)
plt.title('Total revenue by vote average', fontsize=15)

revenue_by_vote = df.groupby('vote_average', as_index=True)['revenue_adj'].sum()
revenue_by_vote.plot()

plt.show()

	imdb_id	popularity	budget	revenue	original_title	cast	homepage	director	tagline	keywords	overview	runtime	genres	production_companies	release_date	vote_count	vote_average	release_year	budget_adj	revenue_adj
id
135397	tt0369610	32.985763	150000000	1513528810	Jurassic World	Chris Pratt\|Bryce Dallas Howard\|Irrfan Khan\|Vi...	http://www.jurassicworld.com/	Colin Trevorrow	The park is open.	monster\|dna\|tyrannosaurus rex\|velociraptor\|island	Twenty-two years after the events of Jurassic ...	124	Action\|Adventure\|Science Fiction\|Thriller	Universal Studios\|Amblin Entertainment\|Legenda...	6/9/15	5562	6.5	2015	1.379999e+08	1.392446e+09
76341	tt1392190	28.419936	150000000	378436354	Mad Max: Fury Road	Tom Hardy\|Charlize Theron\|Hugh Keays-Byrne\|Nic...	http://www.madmaxmovie.com/	George Miller	What a Lovely Day.	future\|chase\|post-apocalyptic\|dystopia\|australia	An apocalyptic story set in the furthest reach...	120	Action\|Adventure\|Science Fiction\|Thriller	Village Roadshow Pictures\|Kennedy Miller Produ...	5/13/15	6185	7.1	2015	1.379999e+08	3.481613e+08
262500	tt2908446	13.112507	110000000	295238201	Insurgent	Shailene Woodley\|Theo James\|Kate Winslet\|Ansel...	http://www.thedivergentseries.movie/#insurgent	Robert Schwentke	One Choice Can Destroy You	based on novel\|revolution\|dystopia\|sequel\|dyst...	Beatrice Prior must confront her inner demons ...	119	Adventure\|Science Fiction\|Thriller	Summit Entertainment\|Mandeville Films\|Red Wago...	3/18/15	2480	6.3	2015	1.012000e+08	2.716190e+08
140607	tt2488496	11.173104	200000000	2068178225	Star Wars: The Force Awakens	Harrison Ford\|Mark Hamill\|Carrie Fisher\|Adam D...	http://www.starwars.com/films/star-wars-episod...	J.J. Abrams	Every generation has a story.	android\|spaceship\|jedi\|space opera\|3d	Thirty years after defeating the Galactic Empi...	136	Action\|Adventure\|Science Fiction\|Fantasy	Lucasfilm\|Truenorth Productions\|Bad Robot	12/15/15	5292	7.5	2015	1.839999e+08	1.902723e+09
168259	tt2820852	9.335014	190000000	1506249360	Furious 7	Vin Diesel\|Paul Walker\|Jason Statham\|Michelle ...	http://www.furious7.com/	James Wan	Vengeance Hits Home	car race\|speed\|revenge\|suspense\|car	Deckard Shaw seeks revenge against Dominic Tor...	137	Action\|Crime\|Thriller	Universal Pictures\|Original Film\|Media Rights ...	4/1/15	2947	7.3	2015	1.747999e+08	1.385749e+09

	original_title	release_year
id
217316	1	2013
176068	1	2013
98622	9	2005
12244	9	2009
13189	A Christmas Carol	1984
16716	A Christmas Carol	1999
17979	A Christmas Carol	2009
377	A Nightmare on Elm Street	1984
23437	A Nightmare on Elm Street	2010
15598	Alfie	1966
8849	Alfie	2004
8217	Alice	1990
61872	Alice	2009
34573	Alice in Wonderland	1985
30923	Alice in Wonderland	1999
12155	Alice in Wonderland	2010
51992	And Soon the Darkness	1970
33107	And Soon the Darkness	2010
50512	Anna Karenina	1997
96724	Anna Karenina	2012

	popularity	runtime	vote_count	vote_average	release_year	budget_adj	revenue_adj
count	10831.000000	10831.000000	10831.000000	10831.000000	10831.000000	1.083100e+04	1.083100e+04
mean	0.647887	102.158711	218.006001	5.973170	2001.307820	1.759773e+07	5.152216e+07
std	1.001431	31.266176	576.442381	0.933902	12.817001	3.434265e+07	1.448369e+08
min	0.000065	0.000000	10.000000	1.500000	1960.000000	0.000000e+00	0.000000e+00
25%	0.208402	90.000000	17.000000	5.400000	1995.000000	0.000000e+00	0.000000e+00
50%	0.384763	99.000000	38.000000	6.000000	2006.000000	0.000000e+00	0.000000e+00
75%	0.715921	111.000000	146.000000	6.600000	2011.000000	2.094007e+07	3.389248e+07
max	32.985763	900.000000	9767.000000	9.200000	2015.000000	4.250000e+08	2.827124e+09

	popularity	runtime	vote_count	vote_average	release_year	budget_adj	revenue_adj
count	10831.000000	10801.000000	10831.000000	10831.000000	10831.000000	5.166000e+03	4.848000e+03
mean	0.647887	102.442459	218.006001	5.973170	2001.307820	3.689528e+07	1.151065e+08
std	1.001431	30.841834	576.442381	0.933902	12.817001	4.196287e+07	1.988758e+08
min	0.000065	2.000000	10.000000	1.500000	1960.000000	9.210911e-01	2.370705e+00
25%	0.208402	90.000000	17.000000	5.400000	1995.000000	8.102293e+06	1.046477e+07
50%	0.384763	99.000000	38.000000	6.000000	2006.000000	2.272271e+07	4.392749e+07
75%	0.715921	112.000000	146.000000	6.600000	2011.000000	5.007483e+07	1.316524e+08
max	32.985763	900.000000	9767.000000	9.200000	2015.000000	4.250000e+08	2.827124e+09

Investigating TMDB Movie Titles¶

Table of Contents¶

Introduction¶

Dataset Description¶

Questions for Analysis¶

Data Wrangling¶

General Properties¶

Data Cleaning¶

Exploratory Data Analysis¶

Question 1: How have audience tastes in different genres changed over time?¶

Question 2: What factors are associated with high-profit films?¶

First, let's take a look at some statistics on revenue and budget individually.¶

Comparing finances by genre¶

How are audience votes and revenue correlated?¶

Conclusions¶