Data Analysis with IMDB dataset

IMDB

Introduction

We will analyize IMBD movies dataset and try to find relations between different columns in this dataset such as runtime of movies and their budget and how they relate to popularity of movies.

In [18]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast
In [2]:
df = pd.read_csv(r'C:\Users\Jarnail\Desktop\IMDB Dataset\tmdb-movie-metadata\tmdb_5000_movies.csv')
In [3]:
df.head().T
Out[3]:
0 1 2 3 4
budget 237000000 300000000 245000000 250000000 260000000
genres [{"id": 28, "name": "Action"}, {"id": 12, "nam... [{"id": 12, "name": "Adventure"}, {"id": 14, "... [{"id": 28, "name": "Action"}, {"id": 12, "nam... [{"id": 28, "name": "Action"}, {"id": 80, "nam... [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage http://www.avatarmovie.com/ http://disney.go.com/disneypictures/pirates/ http://www.sonypictures.com/movies/spectre/ http://www.thedarkknightrises.com/ http://movies.disney.com/john-carter
id 19995 285 206647 49026 49529
keywords [{"id": 1463, "name": "culture clash"}, {"id":... [{"id": 270, "name": "ocean"}, {"id": 726, "na... [{"id": 470, "name": "spy"}, {"id": 818, "name... [{"id": 849, "name": "dc comics"}, {"id": 853,... [{"id": 818, "name": "based on novel"}, {"id":...
original_language en en en en en
original_title Avatar Pirates of the Caribbean: At World's End Spectre The Dark Knight Rises John Carter
overview In the 22nd century, a paraplegic Marine is di... Captain Barbossa, long believed to be dead, ha... A cryptic message from Bond’s past sends him o... Following the death of District Attorney Harve... John Carter is a war-weary, former military ca...
popularity 150.438 139.083 107.377 112.313 43.927
production_companies [{"name": "Ingenious Film Partners", "id": 289... [{"name": "Walt Disney Pictures", "id": 2}, {"... [{"name": "Columbia Pictures", "id": 5}, {"nam... [{"name": "Legendary Pictures", "id": 923}, {"... [{"name": "Walt Disney Pictures", "id": 2}]
production_countries [{"iso_3166_1": "US", "name": "United States o... [{"iso_3166_1": "US", "name": "United States o... [{"iso_3166_1": "GB", "name": "United Kingdom"... [{"iso_3166_1": "US", "name": "United States o... [{"iso_3166_1": "US", "name": "United States o...
release_date 2009-12-10 2007-05-19 2015-10-26 2012-07-16 2012-03-07
revenue 2787965087 961000000 880674609 1084939099 284139100
runtime 162 169 148 165 132
spoken_languages [{"iso_639_1": "en", "name": "English"}, {"iso... [{"iso_639_1": "en", "name": "English"}] [{"iso_639_1": "fr", "name": "Fran\u00e7ais"},... [{"iso_639_1": "en", "name": "English"}] [{"iso_639_1": "en", "name": "English"}]
status Released Released Released Released Released
tagline Enter the World of Pandora. At the end of the world, the adventure begins. A Plan No One Escapes The Legend Ends Lost in our world, found in another.
title Avatar Pirates of the Caribbean: At World's End Spectre The Dark Knight Rises John Carter
vote_average 7.2 6.9 6.3 7.6 6.1
vote_count 11800 4500 4466 9106 2124

Data Cleansing

In [4]:
print("entries from original dataset (rows, columns):", df.shape) # Get number of rows
df = df.drop_duplicates(['original_title'])
df_clean = df[['budget', 'genres', 'release_date', 'revenue']].dropna()
print("entries from cleaned dataset  (rows, columns):",df_clean.shape) # Get number of rows
df_clean.head().T
entries from original dataset (rows, columns): (4803, 20)
entries from cleaned dataset  (rows, columns): (4800, 4)
Out[4]:
0 1 2 3 4
budget 237000000 300000000 245000000 250000000 260000000
genres [{"id": 28, "name": "Action"}, {"id": 12, "nam... [{"id": 12, "name": "Adventure"}, {"id": 14, "... [{"id": 28, "name": "Action"}, {"id": 12, "nam... [{"id": 28, "name": "Action"}, {"id": 80, "nam... [{"id": 28, "name": "Action"}, {"id": 12, "nam...
release_date 2009-12-10 2007-05-19 2015-10-26 2012-07-16 2012-03-07
revenue 2787965087 961000000 880674609 1084939099 284139100

Data Preparation

In [5]:
df_genre = pd.DataFrame(columns = ['genre', 'cgenres','budget', 'revenue', 'day', 'month', 'year'])

def dataPrep(row):
    global df_genre
    d = {}
    genres = np.array([g['name'] for g in ast.literal_eval(row['genres'])])
    n = genres.size
    d['budget'] = [row['budget']]*n
    d['revenue'] = [row['revenue']]*n
    d.update(zip(('year', 'month', 'day'),  map (int, row['release_date'].split('-'))))
    d['genre'], d['cgenres'] = [], []
    for genre in genres:
        d['genre'].append(genre)
        d['cgenres'].append(genres[genres != genre])
    df_genre = df_genre.append(pd.DataFrame(d), ignore_index = True, sort = True)


df_clean.apply(dataPrep, axis = 1)
df_genre = df_genre[['genre', 'budget', 'revenue', 'day', 'month', 'year', 'cgenres']]
df_genre = df_genre.infer_objects()
In [6]:
df_clean[['genres', 'release_date']].head(2)
Out[6]:
genres release_date
0 [{"id": 28, "name": "Action"}, {"id": 12, "nam... 2009-12-10
1 [{"id": 12, "name": "Adventure"}, {"id": 14, "... 2007-05-19
In [7]:
df_genre[['genre','cgenres','year']].head(7)
Out[7]:
genre cgenres year
0 Action [Adventure, Fantasy, Science Fiction] 2009
1 Adventure [Action, Fantasy, Science Fiction] 2009
2 Fantasy [Action, Adventure, Science Fiction] 2009
3 Science Fiction [Action, Adventure, Fantasy] 2009
4 Adventure [Fantasy, Action] 2007
5 Fantasy [Adventure, Action] 2007
6 Action [Adventure, Fantasy] 2007
In [8]:
print("entries before data preparation (rows, columns):", df_clean.shape)
print("entries after data preparation(rows, columns):", df_genre.shape)
entries before data preparation (rows, columns): (4800, 4)
entries after data preparation(rows, columns): (12154, 7)
In [12]:
genre_count = df_genre['genre'].value_counts()

df_g={}
df_g['genre'],df_g['count'] = genre_count.index, genre_count.values
df_gcount = pd.DataFrame(df_g)
plt.barh( df_gcount.genre,df_gcount['count'] )
plt.rcParams['figure.figsize'] = 20, 17
plt.ylabel('Genres', fontsize=15)
plt.xlabel('Number of movies per Genre', fontsize=15)

plt.show()
In [10]:
df_genre2 = pd.DataFrame(columns = ['budget', 'revenue', 'day', 'month', 'year'])

def dataPrep2(row):
    global df_genre2
    d = {}
    d['budget'] = [row['budget']]
    d['revenue'] = [row['revenue']]
    d.update(zip(('year', 'month', 'day'),  map (int, row['release_date'].split('-'))))

    df_genre2 = df_genre.append(pd.DataFrame(d), ignore_index = True, sort = True)


df_clean.apply(dataPrep2, axis = 1)


df_genre2['BudgetinMillions'] = df.apply(lambda row: row['budget'] / 1000000, axis=1 )
df_genre2['RevenueinMillions'] = df.apply(lambda row: row['revenue'] / 1000000, axis=1 )


g_gyear = df_genre2[['year','BudgetinMillions', 'RevenueinMillions']].groupby(['year']).sum().sort_index()


g_gyear.plot.area()
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x246ab590c18>

Question: Popularity based on movies duration:

In [16]:
df = df.drop_duplicates(['original_title'])
df_clean = df[['budget', 'genres', 'release_date', 'revenue', 'popularity','runtime']].dropna()
print("entries from cleaned dataset  (rows, columns):",df_clean.shape) # Get number of rows
df_clean.head().T

data = df_clean[[ 'runtime', 'popularity']]

med_budget = df_clean['runtime'].median()

s = data[data['runtime'] < 60 ]                              # under  1hour 
m = data[(data['runtime'] > 60)  & (data['runtime'] < 120) ] # between 1 and 2 hrs
l = data[data['runtime'] > 120 ]                             # more than 2hrs


df_s = s[[ 'runtime', 'popularity']]
df_m = m[[ 'runtime', 'popularity']]
df_l = l[[ 'runtime', 'popularity']]


for i,j in zip([df_s, df_m, df_l],['short_duration_movies', 'medium_duration_movies', 'long_duration_movies']):
    plt.scatter( i.popularity ,i.runtime, s=100, label = j +' :' + str(i.popularity.count()) + ' entries')
    plt.scatter( i.popularity.mean(),i.runtime.mean(), label ='Mean populatiy for ' + j,  marker='X', s=150)
    x=i.popularity.mean()
    y=i.runtime.mean()
    plt.text(x, y, str(x), fontsize ='17')

plt.legend(loc=2, fontsize = 'x-large')
plt.ylabel('Runtime', fontsize='17')
plt.xlabel('Popularity', fontsize='17')
entries from cleaned dataset  (rows, columns): (4798, 6)
Out[16]:
Text(0.5, 0, 'Popularity')

It seems that movies which are more than 2 hours long tend to have a higher popularity as compared to short (under 1 hour) or medium length movies (between 1 and 2hrs).

Question: Is popularity related to budget of the movies?

In [17]:
data = df_clean[[ 'budget', 'popularity']]

med_budget = df_clean['budget'].median()

high_budget_films = data[data['budget'] > med_budget ]
low_budget_films = data[data['budget'] < med_budget ]

datahigh = high_budget_films[[ 'budget', 'popularity']]
datalow = low_budget_films[[ 'budget', 'popularity']]

plt.scatter(datahigh.popularity, datahigh.budget, marker='^' , s=100, label = 'highbudgetmovies :' + str(datahigh.popularity.count()) + ' entries')
plt.scatter(datahigh.popularity.mean(), datahigh.budget.mean(), label ='Mean populatiy for highbudgetmovies',  marker='X', s=150)
x=datahigh.popularity.mean()
y=datahigh.budget.mean()
plt.text(x, y, " " + str(x), fontweight='bold', fontsize ='17')


plt.scatter(datalow.popularity, datalow.budget, marker ='v', s=100, label = 'lowbudgetmovies :'+ str(datalow.popularity.count()) + ' entries')
plt.scatter(datalow.popularity.mean(), datalow.budget.mean(), label ='Mean populatiy for lowbudgetmovies', marker='X', s=150)
x=datalow.popularity.mean()
y=datalow.budget.mean()
plt.text(x, y, " " + str(x), fontweight='bold', fontsize ='17')


plt.legend(loc=2, fontsize = 'x-large')
plt.ylabel('Budget', fontsize='17')
plt.xlabel('Popularity', fontsize='17')
Out[17]:
Text(0.5, 0, 'Popularity')

It is clear from the above graph that highbudgetmovies tend to have a higher popularity as compared to lowbudgetmovies.