Introduction¶

We will analyize IMBD movies dataset and try to find relations between different columns in this dataset such as runtime of movies and their budget and how they relate to popularity of movies.

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast

df = pd.read_csv(r'C:\Users\Jarnail\Desktop\IMDB Dataset\tmdb-movie-metadata\tmdb_5000_movies.csv')

df.head().T

Data Cleansing¶

print("entries from original dataset (rows, columns):", df.shape) # Get number of rows
df = df.drop_duplicates(['original_title'])
df_clean = df[['budget', 'genres', 'release_date', 'revenue']].dropna()
print("entries from cleaned dataset  (rows, columns):",df_clean.shape) # Get number of rows
df_clean.head().T

entries from original dataset (rows, columns): (4803, 20)
entries from cleaned dataset  (rows, columns): (4800, 4)

Data Preparation¶

df_genre = pd.DataFrame(columns = ['genre', 'cgenres','budget', 'revenue', 'day', 'month', 'year'])

def dataPrep(row):
    global df_genre
    d = {}
    genres = np.array([g['name'] for g in ast.literal_eval(row['genres'])])
    n = genres.size
    d['budget'] = [row['budget']]*n
    d['revenue'] = [row['revenue']]*n
    d.update(zip(('year', 'month', 'day'),  map (int, row['release_date'].split('-'))))
    d['genre'], d['cgenres'] = [], []
    for genre in genres:
        d['genre'].append(genre)
        d['cgenres'].append(genres[genres != genre])
    df_genre = df_genre.append(pd.DataFrame(d), ignore_index = True, sort = True)


df_clean.apply(dataPrep, axis = 1)
df_genre = df_genre[['genre', 'budget', 'revenue', 'day', 'month', 'year', 'cgenres']]
df_genre = df_genre.infer_objects()

df_clean[['genres', 'release_date']].head(2)

df_genre[['genre','cgenres','year']].head(7)

print("entries before data preparation (rows, columns):", df_clean.shape)
print("entries after data preparation(rows, columns):", df_genre.shape)

entries before data preparation (rows, columns): (4800, 4)
entries after data preparation(rows, columns): (12154, 7)

genre_count = df_genre['genre'].value_counts()

df_g={}
df_g['genre'],df_g['count'] = genre_count.index, genre_count.values
df_gcount = pd.DataFrame(df_g)
plt.barh( df_gcount.genre,df_gcount['count'] )
plt.rcParams['figure.figsize'] = 20, 17
plt.ylabel('Genres', fontsize=15)
plt.xlabel('Number of movies per Genre', fontsize=15)

plt.show()

df_genre2 = pd.DataFrame(columns = ['budget', 'revenue', 'day', 'month', 'year'])

def dataPrep2(row):
    global df_genre2
    d = {}
    d['budget'] = [row['budget']]
    d['revenue'] = [row['revenue']]
    d.update(zip(('year', 'month', 'day'),  map (int, row['release_date'].split('-'))))

    df_genre2 = df_genre.append(pd.DataFrame(d), ignore_index = True, sort = True)


df_clean.apply(dataPrep2, axis = 1)


df_genre2['BudgetinMillions'] = df.apply(lambda row: row['budget'] / 1000000, axis=1 )
df_genre2['RevenueinMillions'] = df.apply(lambda row: row['revenue'] / 1000000, axis=1 )


g_gyear = df_genre2[['year','BudgetinMillions', 'RevenueinMillions']].groupby(['year']).sum().sort_index()


g_gyear.plot.area()

<matplotlib.axes._subplots.AxesSubplot at 0x246ab590c18>

Question: Popularity based on movies duration:

df = df.drop_duplicates(['original_title'])
df_clean = df[['budget', 'genres', 'release_date', 'revenue', 'popularity','runtime']].dropna()
print("entries from cleaned dataset  (rows, columns):",df_clean.shape) # Get number of rows
df_clean.head().T

data = df_clean[[ 'runtime', 'popularity']]

med_budget = df_clean['runtime'].median()

s = data[data['runtime'] < 60 ]                              # under  1hour 
m = data[(data['runtime'] > 60)  & (data['runtime'] < 120) ] # between 1 and 2 hrs
l = data[data['runtime'] > 120 ]                             # more than 2hrs


df_s = s[[ 'runtime', 'popularity']]
df_m = m[[ 'runtime', 'popularity']]
df_l = l[[ 'runtime', 'popularity']]


for i,j in zip([df_s, df_m, df_l],['short_duration_movies', 'medium_duration_movies', 'long_duration_movies']):
    plt.scatter( i.popularity ,i.runtime, s=100, label = j +' :' + str(i.popularity.count()) + ' entries')
    plt.scatter( i.popularity.mean(),i.runtime.mean(), label ='Mean populatiy for ' + j,  marker='X', s=150)
    x=i.popularity.mean()
    y=i.runtime.mean()
    plt.text(x, y, str(x), fontsize ='17')

plt.legend(loc=2, fontsize = 'x-large')
plt.ylabel('Runtime', fontsize='17')
plt.xlabel('Popularity', fontsize='17')

entries from cleaned dataset  (rows, columns): (4798, 6)

Text(0.5, 0, 'Popularity')

It seems that movies which are more than 2 hours long tend to have a higher popularity as compared to short (under 1 hour) or medium length movies (between 1 and 2hrs).

Question: Is popularity related to budget of the movies?

data = df_clean[[ 'budget', 'popularity']]

med_budget = df_clean['budget'].median()

high_budget_films = data[data['budget'] > med_budget ]
low_budget_films = data[data['budget'] < med_budget ]

datahigh = high_budget_films[[ 'budget', 'popularity']]
datalow = low_budget_films[[ 'budget', 'popularity']]

plt.scatter(datahigh.popularity, datahigh.budget, marker='^' , s=100, label = 'highbudgetmovies :' + str(datahigh.popularity.count()) + ' entries')
plt.scatter(datahigh.popularity.mean(), datahigh.budget.mean(), label ='Mean populatiy for highbudgetmovies',  marker='X', s=150)
x=datahigh.popularity.mean()
y=datahigh.budget.mean()
plt.text(x, y, " " + str(x), fontweight='bold', fontsize ='17')


plt.scatter(datalow.popularity, datalow.budget, marker ='v', s=100, label = 'lowbudgetmovies :'+ str(datalow.popularity.count()) + ' entries')
plt.scatter(datalow.popularity.mean(), datalow.budget.mean(), label ='Mean populatiy for lowbudgetmovies', marker='X', s=150)
x=datalow.popularity.mean()
y=datalow.budget.mean()
plt.text(x, y, " " + str(x), fontweight='bold', fontsize ='17')


plt.legend(loc=2, fontsize = 'x-large')
plt.ylabel('Budget', fontsize='17')
plt.xlabel('Popularity', fontsize='17')

Text(0.5, 0, 'Popularity')

It is clear from the above graph that highbudgetmovies tend to have a higher popularity as compared to lowbudgetmovies.

	0	1	2	3	4
budget	237000000	300000000	245000000	250000000	260000000
genres	[{"id": 28, "name": "Action"}, {"id": 12, "nam...	[{"id": 12, "name": "Adventure"}, {"id": 14, "...	[{"id": 28, "name": "Action"}, {"id": 12, "nam...	[{"id": 28, "name": "Action"}, {"id": 80, "nam...	[{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage	http://www.avatarmovie.com/	http://disney.go.com/disneypictures/pirates/	http://www.sonypictures.com/movies/spectre/	http://www.thedarkknightrises.com/	http://movies.disney.com/john-carter
id	19995	285	206647	49026	49529
keywords	[{"id": 1463, "name": "culture clash"}, {"id":...	[{"id": 270, "name": "ocean"}, {"id": 726, "na...	[{"id": 470, "name": "spy"}, {"id": 818, "name...	[{"id": 849, "name": "dc comics"}, {"id": 853,...	[{"id": 818, "name": "based on novel"}, {"id":...
original_language	en	en	en	en	en
original_title	Avatar	Pirates of the Caribbean: At World's End	Spectre	The Dark Knight Rises	John Carter
overview	In the 22nd century, a paraplegic Marine is di...	Captain Barbossa, long believed to be dead, ha...	A cryptic message from Bond’s past sends him o...	Following the death of District Attorney Harve...	John Carter is a war-weary, former military ca...
popularity	150.438	139.083	107.377	112.313	43.927
production_companies	[{"name": "Ingenious Film Partners", "id": 289...	[{"name": "Walt Disney Pictures", "id": 2}, {"...	[{"name": "Columbia Pictures", "id": 5}, {"nam...	[{"name": "Legendary Pictures", "id": 923}, {"...	[{"name": "Walt Disney Pictures", "id": 2}]
production_countries	[{"iso_3166_1": "US", "name": "United States o...	[{"iso_3166_1": "US", "name": "United States o...	[{"iso_3166_1": "GB", "name": "United Kingdom"...	[{"iso_3166_1": "US", "name": "United States o...	[{"iso_3166_1": "US", "name": "United States o...
release_date	2009-12-10	2007-05-19	2015-10-26	2012-07-16	2012-03-07
revenue	2787965087	961000000	880674609	1084939099	284139100
runtime	162	169	148	165	132
spoken_languages	[{"iso_639_1": "en", "name": "English"}, {"iso...	[{"iso_639_1": "en", "name": "English"}]	[{"iso_639_1": "fr", "name": "Fran\u00e7ais"},...	[{"iso_639_1": "en", "name": "English"}]	[{"iso_639_1": "en", "name": "English"}]
status	Released	Released	Released	Released	Released
tagline	Enter the World of Pandora.	At the end of the world, the adventure begins.	A Plan No One Escapes	The Legend Ends	Lost in our world, found in another.
title	Avatar	Pirates of the Caribbean: At World's End	Spectre	The Dark Knight Rises	John Carter
vote_average	7.2	6.9	6.3	7.6	6.1
vote_count	11800	4500	4466	9106	2124

	0	1	2	3	4
budget	237000000	300000000	245000000	250000000	260000000
genres	[{"id": 28, "name": "Action"}, {"id": 12, "nam...	[{"id": 12, "name": "Adventure"}, {"id": 14, "...	[{"id": 28, "name": "Action"}, {"id": 12, "nam...	[{"id": 28, "name": "Action"}, {"id": 80, "nam...	[{"id": 28, "name": "Action"}, {"id": 12, "nam...
release_date	2009-12-10	2007-05-19	2015-10-26	2012-07-16	2012-03-07
revenue	2787965087	961000000	880674609	1084939099	284139100

	genres	release_date
0	[{"id": 28, "name": "Action"}, {"id": 12, "nam...	2009-12-10
1	[{"id": 12, "name": "Adventure"}, {"id": 14, "...	2007-05-19

	genre	cgenres	year
0	Action	[Adventure, Fantasy, Science Fiction]	2009
1	Adventure	[Action, Fantasy, Science Fiction]	2009
2	Fantasy	[Action, Adventure, Science Fiction]	2009
3	Science Fiction	[Action, Adventure, Fantasy]	2009
4	Adventure	[Fantasy, Action]	2007
5	Fantasy	[Adventure, Action]	2007
6	Action	[Adventure, Fantasy]	2007

Data Analysis with IMDB dataset

Introduction¶

Data Cleansing¶

Data Preparation¶

Referances:¶