We will analyze the IMDB movies dataset and look for relationships between its columns, such as how the runtime and budget of movies relate to their popularity.
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast
# Load the movie metadata from the local CSV export and preview it transposed.
df = pd.read_csv(r'C:\Users\Jarnail\Desktop\IMDB Dataset\tmdb-movie-metadata\tmdb_5000_movies.csv')
df.head().T
print("entries from original dataset (rows, columns):", df.shape)

# Keep one row per original title, then keep only the columns needed for the
# genre/budget/revenue analysis and drop rows missing any of them.
df = df.drop_duplicates(subset=['original_title'])
df_clean = df.loc[:, ['budget', 'genres', 'release_date', 'revenue']].dropna()
print("entries from cleaned dataset (rows, columns):", df_clean.shape)
df_clean.head().T
# Accumulator filled by dataPrep(): one row per (movie, genre) pair.
df_genre = pd.DataFrame(columns=['genre', 'cgenres', 'budget', 'revenue', 'day', 'month', 'year'])


def dataPrep(row):
    """Expand one movie row into one row per genre and add them to ``df_genre``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'genres'`` (a string holding a Python/JSON-style list
        of dicts, each with a ``'name'`` key), ``'budget'``, ``'revenue'`` and
        ``'release_date'`` (three integers separated by ``'-'``, read as
        year, month, day).

    Returns
    -------
    None
        The module-level ``df_genre`` is rebound to a frame that also holds
        the new rows. Each new row carries the genre name, the movie's other
        genres (``cgenres``, a numpy array), budget, revenue and the split
        release date. A movie with no genres adds nothing.
    """
    global df_genre
    genres = np.array([g['name'] for g in ast.literal_eval(row['genres'])])
    n = genres.size
    if n == 0:
        # Nothing to expand for a movie without genres.
        return

    year, month, day = map(int, row['release_date'].split('-'))
    expanded = pd.DataFrame({
        'genre': genres.tolist(),
        # Companion genres: every genre of this movie except the row's own.
        'cgenres': [genres[genres != genre] for genre in genres],
        'budget': [row['budget']] * n,
        'revenue': [row['revenue']] * n,
        'day': day,
        'month': month,
        'year': year,
    })

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. The empty accumulator is left out of the first concat so
    # its object-typed placeholder columns do not affect the result dtypes.
    # NOTE: re-concatenating on every call is quadratic in the row count.
    frames = [expanded] if df_genre.empty else [df_genre, expanded]
    df_genre = pd.concat(frames, ignore_index=True, sort=True)
# Feed every cleaned movie row through dataPrep, which fills df_genre.
df_clean.apply(dataPrep, axis='columns')

# Put the columns in presentation order and let pandas infer concrete dtypes
# for the object-typed columns.
df_genre = df_genre.loc[:, ['genre', 'budget', 'revenue', 'day', 'month', 'year', 'cgenres']].infer_objects()

# Before/after previews of the transformation.
df_clean[['genres', 'release_date']].head(2)
df_genre[['genre', 'cgenres', 'year']].head(7)
print("entries before data preparation (rows, columns):", df_clean.shape)
print("entries after data preparation(rows, columns):", df_genre.shape)
# Number of rows per genre in df_genre, most frequent first.
genre_count = df_genre['genre'].value_counts()
df_g = {'genre': genre_count.index, 'count': genre_count.values}
df_gcount = pd.DataFrame(df_g)

# rcParams['figure.figsize'] only affects figures created afterwards, so it
# has to be set before plt.barh implicitly creates the figure; setting it
# after the plot call (as before) left this chart at the previous size.
plt.rcParams['figure.figsize'] = 20, 17
plt.barh(df_gcount.genre, df_gcount['count'])
plt.ylabel('Genres', fontsize=15)
plt.xlabel('Number of movies per Genre', fontsize=15)
plt.show()
# Accumulator filled by dataPrep2(): one row per movie.
df_genre2 = pd.DataFrame(columns=['budget', 'revenue', 'day', 'month', 'year'])


def dataPrep2(row):
    """Add one movie's budget, revenue and split release date to ``df_genre2``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'budget'``, ``'revenue'`` and ``'release_date'``
        (three integers separated by ``'-'``, read as year, month, day).

    Returns
    -------
    None
        The module-level ``df_genre2`` is rebound to a frame that also holds
        the new row.
    """
    global df_genre2
    year, month, day = map(int, row['release_date'].split('-'))
    record = pd.DataFrame({
        'budget': [row['budget']],
        'revenue': [row['revenue']],
        'day': [day],
        'month': [month],
        'year': [year],
    })

    # Accumulate into df_genre2 itself. The previous code appended to
    # df_genre, so df_genre2 was overwritten on every call with the per-genre
    # table plus only the latest movie. DataFrame.append was also removed in
    # pandas 2.0; pd.concat is the supported replacement. The empty
    # accumulator is left out of the first concat so its object-typed
    # placeholder columns do not affect the result dtypes.
    frames = [record] if df_genre2.empty else [df_genre2, record]
    df_genre2 = pd.concat(frames, ignore_index=True, sort=True)
# Fill df_genre2 by running dataPrep2 over every cleaned movie row.
df_clean.apply(dataPrep2, axis=1)

# Derive the "in millions" columns from df_genre2's own budget/revenue.
# They were previously computed from df, whose index labels are unrelated to
# df_genre2's 0..N-1 index, so the values were aligned to the wrong rows.
df_genre2['BudgetinMillions'] = df_genre2['budget'] / 1000000
df_genre2['RevenueinMillions'] = df_genre2['revenue'] / 1000000

# Total budget and revenue per release year, in chronological order.
g_gyear = df_genre2[['year', 'BudgetinMillions', 'RevenueinMillions']].groupby(['year']).sum().sort_index()
g_gyear.plot.area()
Question: How does popularity relate to the duration of a movie?
# Rebuild the cleaned frame, this time also keeping popularity and runtime,
# and drop rows missing any of the selected columns.
df = df.drop_duplicates(subset=['original_title'])
df_clean = df.loc[:, ['budget', 'genres', 'release_date', 'revenue', 'popularity', 'runtime']].dropna()
print("entries from cleaned dataset (rows, columns):", df_clean.shape)
df_clean.head().T
data = df_clean[['runtime', 'popularity']]
# Median runtime. Not used in this section; the name is reassigned by the
# budget analysis further down.
med_budget = df_clean['runtime'].median()

# Split the movies into three runtime buckets (minutes). The medium bucket is
# inclusive at both ends: with strict comparisons on every bound, movies of
# exactly 60 or 120 minutes fell into no bucket and vanished from the plot.
s = data[data['runtime'] < 60]  # under 1 hour
m = data[(data['runtime'] >= 60) & (data['runtime'] <= 120)]  # 1 to 2 hours, inclusive
l = data[data['runtime'] > 120]  # more than 2 hours
df_s = s[['runtime', 'popularity']]
df_m = m[['runtime', 'popularity']]
df_l = l[['runtime', 'popularity']]

# For each bucket: scatter its movies, then mark and annotate the bucket mean.
for group, group_name in zip(
        [df_s, df_m, df_l],
        ['short_duration_movies', 'medium_duration_movies', 'long_duration_movies']):
    x = group.popularity.mean()
    y = group.runtime.mean()
    plt.scatter(group.popularity, group.runtime, s=100,
                label=group_name + ' :' + str(group.popularity.count()) + ' entries')
    plt.scatter(x, y, label='Mean populatiy for ' + group_name, marker='X', s=150)
    plt.text(x, y, str(x), fontsize='17')

plt.legend(loc=2, fontsize='x-large')
plt.ylabel('Runtime', fontsize='17')
plt.xlabel('Popularity', fontsize='17')
It seems that movies longer than 2 hours tend to be more popular than short movies (under 1 hour) or medium-length movies (between 1 and 2 hours).
Question: Is popularity related to the budget of a movie?
data = df_clean[['budget', 'popularity']]
med_budget = df_clean['budget'].median()

# Split at the median budget. Movies whose budget equals the median go into
# the low-budget group; with strict comparisons on both sides they belonged
# to neither group and were dropped from the plot.
high_budget_films = data[data['budget'] > med_budget]
low_budget_films = data[data['budget'] <= med_budget]
datahigh = high_budget_films[['budget', 'popularity']]
datalow = low_budget_films[['budget', 'popularity']]

# For each group: scatter its movies, then mark and annotate the group mean.
for films, group_name, marker in (
        (datahigh, 'highbudgetmovies', '^'),
        (datalow, 'lowbudgetmovies', 'v')):
    x = films.popularity.mean()
    y = films.budget.mean()
    plt.scatter(films.popularity, films.budget, marker=marker, s=100,
                label=group_name + ' :' + str(films.popularity.count()) + ' entries')
    plt.scatter(x, y, label='Mean populatiy for ' + group_name, marker='X', s=150)
    plt.text(x, y, " " + str(x), fontweight='bold', fontsize='17')

plt.legend(loc=2, fontsize='x-large')
plt.ylabel('Budget', fontsize='17')
plt.xlabel('Popularity', fontsize='17')
It is clear from the graph above that high-budget movies tend to have higher popularity than low-budget movies.