Final Project Algorithms¶
Graded project for an Algorithms course, built on the
highest_holywood_grossing_movies.csv
dataset. The goal of this notebook is to test and compare sorting algorithms.
Data read¶
In [7]:
Copied!
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from datetime import datetime
# Load the movie dataset from the CSV file with pandas.
df = pd.read_csv('highest_holywood_grossing_movies.csv')
# Print a summary of the dataset: columns, non-null counts and dtypes.
df.info()
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from datetime import datetime
# Load the movie dataset from the CSV file with pandas.
df = pd.read_csv('highest_holywood_grossing_movies.csv')
# Print a summary of the dataset: columns, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 918 entries, 0 to 917 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 918 non-null int64 1 Title 918 non-null object 2 Movie Info 918 non-null object 3 Distributor 918 non-null object 4 Release Date 800 non-null object 5 Domestic Sales (in $) 918 non-null int64 6 International Sales (in $) 918 non-null int64 7 World Sales (in $) 918 non-null int64 8 Genre 918 non-null object 9 Movie Runtime 918 non-null object 10 License 744 non-null object dtypes: int64(4), object(7) memory usage: 79.0+ KB
Warner Movies¶
Iterating over the dataset and selecting Warner Bros. movies¶
In [8]:
Copied!
# Running through the list and ordering
class Movie:
    """Plain record for the dataset fields this notebook works with.

    The constructor arguments are stored unchanged under the attribute
    names ``Title``, ``World_Sales``, ``Distributor`` and ``Release``.
    """

    def __init__(self, world_sales, distributor, release, title):
        # Single tuple assignment; attributes are created in the same order as before.
        self.Title, self.World_Sales, self.Distributor, self.Release = (
            title,
            world_sales,
            distributor,
            release,
        )
# Filled by select_warner_movies with the rows whose distributor is 'Warner Bros.'.
warner_bros_movies = []
# Filled by order_movies with one Movie per row.
movies = []
def order_movies(array):
    """Append one ``Movie`` per row of ``array`` to the module-level ``movies`` list.

    ``array`` is indexed first by column name and then by the integers
    ``0 .. len(array['Distributor']) - 1``. Despite its name, this function
    performs no sorting; it only builds the list.
    """
    row_count = len(array['Distributor'])
    for row in range(row_count):
        movies.append(
            Movie(
                world_sales=array['World Sales (in $)'][row],
                distributor=array['Distributor'][row],
                release=array['Release Date'][row],
                title=array['Title'][row],
            )
        )
def select_warner_movies(array):
    """Append a ``Movie`` to ``warner_bros_movies`` for each 'Warner Bros.' row.

    ``array`` is indexed first by column name and then by the integers
    ``0 .. len(array['Distributor']) - 1``.
    """
    for row in range(len(array['Distributor'])):
        # Guard clause: ignore rows of every other distributor.
        if array['Distributor'][row] != 'Warner Bros.':
            continue
        entry = Movie(
            array['World Sales (in $)'][row],
            array['Distributor'][row],
            array['Release Date'][row],
            array['Title'][row],
        )
        warner_bros_movies.append(entry)
# Start of the timed section.
date = datetime.now()
# Both passes over the DataFrame are inside the timed section.
select_warner_movies(df)
order_movies(df)
# Elapsed wall-clock time as a timedelta.
date_end = datetime.now() - date
total_warner_bros_movies_inline = len(warner_bros_movies)
print(f"Total Warner Bros Movies: {total_warner_bros_movies_inline}")
print("\nTime after running through the list with inline code: ", date_end)
# Running through the list and ordering
class Movie:
    """Plain record for the dataset fields this notebook works with.

    The constructor arguments are stored unchanged under the attribute
    names ``Title``, ``World_Sales``, ``Distributor`` and ``Release``.
    """

    def __init__(self, world_sales, distributor, release, title):
        # Single tuple assignment; attributes are created in the same order as before.
        self.Title, self.World_Sales, self.Distributor, self.Release = (
            title,
            world_sales,
            distributor,
            release,
        )
# Filled by select_warner_movies with the rows whose distributor is 'Warner Bros.'.
warner_bros_movies = []
# Filled by order_movies with one Movie per row.
movies = []
def order_movies(array):
    """Append one ``Movie`` per row of ``array`` to the module-level ``movies`` list.

    ``array`` is indexed first by column name and then by the integers
    ``0 .. len(array['Distributor']) - 1``. Despite its name, this function
    performs no sorting; it only builds the list.
    """
    row_count = len(array['Distributor'])
    for row in range(row_count):
        movies.append(
            Movie(
                world_sales=array['World Sales (in $)'][row],
                distributor=array['Distributor'][row],
                release=array['Release Date'][row],
                title=array['Title'][row],
            )
        )
def select_warner_movies(array):
    """Append a ``Movie`` to ``warner_bros_movies`` for each 'Warner Bros.' row.

    ``array`` is indexed first by column name and then by the integers
    ``0 .. len(array['Distributor']) - 1``.
    """
    for row in range(len(array['Distributor'])):
        # Guard clause: ignore rows of every other distributor.
        if array['Distributor'][row] != 'Warner Bros.':
            continue
        entry = Movie(
            array['World Sales (in $)'][row],
            array['Distributor'][row],
            array['Release Date'][row],
            array['Title'][row],
        )
        warner_bros_movies.append(entry)
# Start of the timed section.
date = datetime.now()
# Both passes over the DataFrame are inside the timed section.
select_warner_movies(df)
order_movies(df)
# Elapsed wall-clock time as a timedelta.
date_end = datetime.now() - date
total_warner_bros_movies_inline = len(warner_bros_movies)
print(f"Total Warner Bros Movies: {total_warner_bros_movies_inline}")
print("\nTime after running through the list with inline code: ", date_end)
Total Warner Bros Movies: 158 Time after running through the list with inline code: 0:00:00.051884
Ordering by distributor name then selecting¶
In [9]:
Copied!
# Ordering by Distributor and then selecting.
# Receives the entries picked out of the sorted list later in this cell.
warner_bros_movies_by_sort = []
def bubble_sort(lista):
    """Sort ``lista`` in place by each element's ``Distributor`` and return it.

    Whole elements are exchanged, so every object keeps its own attributes
    and only its position in the list changes. Swapping just the
    ``Distributor`` attribute would attach distributors to the wrong records.
    Elements with equal distributors keep their relative order.

    Args:
        lista: mutable sequence of objects exposing a ``Distributor`` attribute.

    Returns:
        The same ``lista`` object, now sorted ascending by ``Distributor``.
    """
    elements = len(lista) - 1
    ordered = False
    while not ordered:
        ordered = True
        for i in range(elements):
            if lista[i].Distributor > lista[i + 1].Distributor:
                lista[i], lista[i + 1] = lista[i + 1], lista[i]
                ordered = False
        # The largest remaining element is now in its final slot,
        # so the next pass can stop one position earlier.
        elements -= 1
    return lista
# Start of the timed section: sort everything, then pick the Warner Bros. run.
date = datetime.now()
ordered_list = bubble_sort(movies)
# Scan the sorted list from the end. Entries whose distributor sorts after
# 'Warner Bros.' are skipped, so the selection no longer relies on
# 'Warner Bros.' being the largest distributor name in the dataset.
for movie in reversed(ordered_list):
    if movie.Distributor > 'Warner Bros.':
        continue
    if movie.Distributor != 'Warner Bros.':
        # Past the contiguous 'Warner Bros.' run; nothing further can match.
        break
    warner_bros_movies_by_sort.append(movie)
date_end = datetime.now() - date
total_warner_bros_movies = len(warner_bros_movies_by_sort)
print(f"Total Warner Brothers Movies: {total_warner_bros_movies}")
print('\nTime took by bubble sort algotihm:', date_end)
# Ordering by Distributor and then selecting.
# Receives the entries picked out of the sorted list later in this cell.
warner_bros_movies_by_sort = []
def bubble_sort(lista):
    """Sort ``lista`` in place by each element's ``Distributor`` and return it.

    Whole elements are exchanged, so every object keeps its own attributes
    and only its position in the list changes. Swapping just the
    ``Distributor`` attribute would attach distributors to the wrong records.
    Elements with equal distributors keep their relative order.

    Args:
        lista: mutable sequence of objects exposing a ``Distributor`` attribute.

    Returns:
        The same ``lista`` object, now sorted ascending by ``Distributor``.
    """
    elements = len(lista) - 1
    ordered = False
    while not ordered:
        ordered = True
        for i in range(elements):
            if lista[i].Distributor > lista[i + 1].Distributor:
                lista[i], lista[i + 1] = lista[i + 1], lista[i]
                ordered = False
        # The largest remaining element is now in its final slot,
        # so the next pass can stop one position earlier.
        elements -= 1
    return lista
# Start of the timed section: sort everything, then pick the Warner Bros. run.
date = datetime.now()
ordered_list = bubble_sort(movies)
# Scan the sorted list from the end. Entries whose distributor sorts after
# 'Warner Bros.' are skipped, so the selection no longer relies on
# 'Warner Bros.' being the largest distributor name in the dataset.
for movie in reversed(ordered_list):
    if movie.Distributor > 'Warner Bros.':
        continue
    if movie.Distributor != 'Warner Bros.':
        # Past the contiguous 'Warner Bros.' run; nothing further can match.
        break
    warner_bros_movies_by_sort.append(movie)
date_end = datetime.now() - date
total_warner_bros_movies = len(warner_bros_movies_by_sort)
print(f"Total Warner Brothers Movies: {total_warner_bros_movies}")
print('\nTime took by bubble sort algotihm:', date_end)
Total Warner Brothers Movies: 158 Time took by bubble sort algotihm: 0:00:00.243218
Verifying methods to retrieve min and max sales¶
In [10]:
Copied!
def partition(array, low, high):
    """Partition ``array[low..high]`` in place around its last element.

    Args:
        array: mutable sequence being sorted.
        low: starting index of the range (inclusive).
        high: ending index of the range (inclusive); ``array[high]`` is the pivot.

    Returns:
        The index where the pivot ends up. Everything left of it in the
        range is ``<=`` the pivot, everything right of it is greater.
    """
    pivot = array[high]
    boundary = low  # next slot for a value that is <= pivot
    for scan in range(low, high):
        if array[scan] <= pivot:
            array[boundary], array[scan] = array[scan], array[boundary]
            boundary += 1
    # Drop the pivot between the two groups.
    array[boundary], array[high] = array[high], array[boundary]
    return boundary
def min_max(array):
    """Return ``(max, min)`` of a non-empty sequence using pairwise comparisons.

    Elements are consumed two at a time: the pair is ordered with one
    comparison, then the smaller is checked against the running minimum and
    the larger against the running maximum. The scan steps by two and covers
    every element, including sequences of length 1.

    Args:
        array: indexable, non-empty sequence of mutually comparable values.

    Returns:
        A tuple ``(largest, smallest)``.

    Raises:
        IndexError: if ``array`` is empty.
    """
    size = len(array)
    if size == 0:
        raise IndexError("min_max() requires a non-empty sequence")

    if size % 2 == 0:
        # Even length: seed from the first pair.
        if array[0] > array[1]:
            largest, smallest = array[0], array[1]
        else:
            largest, smallest = array[1], array[0]
        start = 2
    else:
        # Odd length: seed from the first element so the rest pairs up evenly.
        largest = smallest = array[0]
        start = 1

    for i in range(start, size - 1, 2):
        first, second = array[i], array[i + 1]
        if first < second:
            pair_low, pair_high = first, second
        else:
            pair_low, pair_high = second, first
        if pair_low < smallest:
            smallest = pair_low
        if pair_high > largest:
            largest = pair_high
    return (largest, smallest)
def quick_sort(array, low, high):
    """Sort ``array[low..high]`` (inclusive) in place with an iterative quicksort.

    Pending sub-ranges are kept on an explicit stack instead of the call
    stack. Ranges with fewer than two elements are already sorted and
    return immediately, so empty and single-element inputs are handled.

    Args:
        array: mutable sequence to sort.
        low: starting index of the range (inclusive).
        high: ending index of the range (inclusive).

    Returns:
        None. ``array`` is modified in place.
    """
    if low >= high:
        return

    # Dynamic stack of (low, high) ranges still to be partitioned.
    stack = [(low, high)]
    while stack:
        low, high = stack.pop()
        p = partition(array, low, high)
        # Only ranges with at least two elements need further work.
        if p - 1 > low:
            stack.append((low, p - 1))
        if p + 1 < high:
            stack.append((p + 1, high))
def calc_max_min(array):
    """Print the min and max of ``array`` found two ways, with elapsed times.

    Method 1 sorts ``array`` in place with ``quick_sort`` and reads both
    ends. Method 2 calls ``min_max``. Because Method 1 runs first and sorts
    in place, Method 2 is executed on the already-sorted list.
    """
    print("\nMethod 1:\nMin and Max definition by function (quickSort)\n")
    started_at = datetime.now()
    last = len(array) - 1
    quick_sort(array, 0, last)
    min_val, max_val = array[0], array[last]
    elapsed_time = datetime.now() - started_at
    print(f"Elapsed Time: {elapsed_time}\nMin, Max: {min_val}, {max_val}")

    print("\nMethod 2:\nMin and Max definition by written function (min_max):\n")
    started_at = datetime.now()
    # min_max returns (max, min), in that order.
    max_val, min_val = min_max(array)
    elapsed_time = datetime.now() - started_at
    print(f"Elapsed Time: {elapsed_time}\nMin, Max: {min_val}, {max_val}")
def partition(array, low, high):
    """Partition ``array[low..high]`` in place around its last element.

    Args:
        array: mutable sequence being sorted.
        low: starting index of the range (inclusive).
        high: ending index of the range (inclusive); ``array[high]`` is the pivot.

    Returns:
        The index where the pivot ends up. Everything left of it in the
        range is ``<=`` the pivot, everything right of it is greater.
    """
    pivot = array[high]
    boundary = low  # next slot for a value that is <= pivot
    for scan in range(low, high):
        if array[scan] <= pivot:
            array[boundary], array[scan] = array[scan], array[boundary]
            boundary += 1
    # Drop the pivot between the two groups.
    array[boundary], array[high] = array[high], array[boundary]
    return boundary
def min_max(array):
    """Return ``(max, min)`` of a non-empty sequence using pairwise comparisons.

    Elements are consumed two at a time: the pair is ordered with one
    comparison, then the smaller is checked against the running minimum and
    the larger against the running maximum. The scan steps by two and covers
    every element, including sequences of length 1.

    Args:
        array: indexable, non-empty sequence of mutually comparable values.

    Returns:
        A tuple ``(largest, smallest)``.

    Raises:
        IndexError: if ``array`` is empty.
    """
    size = len(array)
    if size == 0:
        raise IndexError("min_max() requires a non-empty sequence")

    if size % 2 == 0:
        # Even length: seed from the first pair.
        if array[0] > array[1]:
            largest, smallest = array[0], array[1]
        else:
            largest, smallest = array[1], array[0]
        start = 2
    else:
        # Odd length: seed from the first element so the rest pairs up evenly.
        largest = smallest = array[0]
        start = 1

    for i in range(start, size - 1, 2):
        first, second = array[i], array[i + 1]
        if first < second:
            pair_low, pair_high = first, second
        else:
            pair_low, pair_high = second, first
        if pair_low < smallest:
            smallest = pair_low
        if pair_high > largest:
            largest = pair_high
    return (largest, smallest)
def quick_sort(array, low, high):
    """Sort ``array[low..high]`` (inclusive) in place with an iterative quicksort.

    Pending sub-ranges are kept on an explicit stack instead of the call
    stack. Ranges with fewer than two elements are already sorted and
    return immediately, so empty and single-element inputs are handled.

    Args:
        array: mutable sequence to sort.
        low: starting index of the range (inclusive).
        high: ending index of the range (inclusive).

    Returns:
        None. ``array`` is modified in place.
    """
    if low >= high:
        return

    # Dynamic stack of (low, high) ranges still to be partitioned.
    stack = [(low, high)]
    while stack:
        low, high = stack.pop()
        p = partition(array, low, high)
        # Only ranges with at least two elements need further work.
        if p - 1 > low:
            stack.append((low, p - 1))
        if p + 1 < high:
            stack.append((p + 1, high))
def calc_max_min(array):
    """Print the min and max of ``array`` found two ways, with elapsed times.

    Method 1 sorts ``array`` in place with ``quick_sort`` and reads both
    ends. Method 2 calls ``min_max``. Because Method 1 runs first and sorts
    in place, Method 2 is executed on the already-sorted list.
    """
    print("\nMethod 1:\nMin and Max definition by function (quickSort)\n")
    started_at = datetime.now()
    last = len(array) - 1
    quick_sort(array, 0, last)
    min_val, max_val = array[0], array[last]
    elapsed_time = datetime.now() - started_at
    print(f"Elapsed Time: {elapsed_time}\nMin, Max: {min_val}, {max_val}")

    print("\nMethod 2:\nMin and Max definition by written function (min_max):\n")
    started_at = datetime.now()
    # min_max returns (max, min), in that order.
    max_val, min_val = min_max(array)
    elapsed_time = datetime.now() - started_at
    print(f"Elapsed Time: {elapsed_time}\nMin, Max: {min_val}, {max_val}")
Creating array with sales amount¶
In [11]:
Copied!
# Worldwide sales of every Warner Bros. movie, in selection order.
# A comprehension keeps the loop variable out of the notebook namespace.
sales = [movie.World_Sales for movie in warner_bros_movies]
calc_max_min(sales)
# Worldwide sales of every Warner Bros. movie, in selection order.
# A comprehension keeps the loop variable out of the notebook namespace.
sales = [movie.World_Sales for movie in warner_bros_movies]
calc_max_min(sales)
Method 1: Min and Max definition by ordering (quickSort) Elapsed Time: 0:00:00.000336 Min, Max: 97470701, 1342321665 Method 2: Min and Max definition by function (min_max): Elapsed Time: 0:00:00.000048 Min, Max: 97470701, 1342321665
In [12]:
Copied!
def mean_calc(list):
    """Return the arithmetic mean of ``list``: its sum divided by its length."""
    total = sum(list)
    count = len(list)
    return total / count
def variance_calc(list, ddof=0):
    """Return the variance of ``list``.

    The sum of squared deviations from the mean is divided by
    ``len(list) - ddof``. ``ddof`` sets the degrees of freedom removed
    from the divisor: the default 0 divides by the number of observations,
    while 1 is the usual choice when working with samples.
    """
    center = mean_calc(list)
    squared_deviations = ((value - center) ** 2 for value in list)
    divisor = len(list) - ddof
    return sum(squared_deviations) / divisor
def std_deviation_calc(list, ddof=0):
    """Return the standard deviation of ``list``, the square root of its variance.

    Args:
        list: sequence of numbers.
        ddof: degrees of freedom forwarded to ``variance_calc``. The default
            0 keeps the previous behaviour; pass 1 for a sample standard
            deviation.
    """
    return math.sqrt(variance_calc(list, ddof))
# Report mean, variance and standard deviation of the sales list.
print("\nMean of Sales: ", mean_calc(sales))
print("\nVariance of Sales: ", variance_calc(sales))
print("\nStandard Deviation of Sales: ", std_deviation_calc(sales))
def mean_calc(list):
    """Return the arithmetic mean of ``list``: its sum divided by its length."""
    total = sum(list)
    count = len(list)
    return total / count
def variance_calc(list, ddof=0):
    """Return the variance of ``list``.

    The sum of squared deviations from the mean is divided by
    ``len(list) - ddof``. ``ddof`` sets the degrees of freedom removed
    from the divisor: the default 0 divides by the number of observations,
    while 1 is the usual choice when working with samples.
    """
    center = mean_calc(list)
    squared_deviations = ((value - center) ** 2 for value in list)
    divisor = len(list) - ddof
    return sum(squared_deviations) / divisor
def std_deviation_calc(list, ddof=0):
    """Return the standard deviation of ``list``, the square root of its variance.

    Args:
        list: sequence of numbers.
        ddof: degrees of freedom forwarded to ``variance_calc``. The default
            0 keeps the previous behaviour; pass 1 for a sample standard
            deviation.
    """
    return math.sqrt(variance_calc(list, ddof))
# Report mean, variance and standard deviation of the sales list.
print("\nMean of Sales: ", mean_calc(sales))
print("\nVariance of Sales: ", variance_calc(sales))
print("\nStandard Deviation of Sales: ", std_deviation_calc(sales))
Mean of Sales: 402515784.37341774 Variance of Sales: 6.557122119540581e+16 Standard Deviation of Sales: 256068782.15707162