Finding the Relationship between Seasons, Genres, and more in Anime¶

Sharath Kannan and Greg Dellamura

Introduction¶

Anime is hand-drawn and computer-generated animation that originated in Japan. In English, "anime" refers specifically to Japanese animation, while in Japanese the word refers to animation in general. The origins of anime can be traced back to 1917, but it did not develop a distinct identity until the 1960s, when it began reaching a wider audience, and it has become increasingly popular over the past decade or two. Anime is often an adaptation of manga (Japanese comics), light novels, or video games. Some popular examples are Pokemon, Naruto, Dragon Ball, Attack on Titan, and Demon Slayer.

Compared to Western animation, the art style is very diverse and character features can vary widely. The most iconic characteristic of anime characters is their large, emotive eyes. The animation also tends to focus less on movement and more on detailed settings and camera effects.

The main question we will be answering is whether the popularity of an anime is determined by factors other than its quality, such as the season it was released in, the number of episodes it has, or whether it falls into a certain genre group.


Data Set¶

Data was obtained from Kaggle.com, which web scraped MyAnimeList.net. MyAnimeList is a volunteer-run social networking and cataloging site for anime and manga. The site provides a list of all anime and manga that members can personally organize and score. In 2015, the site received 120 million visitors a month. The dataset has been compiled on Kaggle: https://www.kaggle.com/datasets/harits/anime-database-2022

In [63]:
#from google.colab import drive
#drive.mount('/content/drive')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from datetime import datetime as dt
import re
In [64]:
# suppress unnecessary warning
pd.options.mode.chained_assignment = None  # default='warn'

an_df = pd.read_csv("AnimeRecent.csv")

Feature Engineering¶

The dataset has many unwanted features and values that are not in the right format to answer our hypothesis. We need to make sure the dataset is ready for data analysis and model use.

In [65]:
an_df = an_df.drop(["Synopsis", "Synonyms", "Japanese", "ID", "Status"], axis=1)
an_df.head()
Out[65]:
Title English Type Episodes Start_Aired End_Aired Premiered Broadcast Producers Licensors ... Themes Demographics Duration_Minutes Rating Score Scored_Users Ranked Popularity Members Favorites
0 Shingeki no Kyojin Attack on Titan TV 25.0 Apr 7, 2013 Sep 29, 2013 Spring 2013 Sundays at 0158 (JST) Production I.G, Dentsu, Mainichi Broadcasting ... Funimation ... Gore, Military, Survival Shounen 24.0 R - 17+ (violence & profanity) 8.531 519803.0 1002.0 1 3524109 155695
1 Death Note Death Note TV 37.0 Oct 4, 2006 Jun 27, 2007 Fall 2006 Wednesdays at 0056 (JST) VAP, Konami, Ashi Productions, Nippon Televisi... VIZ Media ... Psychological Shounen 23.0 R - 17+ (violence & profanity) 8.621 485487.0 732.0 2 3504535 159701
2 Fullmetal Alchemist: Brotherhood Fullmetal Alchemist Brotherhood TV 64.0 Apr 5, 2009 Jul 4, 2010 Spring 2009 Sundays at 1700 (JST) Aniplex, Square Enix, Mainichi Broadcasting Sy... Funimation, Aniplex of America ... Military Shounen 24.0 R - 17+ (violence & profanity) 9.131 900398.0 12.0 3 2978455 207772
3 One Punch Man One Punch Man TV 12.0 Oct 5, 2015 Dec 21, 2015 Fall 2015 Mondays at 0105 (JST) TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba... VIZ Media ... Parody, Super Power Seinen 24.0 R - 17+ (violence & profanity) 8.511 19066.0 1112.0 4 2879907 59651
4 Sword Art Online Sword Art Online TV 25.0 Jul 8, 2012 Dec 23, 2012 Summer 2012 Sundays at 0000 (JST) Aniplex, Genco, DAX Production, ASCII Media Wo... Aniplex of America ... Love Polygon, Video Game Unknown 23.0 PG-13 - Teens 13 or older 7.201 990254.0 29562.0 5 2813565 64997

5 rows × 23 columns

We dropped Synopsis, Synonyms, Japanese, ID, and Status because they serve no purpose for our hypothesis.

In [66]:
#Remove Music Type
an_df.loc[an_df['Type'] == 'Music'] = np.NaN

#Remove Unknown Type
an_df.loc[an_df['Type'] == 'Unknown'] = np.NaN
an_df = an_df.dropna()
an_df.head()
Out[66]:
Title English Type Episodes Start_Aired End_Aired Premiered Broadcast Producers Licensors ... Themes Demographics Duration_Minutes Rating Score Scored_Users Ranked Popularity Members Favorites
0 Shingeki no Kyojin Attack on Titan TV 25.0 Apr 7, 2013 Sep 29, 2013 Spring 2013 Sundays at 0158 (JST) Production I.G, Dentsu, Mainichi Broadcasting ... Funimation ... Gore, Military, Survival Shounen 24.0 R - 17+ (violence & profanity) 8.531 519803.0 1002.0 1.0 3524109.0 155695.0
1 Death Note Death Note TV 37.0 Oct 4, 2006 Jun 27, 2007 Fall 2006 Wednesdays at 0056 (JST) VAP, Konami, Ashi Productions, Nippon Televisi... VIZ Media ... Psychological Shounen 23.0 R - 17+ (violence & profanity) 8.621 485487.0 732.0 2.0 3504535.0 159701.0
2 Fullmetal Alchemist: Brotherhood Fullmetal Alchemist Brotherhood TV 64.0 Apr 5, 2009 Jul 4, 2010 Spring 2009 Sundays at 1700 (JST) Aniplex, Square Enix, Mainichi Broadcasting Sy... Funimation, Aniplex of America ... Military Shounen 24.0 R - 17+ (violence & profanity) 9.131 900398.0 12.0 3.0 2978455.0 207772.0
3 One Punch Man One Punch Man TV 12.0 Oct 5, 2015 Dec 21, 2015 Fall 2015 Mondays at 0105 (JST) TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba... VIZ Media ... Parody, Super Power Seinen 24.0 R - 17+ (violence & profanity) 8.511 19066.0 1112.0 4.0 2879907.0 59651.0
4 Sword Art Online Sword Art Online TV 25.0 Jul 8, 2012 Dec 23, 2012 Summer 2012 Sundays at 0000 (JST) Aniplex, Genco, DAX Production, ASCII Media Wo... Aniplex of America ... Love Polygon, Video Game Unknown 23.0 PG-13 - Teens 13 or older 7.201 990254.0 29562.0 5.0 2813565.0 64997.0

5 rows × 23 columns

We changed the "Unknown" anime Type to NaN because it is easier to work with when plotting. We also changed Music to NaN because we do not want anime music videos to be part of our analysis.
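For reference, the same filtering can be written more directly; this is an equivalent sketch (not a cell we ran), assuming the same an_df as above:

# Keep only rows whose Type is neither 'Music' nor 'Unknown', then drop
# any remaining rows with missing values, matching the cell above
an_df = an_df[~an_df['Type'].isin(['Music', 'Unknown'])].dropna()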

In [67]:
an_df = an_df.replace('R - 17+ (violence & profanity)', 'R-17+')
an_df = an_df.replace('PG-13 - Teens 13 or older', 'PG-13')
an_df = an_df.replace('R+ - Mild Nudity', 'R+')
an_df = an_df.replace('G - All Ages', 'G')
an_df = an_df.replace('PG - Children', 'PG')
an_df = an_df.replace('None', 'NR')
an_df.head()
Out[67]:
Title English Type Episodes Start_Aired End_Aired Premiered Broadcast Producers Licensors ... Themes Demographics Duration_Minutes Rating Score Scored_Users Ranked Popularity Members Favorites
0 Shingeki no Kyojin Attack on Titan TV 25.0 Apr 7, 2013 Sep 29, 2013 Spring 2013 Sundays at 0158 (JST) Production I.G, Dentsu, Mainichi Broadcasting ... Funimation ... Gore, Military, Survival Shounen 24.0 R-17+ 8.531 519803.0 1002.0 1.0 3524109.0 155695.0
1 Death Note Death Note TV 37.0 Oct 4, 2006 Jun 27, 2007 Fall 2006 Wednesdays at 0056 (JST) VAP, Konami, Ashi Productions, Nippon Televisi... VIZ Media ... Psychological Shounen 23.0 R-17+ 8.621 485487.0 732.0 2.0 3504535.0 159701.0
2 Fullmetal Alchemist: Brotherhood Fullmetal Alchemist Brotherhood TV 64.0 Apr 5, 2009 Jul 4, 2010 Spring 2009 Sundays at 1700 (JST) Aniplex, Square Enix, Mainichi Broadcasting Sy... Funimation, Aniplex of America ... Military Shounen 24.0 R-17+ 9.131 900398.0 12.0 3.0 2978455.0 207772.0
3 One Punch Man One Punch Man TV 12.0 Oct 5, 2015 Dec 21, 2015 Fall 2015 Mondays at 0105 (JST) TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba... VIZ Media ... Parody, Super Power Seinen 24.0 R-17+ 8.511 19066.0 1112.0 4.0 2879907.0 59651.0
4 Sword Art Online Sword Art Online TV 25.0 Jul 8, 2012 Dec 23, 2012 Summer 2012 Sundays at 0000 (JST) Aniplex, Genco, DAX Production, ASCII Media Wo... Aniplex of America ... Love Polygon, Video Game Unknown 23.0 PG-13 7.201 990254.0 29562.0 5.0 2813565.0 64997.0

5 rows × 23 columns

We simplified the ratings by removing the explanatory text after each rating label, which makes the plots easier to read.
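The same cleanup can be expressed as a single mapping, which makes it easier to adjust labels later. A sketch of an equivalent alternative (assuming the same label strings as above):

# One replace() call with a mapping instead of six separate calls
rating_map = {
    'R - 17+ (violence & profanity)': 'R-17+',
    'PG-13 - Teens 13 or older': 'PG-13',
    'R+ - Mild Nudity': 'R+',
    'G - All Ages': 'G',
    'PG - Children': 'PG',
    'None': 'NR',
}
an_df = an_df.replace(rating_map)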

In [68]:
an_df = an_df.replace('Kids, Shounen', 'Shounen')
an_df = an_df.replace('Kids, Shoujo', 'Shoujo')
an_df = an_df.replace('Josei, Shoujo', 'Shoujo')
an_df = an_df.replace('Kids, Seinen', 'Seinen')
an_df.head()
Out[68]:
Title English Type Episodes Start_Aired End_Aired Premiered Broadcast Producers Licensors ... Themes Demographics Duration_Minutes Rating Score Scored_Users Ranked Popularity Members Favorites
0 Shingeki no Kyojin Attack on Titan TV 25.0 Apr 7, 2013 Sep 29, 2013 Spring 2013 Sundays at 0158 (JST) Production I.G, Dentsu, Mainichi Broadcasting ... Funimation ... Gore, Military, Survival Shounen 24.0 R-17+ 8.531 519803.0 1002.0 1.0 3524109.0 155695.0
1 Death Note Death Note TV 37.0 Oct 4, 2006 Jun 27, 2007 Fall 2006 Wednesdays at 0056 (JST) VAP, Konami, Ashi Productions, Nippon Televisi... VIZ Media ... Psychological Shounen 23.0 R-17+ 8.621 485487.0 732.0 2.0 3504535.0 159701.0
2 Fullmetal Alchemist: Brotherhood Fullmetal Alchemist Brotherhood TV 64.0 Apr 5, 2009 Jul 4, 2010 Spring 2009 Sundays at 1700 (JST) Aniplex, Square Enix, Mainichi Broadcasting Sy... Funimation, Aniplex of America ... Military Shounen 24.0 R-17+ 9.131 900398.0 12.0 3.0 2978455.0 207772.0
3 One Punch Man One Punch Man TV 12.0 Oct 5, 2015 Dec 21, 2015 Fall 2015 Mondays at 0105 (JST) TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba... VIZ Media ... Parody, Super Power Seinen 24.0 R-17+ 8.511 19066.0 1112.0 4.0 2879907.0 59651.0
4 Sword Art Online Sword Art Online TV 25.0 Jul 8, 2012 Dec 23, 2012 Summer 2012 Sundays at 0000 (JST) Aniplex, Genco, DAX Production, ASCII Media Wo... Aniplex of America ... Love Polygon, Video Game Unknown 23.0 PG-13 7.201 990254.0 29562.0 5.0 2813565.0 64997.0

5 rows × 23 columns

When an anime has more than one Demographic, we reduced it to the major one. Shounen and Shoujo are the two most popular demographics.

In [69]:
#Extracting the Season and Year of the Premiere and making them their own separate columns
an_df = an_df.assign(Season=np.nan)
an_df = an_df.assign(Year=np.nan)
for i in range(0, len(an_df['Premiered'])):
    premiered = an_df['Premiered'].iloc[i]
    if 'Spring' in premiered :
        an_df['Season'].iloc[i] = 'Spring'
    elif 'Fall' in premiered :
        an_df['Season'].iloc[i] = 'Fall'
    elif 'Winter' in premiered :
        an_df['Season'].iloc[i] = 'Winter'
    elif 'Summer' in premiered :
        an_df['Season'].iloc[i] = 'Summer'
    if 'Unknown' in premiered :
        an_df['Season'].iloc[i] = 'Unknown'
        an_df['Year'].iloc[i] = 'Unknown'
    else :
        an_df['Year'].iloc[i] = premiered[len(premiered) - 4 : len(premiered)]   
an_df = an_df.drop(["Premiered"], axis = 1)
an_df.head()
Out[69]:
Title English Type Episodes Start_Aired End_Aired Broadcast Producers Licensors Studios ... Duration_Minutes Rating Score Scored_Users Ranked Popularity Members Favorites Season Year
0 Shingeki no Kyojin Attack on Titan TV 25.0 Apr 7, 2013 Sep 29, 2013 Sundays at 0158 (JST) Production I.G, Dentsu, Mainichi Broadcasting ... Funimation Wit Studio ... 24.0 R-17+ 8.531 519803.0 1002.0 1.0 3524109.0 155695.0 Spring 2013
1 Death Note Death Note TV 37.0 Oct 4, 2006 Jun 27, 2007 Wednesdays at 0056 (JST) VAP, Konami, Ashi Productions, Nippon Televisi... VIZ Media Madhouse ... 23.0 R-17+ 8.621 485487.0 732.0 2.0 3504535.0 159701.0 Fall 2006
2 Fullmetal Alchemist: Brotherhood Fullmetal Alchemist Brotherhood TV 64.0 Apr 5, 2009 Jul 4, 2010 Sundays at 1700 (JST) Aniplex, Square Enix, Mainichi Broadcasting Sy... Funimation, Aniplex of America Bones ... 24.0 R-17+ 9.131 900398.0 12.0 3.0 2978455.0 207772.0 Spring 2009
3 One Punch Man One Punch Man TV 12.0 Oct 5, 2015 Dec 21, 2015 Mondays at 0105 (JST) TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba... VIZ Media Madhouse ... 24.0 R-17+ 8.511 19066.0 1112.0 4.0 2879907.0 59651.0 Fall 2015
4 Sword Art Online Sword Art Online TV 25.0 Jul 8, 2012 Dec 23, 2012 Sundays at 0000 (JST) Aniplex, Genco, DAX Production, ASCII Media Wo... Aniplex of America A-1 Pictures ... 23.0 PG-13 7.201 990254.0 29562.0 5.0 2813565.0 64997.0 Summer 2012

5 rows × 24 columns

Next we split the Premiered column into two separate columns, Season and Year, to make the data easier to work with and plot. The majority of anime are released according to the seasons of the year: Spring, Summer, Fall, and Winter.
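The same split can be done without an explicit loop. A sketch of a vectorized alternative (it would replace the loop above and run before Premiered is dropped, assuming every non-"Unknown" Premiered value has the form "Season YYYY"):

# Split "Spring 2013" into season and year; rows that are just "Unknown"
# get 'Unknown' in both new columns, matching the loop above.
parts = an_df['Premiered'].str.split(' ', n=1, expand=True)
an_df['Season'] = parts[0]                  # 'Spring', 'Summer', 'Fall', 'Winter', or 'Unknown'
an_df['Year'] = parts[1].fillna('Unknown')  # rows with no year token stay 'Unknown'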

In [70]:
an_df['Demographics'].loc[an_df['Demographics'] == 'Unknown'] = np.NaN #Convert Unknown Demographics to NaN
an_df.head()
Out[70]:
Title English Type Episodes Start_Aired End_Aired Broadcast Producers Licensors Studios ... Duration_Minutes Rating Score Scored_Users Ranked Popularity Members Favorites Season Year
0 Shingeki no Kyojin Attack on Titan TV 25.0 Apr 7, 2013 Sep 29, 2013 Sundays at 0158 (JST) Production I.G, Dentsu, Mainichi Broadcasting ... Funimation Wit Studio ... 24.0 R-17+ 8.531 519803.0 1002.0 1.0 3524109.0 155695.0 Spring 2013
1 Death Note Death Note TV 37.0 Oct 4, 2006 Jun 27, 2007 Wednesdays at 0056 (JST) VAP, Konami, Ashi Productions, Nippon Televisi... VIZ Media Madhouse ... 23.0 R-17+ 8.621 485487.0 732.0 2.0 3504535.0 159701.0 Fall 2006
2 Fullmetal Alchemist: Brotherhood Fullmetal Alchemist Brotherhood TV 64.0 Apr 5, 2009 Jul 4, 2010 Sundays at 1700 (JST) Aniplex, Square Enix, Mainichi Broadcasting Sy... Funimation, Aniplex of America Bones ... 24.0 R-17+ 9.131 900398.0 12.0 3.0 2978455.0 207772.0 Spring 2009
3 One Punch Man One Punch Man TV 12.0 Oct 5, 2015 Dec 21, 2015 Mondays at 0105 (JST) TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba... VIZ Media Madhouse ... 24.0 R-17+ 8.511 19066.0 1112.0 4.0 2879907.0 59651.0 Fall 2015
4 Sword Art Online Sword Art Online TV 25.0 Jul 8, 2012 Dec 23, 2012 Sundays at 0000 (JST) Aniplex, Genco, DAX Production, ASCII Media Wo... Aniplex of America A-1 Pictures ... 23.0 PG-13 7.201 990254.0 29562.0 5.0 2813565.0 64997.0 Summer 2012

5 rows × 24 columns

Next we changed 'Unknown' Demographics to NaN to make it easier to work with

In [71]:
#Reorder Columns
an_df = an_df[['Title', 'English', 'Type', 'Episodes', 'Duration_Minutes', 'Season', 'Year',
              'Broadcast', 'Producers', 'Licensors', 'Studios', 'Source', 'Genres', 'Themes', 'Demographics',
              'Rating', 'Score', 'Scored_Users', 'Ranked', 'Popularity', 'Members', 'Favorites']] 

an_df = an_df.reset_index(drop=True)
an_df.head()
Out[71]:
Title English Type Episodes Duration_Minutes Season Year Broadcast Producers Licensors ... Genres Themes Demographics Rating Score Scored_Users Ranked Popularity Members Favorites
0 Shingeki no Kyojin Attack on Titan TV 25.0 24.0 Spring 2013 Sundays at 0158 (JST) Production I.G, Dentsu, Mainichi Broadcasting ... Funimation ... Action, Drama Gore, Military, Survival Shounen R-17+ 8.531 519803.0 1002.0 1.0 3524109.0 155695.0
1 Death Note Death Note TV 37.0 23.0 Fall 2006 Wednesdays at 0056 (JST) VAP, Konami, Ashi Productions, Nippon Televisi... VIZ Media ... Supernatural, Suspense Psychological Shounen R-17+ 8.621 485487.0 732.0 2.0 3504535.0 159701.0
2 Fullmetal Alchemist: Brotherhood Fullmetal Alchemist Brotherhood TV 64.0 24.0 Spring 2009 Sundays at 1700 (JST) Aniplex, Square Enix, Mainichi Broadcasting Sy... Funimation, Aniplex of America ... Action, Adventure, Drama, Fantasy Military Shounen R-17+ 9.131 900398.0 12.0 3.0 2978455.0 207772.0
3 One Punch Man One Punch Man TV 12.0 24.0 Fall 2015 Mondays at 0105 (JST) TV Tokyo, Bandai Visual, Lantis, Asatsu DK, Ba... VIZ Media ... Action, Comedy Parody, Super Power Seinen R-17+ 8.511 19066.0 1112.0 4.0 2879907.0 59651.0
4 Sword Art Online Sword Art Online TV 25.0 23.0 Summer 2012 Sundays at 0000 (JST) Aniplex, Genco, DAX Production, ASCII Media Wo... Aniplex of America ... Action, Adventure, Fantasy, Romance Love Polygon, Video Game NaN PG-13 7.201 990254.0 29562.0 5.0 2813565.0 64997.0

5 rows × 22 columns

Next we reordered the columns and reset the index to clean up the dataframe

Data Exploration¶

The first thing we did was use df.describe() to gain basic insight into the dataset.

In [72]:
an_df.describe()
Out[72]:
Episodes Duration_Minutes Score Scored_Users Ranked Popularity Members Favorites
count 11832.000000 11832.000000 11832.000000 11832.000000 11832.000000 11832.000000 1.183200e+04 11832.000000
mean 13.749070 28.218729 6.517990 32064.138861 63033.302400 6995.155088 6.902013e+04 810.682387
std 54.844314 26.511146 0.919228 93279.255883 37539.391852 4463.040989 2.050284e+05 5685.765462
min 1.000000 1.000000 1.841000 102.000000 12.000000 1.000000 1.920000e+02 0.000000
25% 1.000000 11.000000 5.921000 491.000000 30539.500000 3046.750000 1.533750e+03 1.000000
50% 3.000000 24.000000 6.531000 2578.500000 61927.000000 6616.000000 6.950500e+03 10.000000
75% 13.000000 28.000000 7.181000 17433.000000 94624.500000 10811.500000 4.067675e+04 92.000000
max 3057.000000 168.000000 9.131000 997243.000000 131202.000000 17677.000000 3.524109e+06 207772.000000

an_df.describe() gives us a basic breakdown of summary statistics for the numerical columns in the dataset. Here are some important insights we've gathered from it:

  • The maximum number of episodes in the dataset is 3057, which is clearly an outlier (the snippet after this list looks up which title it is).
  • The average number of episodes across all anime is around 13 to 14.
  • Most anime run for around 28 minutes per episode.
  • The average score across all anime in the dataset is about 6.5.
  • The lowest score was a 1.84, while the highest was a 9.13.
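As referenced in the first bullet, here is a small lookup sketch (on the same dataframe) to see which title drives that 3057-episode maximum:

# Identify the longest-running series in the cleaned dataframe
longest = an_df.loc[an_df['Episodes'].idxmax(), ['Title', 'Episodes']]
print(longest)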

Afterwards, we decided to create a heatmap of all the columns to see which features interact with each other and how strong the correlations are.

In [73]:
fig, ax = plt.subplots(figsize=(8,8))
an_corr = an_df.corr() #Computes correlation of columns, excluding NaN values
an_heatmap = sb.heatmap(an_corr, ax=ax, annot=True, fmt=".2f", linewidths=.5, vmin=0, vmax=1)
None

This heat map shows whether there is a correlation between Episodes, Score, Scored_Users, Ranked, Popularity, number of Members, and number of Favorites. Green cells indicate little to no correlation between the two columns, while blue cells indicate a stronger correlation. From highest to lowest correlation we have:

  • Scored Users / Members - 88%
  • Members / Favorites - 79%
  • Ranked / Popularity - 73%
  • Scored Users / Favorites - 58%
  • Score / Scored Users - 40%
  • Score / Members - 40%
  • Score / Favorites - 25%

We decided to ignore the Ranked, Scored_Users, Popularity, Members, and Favorites columns because they serve no purpose for our hypothesis.

One Hot Encoding¶

Machine learning models are not well-suited for categorical data stored as String objects, so it is in our best interest to one-hot encode the data. This means creating a new feature and column for each category: if a row contains that category, the value in the new column is set to one; if not, the value is set to zero. For example, a row whose Genres value is "Action, Comedy" gets a 1 in the Action and Comedy columns and a 0 everywhere else. With this, all of our categorical data is easily interpretable by models.

In the anime dataset, the Themes, Genres, and Studios columns hold categories. The oneHot_encode_col() function below takes any of these columns and creates a new dataframe with one-hot encoded data. This dataframe can then be attached to the main an_df to be trained with a model.

Each column holds a list of categories separated by the delimiter ", ". By splitting the String with split(), we get the list of categories for a particular row. First, we collect all unique categories in the column, stored in a dictionary whose keys are the category names and whose values will be lists of length len(an_df). Second, we fill each list with ones and zeroes depending on whether the category appears in that specific row. Finally, we return the new dataframe.

The one-hot encoded dataframe can also double as a count of each category, which was not possible with the original dataset.

In [74]:
# oneHot_encode_col(col)
# This can one-hot encode any column with categories
# creating a dataframe with binary values (1 and 0)
# in this dataset, categories are split with ", "
def oneHot_encode_col(col):
  
  # create a list of all categories
  all_cats = {}
  for cat_info in an_df[col]:
    splt = cat_info.split(", ")
    for cat in splt:
      if (cat in all_cats):
        pass
      else:
        all_cats[cat] = []

  # for testing purposes
  # print(all_cats)

  # if the column has a specific category,
  # it labels that row with a 1. else, it
  # labels that category with a 0.
  for cat_info in an_df[col]:
    splt = cat_info.split(", ")
    for main_cats in all_cats:
      if (main_cats in splt):
        all_cats[main_cats].append(1)
      else:
        all_cats[main_cats].append(0)

  # return a df with one-hot encoded data.
  # this is done to make sure an_df isn't cluttered
  return pd.DataFrame.from_dict(all_cats)
    

# one-hot encode the columns (test)
temp = oneHot_encode_col("Themes")
temp.head()
Out[74]:
Gore Military Survival Psychological Parody Super Power Love Polygon Video Game School Martial Arts ... Educational Medical Showbiz Combat Sports Idols (Female) Performing Arts Racing Magical Sex Shift Idols (Male) Pets
0 1 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 51 columns
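For reference, pandas ships a built-in that produces the same kind of encoding. A sketch (assuming the same ", " delimiter) whose output should broadly match the helper's, apart from column order and the fact that it keeps an_df's index:

# Built-in alternative: split each Themes string on the delimiter and
# one-hot encode every unique category into its own 0/1 column
themes_dummies = an_df['Themes'].str.get_dummies(sep=', ')
themes_dummies.head()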

Originally, we were going to change the Start_Aired and Broadcast columns into datetime objects, but we decided against it. We already have data for Season and Year, which are the main features we are looking for when comparing anime popularity.

At the same time, however, we noticed many Unknown values for Season and Year. We decided to make a graph of the number of anime per year, which should help us decide where to truncate the dataset. Very few anime in the dataset premiered in the earliest years, when anime was not yet that popular worldwide, so we decided to truncate the lower end of the dataset at 1990.

In [75]:
# Try to make a graph of the number of anime per year.
# groupBy to make this a bit easier to view.

angb = an_df.groupby("Year")

year = []
num_anime = []
for yrtup in angb:
  year.append(yrtup[0])
  num_anime.append(len(yrtup[1]))

print(year)
print(num_anime)
['1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', 'Unknown']
[1, 1, 4, 3, 4, 6, 7, 7, 8, 7, 12, 10, 16, 15, 19, 21, 23, 19, 23, 25, 28, 23, 34, 29, 18, 24, 24, 30, 30, 21, 30, 31, 19, 32, 32, 33, 38, 69, 83, 55, 83, 80, 98, 120, 108, 162, 133, 131, 123, 105, 141, 150, 176, 195, 191, 221, 211, 220, 174, 160, 186, 123, 7627]

However, there was a much bigger issue at hand: 7,627 anime are labeled with the year "Unknown". Upon further inspection, some of these "Unknown" anime do have a year in the Start_Aired column, but there is not enough information to be sure that Start_Aired alone certifies the premiere year, and End_Aired may also come into play. So the safest action in this scenario is to replace all "Unknown" values in Year with NaN.
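As a quick sanity check on that claim, the following sketch counts how many "Unknown"-year rows still contain a four-digit year somewhere in Start_Aired (just a check, not used further):

# How many 'Unknown'-year rows have a recoverable year in Start_Aired?
unknown_year = an_df[an_df['Year'] == 'Unknown']
has_year = unknown_year['Start_Aired'].astype(str).str.contains(r'\d{4}')
print(has_year.sum(), "of", len(unknown_year), "rows have a four-digit year in Start_Aired")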

In [76]:
# replace all unknown years with 0
an_df["Year"] = an_df["Year"].apply(lambda x : 0 if str(x) == "Unknown" else x)

# as int now
an_df["Year"] = an_df["Year"].apply(lambda x : int(x))

# truncate the lower end.
an_df = an_df.loc[an_df.Year > 1990]

# replace 0 with nans
an_df["Year"] = an_df["Year"].apply(lambda x : np.nan if x == 0 else x)

# Check size of the dataset
print(len(an_df))
3713

The new size of the dataset is now 3,713 rows, which is more than enough to test our hypothesis and train models.

The Relationship between Season, Year, and Score¶

Let's make a box plot for each season, with Year on the X axis and Score on the Y axis, to see if season and year have an impact on score. We used Seaborn as our main framework for graphing because it produces clean graphs and handles NaN values well.

In [77]:
# groupby season, keeping only the Year and Score columns
angb = an_df.groupby("Season")[["Year", "Score"]]

# Our Figure
fig, axes = plt.subplots(4, figsize=(15,20))

# Winter
sb.boxplot(ax=axes[0], data=angb.get_group("Winter"), x="Year", y="Score")
axes[0].set_title("Score per Year (Winter)")

# Spring
sb.boxplot(ax=axes[1], data=angb.get_group("Spring"), x="Year", y="Score")
axes[1].set_title("Score per Year (Spring)")

# Summer
sb.boxplot(ax=axes[2], data=angb.get_group("Summer"), x="Year", y="Score")
axes[2].set_title("Score per Year (Summer)")

# Fall
sb.boxplot(ax=axes[3], data=angb.get_group("Fall"), x="Year", y="Score")
axes[3].set_title("Score per Year (Fall)")
Out[77]:
Text(0.5, 1.0, 'Score per Year (Fall)')

Since the charts plotted without errors, we can say there are no "Unknown" year values left after truncation and cleaning. We decided against adding a linear regression line due to visual clutter.

Analysis: After a quick look at all four graphs, there doesn't appear to be any clear positive or negative relationship between Score and Year for any season. We can see the boxes grow over time, indicating a higher variance in scores as the years go on. This is most likely due to a larger number of anime being produced because of higher demand and a bigger industry.

The early years of the Summer chart are inconsistent, with tiny boxes at scattered scores. This is probably due to few anime being produced during those summers. We can see this a little in Spring and Winter, but not Fall, which indicates that more anime were produced in the Falls of 1991 to 1997. The 1990s in Japan are known as "The Lost Decade", when the country experienced poor economic performance after failing to deal with the collapse of asset prices (Callen & Ostry, 2003), so this could also be a reason why fewer anime were produced in these years. Regardless, the anime that were produced did well, with scores mostly above 6.0.

However, we did notice a wave-like trend over the years: the median score dips for a couple of years and then rises again. To confirm this, we decided to make a line chart of the average score for each year, per season.

In [78]:
# GroupBy again for consistency.
angb = an_df.groupby("Season")[["Year", "Score"]]

# line plot
fig, axes = plt.subplots(figsize=(15, 5))

# Plot all four Lines
sb.lineplot( data=angb.get_group("Winter"), x="Year", y="Score", ci=None, legend='brief', label="Winter")
sb.lineplot( data=angb.get_group("Spring"), x="Year", y="Score", ci=None, legend='brief', label="Spring")
sb.lineplot( data=angb.get_group("Summer"), x="Year", y="Score", ci=None, legend='brief', label="Summer")
sb.lineplot( data=angb.get_group("Fall"), x="Year", y="Score", ci=None, legend='brief', label="Fall")

# Add Legend and Title
axes.legend()
axes.set_title("Average Score Per Year (All Seasons)")
Out[78]:
Text(0.5, 1.0, 'Average Score Per Year (All Seasons)')

Here we see the wave-like trend again. All four lines start off high from 1990 to around 1996, the average score then appears to hit a low around 2000, peaks again around 2009 to 2010, and falls back down around 2017. The average score rises again from 2017 to 2021, and will most likely continue to rise.

Though the trend is slight, it could be possible to predict an anime's score based on the season and year it will be released in.

The Relationship between Genres¶

Using the one-hot encoding function we made earlier, we can make a correlation matrix to see which combinations of genres tend to appear together.

In [79]:
fig, ax = plt.subplots(figsize=(13,13))

# using oneHot_encode_col to get Genre features.
gen_df = oneHot_encode_col("Genres")

# heatmap with correlations
gen_corr = gen_df.corr() #Computes correlation of columns, excluding NaN values
gen_heatmap = sb.heatmap(gen_corr, ax=ax, annot=True, fmt=".2f", linewidths=.5, vmin=0, vmax=1)

Though there weren't any strong correlations between pairs of genres, many combinations did show a relationship. For example, Horror and Supernatural have a correlation of 27%, Fantasy and Adventure have a correlation of 31%, and Action and Sci-Fi are another notable combination with a correlation of 24%. These are very common and popular genres within the anime community, so it makes sense for them to have the highest correlations. If these genres are related to each other, maybe they also have an impact on an anime's score.
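Rather than reading the 51 × 51 grid by eye, the strongest pairs can also be pulled out programmatically. A sketch using the gen_corr frame computed above:

# List the most correlated genre pairs, ignoring the diagonal and duplicate pairs
upper = gen_corr.where(np.triu(np.ones(gen_corr.shape, dtype=bool), k=1))
top_pairs = upper.stack().sort_values(ascending=False).head(10)
print(top_pairs)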

We were planning on making a heatmap for the "Themes" and "Studios" columns as well, but there were too many features. To combat this, we planned on running Principal Component Analysis (PCA) on the one-hot encoded dataframe, but quickly found out that PCA is not very meaningful on binary values. It also destroys the original features, so we could not see which features contribute to a given pattern.

So, we decided to display various information using bar graphs instead.

In [80]:
genres = oneHot_encode_col("Genres")
genres_df = pd.DataFrame(genres.sum())
genres_df.plot.bar(figsize=(12,12))
None

Here we can see the distribution of Genres and which are most common among them. To no surprise, Comedy and Action are the top two.

In [81]:
themes = oneHot_encode_col("Themes")
themes_df = pd.DataFrame(themes.sum())
themes_df = themes_df.drop("Unknown")
themes_df.plot.bar(figsize=(12,12))
None

Here we can see the distribution of Themes and which are most common. School comes out on top; many anime do take place in a school setting.


In [82]:
licensors = oneHot_encode_col("Licensors")
licensors_df = pd.DataFrame(licensors.sum())
licensors_df = licensors_df.drop('Unknown')
licensors_df.plot.bar(figsize=(12,12))
None

Here we can see the distribution of Licensors and which are most common. Funimation (since merged into Crunchyroll) is one of the most prominent licensors and American entertainment companies in the anime industry.

In [83]:
studios = oneHot_encode_col("Studios")
studios_df = pd.DataFrame(studios.sum())
studios_df = studios_df.rename(columns={0: "Total"})

#Set a cutoff: only studios with at least 40 anime are displayed
studios_df = studios_df[studios_df["Total"] >= 40]

studios_df = studios_df.drop("Unknown")
studios_df.plot.bar(figsize=(12,12))
None

Here we can see the distribution of Studios and which are most prolific. Since there is a tremendous number of studios, we set a cutoff on the number of anime a studio has worked on for it to be displayed and analyzed.

Predicting Anime Scores¶

Now that we have been able to explore and find different relationships and trends in the dataset, we can train a model on data from an_df to predict the score. We want to see if features such as Season, Year, Studio, Episodes, Themes, and Genres have an impact on the score of an anime---which we believe to be the best way to rank shows.

First, we take a slice of only the columns we deemed likely to have an effect on score (Score, Year, Season, Duration_Minutes, and Episodes). Then we one-hot encode Season, Themes, Genres, and Studios using oneHot_encode_col(). As mentioned before, this function returns a new dataframe of one-hot encoded data, so we run it on each respective column and attach the result to the sliced an_df using join().

In [84]:
# grab the Score, Year, Season, Episodes, and Duration columns as a copy.
an_df_slice = an_df.filter(["Score", "Year", "Season", "Episodes", "Duration_Minutes"], axis=1)

# one-hot-encode respective columns
season_df = oneHot_encode_col("Season")
the_df = oneHot_encode_col("Themes")
gen_df = oneHot_encode_col("Genres").drop("Unknown", axis=1)
stu_df = oneHot_encode_col("Studios").drop("Unknown", axis=1)

# join df's made from the oneHot_encode_col() to a slice of an_df
an_df_slice = an_df_slice.join(season_df)
an_df_slice = an_df_slice.join(the_df)
an_df_slice = an_df_slice.join(gen_df)
an_df_slice = an_df_slice.join(stu_df)

an_df_slice
Out[84]:
Score Year Season Episodes Duration_Minutes Spring Fall Summer Winter Gore ... happyproject Monster's Egg Digital Media Lab Beijing Rocen Digital Life Work Spooky graphic Ripromo Pollyanna Graphics Shanghai Animation Film Studio Puzzle Animation Studio Limited
0 8.531 2013 Spring 25.0 24.0 1.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 8.621 2006 Fall 37.0 23.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 9.131 2009 Spring 64.0 24.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 8.511 2015 Fall 12.0 24.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 7.201 2012 Summer 25.0 23.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11495 6.321 2011 Fall 52.0 22.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11524 5.621 2012 Winter 26.0 11.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11531 5.951 2014 Fall 40.0 23.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11596 5.731 2010 Summer 26.0 11.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11632 5.811 2013 Fall 52.0 12.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3713 rows × 460 columns

For data exploration, we changed unknown values to NaN since seaborn and pandas handle those values well. Machine learning models cannot, however, which we found out during some exploratory testing. To combat this, we decided to replace all NaN values with 0s; this is a reasonable choice because the NaNs sit in the one-hot encoded columns, which store binary values. We then check whether any NaN values are left to confirm the replacement worked. After everything was functioning properly (isnull().values.any() returned False), we went on to create the model.

In [85]:
# reset index.
an_df_slice = an_df_slice.reset_index(drop=True)

# replace NaNs with 0
# an_df_slice = an_df_slice.applymap(lambda x : 0 if x == "NaN" else x)
an_df_slice = an_df_slice.replace(np.nan, 0)

# Check if there are any NaN values left. 
print(an_df_slice.isnull().values.any())

an_df_slice
False
Out[85]:
Score Year Season Episodes Duration_Minutes Spring Fall Summer Winter Gore ... happyproject Monster's Egg Digital Media Lab Beijing Rocen Digital Life Work Spooky graphic Ripromo Pollyanna Graphics Shanghai Animation Film Studio Puzzle Animation Studio Limited
0 8.531 2013 Spring 25.0 24.0 1.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 8.621 2006 Fall 37.0 23.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 9.131 2009 Spring 64.0 24.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 8.511 2015 Fall 12.0 24.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 7.201 2012 Summer 25.0 23.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3708 6.321 2011 Fall 52.0 22.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3709 5.621 2012 Winter 26.0 11.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3710 5.951 2014 Fall 40.0 23.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3711 5.731 2010 Summer 26.0 11.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3712 5.811 2013 Fall 52.0 12.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3713 rows × 460 columns
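The NaN rows visible above most likely come from the join itself: oneHot_encode_col() returns frames with a fresh 0..N-1 index, while an_df_slice kept the original (truncated) index, so the two no longer line up row for row. A sketch of an alternative we did not use, which keeps rows aligned by resetting the index before slicing:

# Alternative (not used here): relabel the rows 0..N-1 so they match the
# RangeIndex produced by oneHot_encode_col(); joins then align row for row
an_df_reset = an_df.reset_index(drop=True)    # same rows, same order, new labels
aligned = an_df_reset.filter(["Score", "Year", "Season", "Episodes", "Duration_Minutes"], axis=1)
aligned = aligned.join(oneHot_encode_col("Season"))   # repeat for Themes, Genres, Studios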

We plan on using a Support Vector Machine for regression (SVR), as it is known to handle a large number of features well. For validation, we split the dataset into a training and testing set using sklearn's train_test_split function: 80% for training and 20% for testing.

We had to preprocess the dataframe to extract the X matrix (the independent values to train on) and the y vector (the dependent value to predict). We converted all values to floats and made sure the arrays are the right shape for fitting.

In [86]:
from sklearn import svm
from sklearn.model_selection import train_test_split

# Shuffle so we can train with an equal range
an_df_slice = an_df_slice.sample(frac=1).reset_index(drop=True)

# Split into training and test, 20%
train, test = train_test_split(an_df_slice, test_size=0.2)

# drop Score and Season. Score is our prediction target,
# and Season is already one-hot encoded.
an_np = (train.drop(columns=["Score", "Season"])).to_numpy()

# X is the dataset w/o Score
X = an_np

# y is the "Score" column but as a float.
y = np.array(list((train.Score).map(lambda x: float(x))))

# replace any remaining NaN values in the arrays with 0.
X = np.nan_to_num(X, 0)
y = np.nan_to_num(y, 0)

Next, we fit the model and start the validation process. We made a list of all the correct scores and a list of predicted values for the testing set (test_X). We did a quick visual check of how the model was doing, followed by computing the model's R2 score and Mean Squared Error.

In [87]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# fit the model
regr = svm.SVR()
regr.fit(X, y)

# corr_rate stores the correct results of the testing set.
corr_rate = list(test.Score)
test_X = (test.drop(columns=["Score", "Season"])).to_numpy()

# Predict
svr_pred = regr.predict(test_X)


# Function for a simple visual test.
# visually compares the predicted score 
# and the correct score for 50 samples.
# This function is used for quick tuning
# alongside the Holdout values.
def vis_test(corr, pred):
    for i in range(50):
        print("Correct Score: " + str(corr[i]) + " --- Predicted Score: " + str(pred[i]))

# Do a simple Visual Test
vis_test(corr_rate, list(svr_pred))

print()

# Finding R2 Score
svr_r2 = r2_score(corr_rate, svr_pred)

# Finding Mean Squared Error
svr_mse = mean_squared_error(corr_rate,svr_pred)

print("HOLDOUT STATISTICS:")
print("R2 value for Tuned SVR: " + str(svr_r2))
print("MSE for Tuned SVR: " + str(svr_mse))
Correct Score: 6.071 --- Predicted Score: 6.903709920935294
Correct Score: 6.521 --- Predicted Score: 6.900323173542494
Correct Score: 7.941 --- Predicted Score: 6.9038386366012
Correct Score: 7.521 --- Predicted Score: 6.918020359442526
Correct Score: 7.431 --- Predicted Score: 6.91165382801263
Correct Score: 8.071 --- Predicted Score: 6.914332531923675
Correct Score: 6.711 --- Predicted Score: 6.904118532121238
Correct Score: 6.771 --- Predicted Score: 6.835874515067121
Correct Score: 6.631 --- Predicted Score: 6.912269254829233
Correct Score: 8.391 --- Predicted Score: 6.907923381504749
Correct Score: 7.731 --- Predicted Score: 6.932908166551363
Correct Score: 6.871 --- Predicted Score: 6.899858286837502
Correct Score: 8.041 --- Predicted Score: 6.913595408450492
Correct Score: 6.031 --- Predicted Score: 6.903763368767361
Correct Score: 8.241 --- Predicted Score: 7.228906308359136
Correct Score: 6.271 --- Predicted Score: 6.9245163568983115
Correct Score: 6.301 --- Predicted Score: 6.833293935139024
Correct Score: 7.281 --- Predicted Score: 6.903957314702829
Correct Score: 7.721 --- Predicted Score: 6.901172871120758
Correct Score: 5.781 --- Predicted Score: 6.9085749705999415
Correct Score: 7.371 --- Predicted Score: 6.903031498995937
Correct Score: 7.161 --- Predicted Score: 6.904062872722031
Correct Score: 7.301 --- Predicted Score: 6.900967438496853
Correct Score: 6.801 --- Predicted Score: 6.906548919796699
Correct Score: 6.701 --- Predicted Score: 6.906484608732905
Correct Score: 7.551 --- Predicted Score: 6.903275777387767
Correct Score: 6.561 --- Predicted Score: 6.9029220570945515
Correct Score: 6.921 --- Predicted Score: 6.917806227663953
Correct Score: 6.411 --- Predicted Score: 6.900249676246987
Correct Score: 7.421 --- Predicted Score: 6.912766560001612
Correct Score: 7.201 --- Predicted Score: 6.91645646654022
Correct Score: 7.101 --- Predicted Score: 6.932444661806658
Correct Score: 6.611 --- Predicted Score: 6.906839525225545
Correct Score: 5.621 --- Predicted Score: 6.908472406750698
Correct Score: 8.641 --- Predicted Score: 6.91425640236633
Correct Score: 6.491 --- Predicted Score: 6.932990532885002
Correct Score: 7.211 --- Predicted Score: 6.915065171063747
Correct Score: 7.901 --- Predicted Score: 6.917266768257337
Correct Score: 5.961 --- Predicted Score: 6.903064383714453
Correct Score: 7.401 --- Predicted Score: 6.935785054725833
Correct Score: 6.471 --- Predicted Score: 6.902832636903937
Correct Score: 7.911 --- Predicted Score: 6.900213651996333
Correct Score: 7.911 --- Predicted Score: 6.9326664407277585
Correct Score: 6.381 --- Predicted Score: 6.903048182352836
Correct Score: 6.451 --- Predicted Score: 6.9132765701791605
Correct Score: 7.631 --- Predicted Score: 6.903195782283341
Correct Score: 7.291 --- Predicted Score: 6.913558579935639
Correct Score: 7.461 --- Predicted Score: 6.900037015028906
Correct Score: 7.501 --- Predicted Score: 6.90006383026975
Correct Score: 8.391 --- Predicted Score: 6.903098412384551

HOLDOUT STATISTICS:
R2 value for Default SVR: 0.021243671847256174
MSE for Default SVR: 0.6959740414545201

Model Tuning¶

The results are not what we expected: all predicted values are some variation of 6.9, which is not desirable compared to the correct scores. The R2 value is extremely small, indicating that the independent variables had little effect on the dependent variable---Score. An MSE fairly close to 0 was expected, since most true scores do not deviate that much from 6.9.

The first time we trained the model, we used the default parameters for scikit-learn's SVR: the kernel was a Radial Basis Function (RBF), the C value was 1.0, and gamma was set to the built-in "scale" option. We thought that by tuning these parameters we would get better results, so we researched the different kernels and started testing different hyperparameter values.

The Optimal Parameters¶

We decided to keep using the RBF kernel, because we do not know enough about the dataset to justify the sigmoid or linear kernels. We used the official documentation to figure out what the gamma and C values for the RBF kernel do: https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html#sphx-glr-auto-examples-svm-plot-rbf-parameters-py

The gamma value determines how far the influence of a single training example reaches: low values mean a farther reach (a smoother model), while high values mean a closer reach (a more complex model). The C value behaves as a regularization parameter for a Support Vector Machine: the higher the C value, the smaller the margin of error accepted. Both parameters are typically varied over powers of 10 and are always positive. The documentation mentioned above includes a heat map indicating which gamma and C values perform best, and we used it as a basis for finding values.

We tried many combinations of gamma and C, checking the results visually and with the holdout statistics. We settled on gamma = 1e-8 and C = 1e5, which produced a wider variety of predicted scores and was much better than before.
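Such a search can also be automated rather than done by hand. A sketch using scikit-learn's GridSearchCV on the training arrays defined above (the grid values are illustrative, and the search can take a while with 460 features):

from sklearn.model_selection import GridSearchCV

# Illustrative grid over powers of 10, scored with 3-fold cross-validation
param_grid = {"C": [1e3, 1e4, 1e5], "gamma": [1e-8, 1e-7, 1e-6]}
search = GridSearchCV(svm.SVR(kernel="rbf"), param_grid, cv=3, scoring="r2")
search.fit(X, y)
print(search.best_params_, search.best_score_)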

In [88]:
# try with different Gamma and C values.
regr = svm.SVR(kernel="rbf", gamma=1e-8, C=100000)
regr.fit(X, y)

# Predict
svr_pred_tuned = regr.predict(test_X)

# Do a simple Visual Test
vis_test(corr_rate, list(svr_pred_tuned))

print()

# Finding R2 Score
svr_tuned_r2 = r2_score(corr_rate, svr_pred_tuned)

# Finding Mean Squared Error
svr_tuned_mse = mean_squared_error(corr_rate,svr_pred_tuned)

print("HOLDOUT STATISTICS:")
print("R2 value for Tuned SVR: " + str(svr_tuned_r2))
print("MSE for Tuned SVR: " + str(svr_tuned_mse))
Correct Score: 6.071 --- Predicted Score: 6.68544229447761
Correct Score: 6.521 --- Predicted Score: 7.2366552401541355
Correct Score: 7.941 --- Predicted Score: 6.851807258517141
Correct Score: 7.521 --- Predicted Score: 7.100272413069973
Correct Score: 7.431 --- Predicted Score: 6.772063842992424
Correct Score: 8.071 --- Predicted Score: 7.38554267066101
Correct Score: 6.711 --- Predicted Score: 7.174903203038042
Correct Score: 6.771 --- Predicted Score: 5.9116884681457975
Correct Score: 6.631 --- Predicted Score: 7.4977125343171735
Correct Score: 8.391 --- Predicted Score: 6.939512929023834
Correct Score: 7.731 --- Predicted Score: 7.353505627371845
Correct Score: 6.871 --- Predicted Score: 6.764804856927725
Correct Score: 8.041 --- Predicted Score: 7.52194839416515
Correct Score: 6.031 --- Predicted Score: 6.739033953100801
Correct Score: 8.241 --- Predicted Score: 8.236044843087797
Correct Score: 6.271 --- Predicted Score: 6.876120889111547
Correct Score: 6.301 --- Predicted Score: 5.868222067252276
Correct Score: 7.281 --- Predicted Score: 6.969197905625691
Correct Score: 7.721 --- Predicted Score: 7.284492075955853
Correct Score: 5.781 --- Predicted Score: 6.742033885620543
Correct Score: 7.371 --- Predicted Score: 6.921232744850926
Correct Score: 7.161 --- Predicted Score: 7.133500074126999
Correct Score: 7.301 --- Predicted Score: 7.031879082194422
Correct Score: 6.801 --- Predicted Score: 7.360214137082579
Correct Score: 6.701 --- Predicted Score: 7.419414211693578
Correct Score: 7.551 --- Predicted Score: 7.232384039368839
Correct Score: 6.561 --- Predicted Score: 6.7423963327051695
Correct Score: 6.921 --- Predicted Score: 6.826164517191131
Correct Score: 6.411 --- Predicted Score: 7.090955201961691
Correct Score: 7.421 --- Predicted Score: 6.816820083931447
Correct Score: 7.201 --- Predicted Score: 7.219500560404043
Correct Score: 7.101 --- Predicted Score: 6.798518944622259
Correct Score: 6.611 --- Predicted Score: 6.769052118706526
Correct Score: 5.621 --- Predicted Score: 6.628811691410732
Correct Score: 8.641 --- Predicted Score: 7.280085325608155
Correct Score: 6.491 --- Predicted Score: 6.7734406521954895
Correct Score: 7.211 --- Predicted Score: 7.198638345481953
Correct Score: 7.901 --- Predicted Score: 7.206766419552508
Correct Score: 5.961 --- Predicted Score: 6.9203396876870045
Correct Score: 7.401 --- Predicted Score: 7.157461189420786
Correct Score: 6.471 --- Predicted Score: 6.659019346628696
Correct Score: 7.911 --- Predicted Score: 7.1840553938851315
Correct Score: 7.911 --- Predicted Score: 7.130250904479539
Correct Score: 6.381 --- Predicted Score: 6.92824365593296
Correct Score: 6.451 --- Predicted Score: 7.198611518793399
Correct Score: 7.631 --- Predicted Score: 7.110463375811008
Correct Score: 7.291 --- Predicted Score: 7.363140602308334
Correct Score: 7.461 --- Predicted Score: 6.978684877063273
Correct Score: 7.501 --- Predicted Score: 7.046413738705809
Correct Score: 8.391 --- Predicted Score: 6.97500852102857

HOLDOUT STATISTICS:
R2 value for Tuned SVR: 0.3402352698421268
MSE for Tuned SVR: 0.4691454986796947

Though the predictions are more varied, the R2 value is still low at about 0.34; the independent variables are still not strong predictors of the Score.

We did manage to reduce the model's MSE, which is good for now, but these results are still not what we wanted. To combat this, we tried a different model---Logistic Regression---which is known to perform better when predicting a rank-like value such as Score.

Logistic Regression is a CLASSIFICATION model, unlike SVR, which is a regression model. We thought that if we treated scores as categories, we might get better results. Again, we used the scikit-learn documentation for this model to figure out some of the parameters: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
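Treating a continuous score as a class label requires discretizing it first; in the next cell we simply truncate with astype(int). A sketch of an alternative with explicit bins (the bin edges here are only illustrative):

# Alternative discretization (not used below): bin scores into labeled classes
score_classes = pd.cut(an_df_slice["Score"],
                       bins=[0, 5, 6, 7, 8, 10],
                       labels=["<5", "5-6", "6-7", "7-8", "8+"])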

The solver parameter controls the optimization method. The documentation mentions that newton-cholesky works well with one-hot encoded features, but it is intended for binary classification. We decided to go with "sag" (Stochastic Average Gradient), a gradient-based method, as our solver.

In [89]:
from sklearn.linear_model import LogisticRegression

# convert y into ints so it can work with logistic regression.
# We make the score a "category" for classification.
y = y.astype(int)

clf = LogisticRegression(solver="sag", C=10000, max_iter=1000).fit(X, y)
/opt/conda/lib/python3.9/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
In [90]:
# predict
clf_pred = clf.predict(test_X)

# Do a simple Visual Test
vis_test(corr_rate, list(clf_pred))

print()

# Finding R2 Score
clf_r2 = r2_score(corr_rate, clf_pred)

# Finding Mean Squared Error
clf_mse = mean_squared_error(corr_rate,clf_pred)

print("HOLDOUT STATISTICS:")
print("R2 value for Tuned SVR: " + str(clf_r2))
print("MSE for Tuned SVR: " + str(clf_mse))
Correct Score: 6.071 --- Predicted Score: 7
Correct Score: 6.521 --- Predicted Score: 7
Correct Score: 7.941 --- Predicted Score: 7
Correct Score: 7.521 --- Predicted Score: 7
Correct Score: 7.431 --- Predicted Score: 6
Correct Score: 8.071 --- Predicted Score: 7
Correct Score: 6.711 --- Predicted Score: 7
Correct Score: 6.771 --- Predicted Score: 5
Correct Score: 6.631 --- Predicted Score: 7
Correct Score: 8.391 --- Predicted Score: 6
Correct Score: 7.731 --- Predicted Score: 7
Correct Score: 6.871 --- Predicted Score: 6
Correct Score: 8.041 --- Predicted Score: 7
Correct Score: 6.031 --- Predicted Score: 7
Correct Score: 8.241 --- Predicted Score: 7
Correct Score: 6.271 --- Predicted Score: 7
Correct Score: 6.301 --- Predicted Score: 5
Correct Score: 7.281 --- Predicted Score: 7
Correct Score: 7.721 --- Predicted Score: 7
Correct Score: 5.781 --- Predicted Score: 6
Correct Score: 7.371 --- Predicted Score: 7
Correct Score: 7.161 --- Predicted Score: 7
Correct Score: 7.301 --- Predicted Score: 7
Correct Score: 6.801 --- Predicted Score: 7
Correct Score: 6.701 --- Predicted Score: 7
Correct Score: 7.551 --- Predicted Score: 7
Correct Score: 6.561 --- Predicted Score: 7
Correct Score: 6.921 --- Predicted Score: 7
Correct Score: 6.411 --- Predicted Score: 7
Correct Score: 7.421 --- Predicted Score: 7
Correct Score: 7.201 --- Predicted Score: 7
Correct Score: 7.101 --- Predicted Score: 7
Correct Score: 6.611 --- Predicted Score: 7
Correct Score: 5.621 --- Predicted Score: 6
Correct Score: 8.641 --- Predicted Score: 7
Correct Score: 6.491 --- Predicted Score: 7
Correct Score: 7.211 --- Predicted Score: 7
Correct Score: 7.901 --- Predicted Score: 7
Correct Score: 5.961 --- Predicted Score: 7
Correct Score: 7.401 --- Predicted Score: 7
Correct Score: 6.471 --- Predicted Score: 7
Correct Score: 7.911 --- Predicted Score: 7
Correct Score: 7.911 --- Predicted Score: 7
Correct Score: 6.381 --- Predicted Score: 7
Correct Score: 6.451 --- Predicted Score: 7
Correct Score: 7.631 --- Predicted Score: 7
Correct Score: 7.291 --- Predicted Score: 7
Correct Score: 7.461 --- Predicted Score: 7
Correct Score: 7.501 --- Predicted Score: 7
Correct Score: 8.391 --- Predicted Score: 7

HOLDOUT STATISTICS:
R2 value for Logistic Regression: 0.07196069789792736
MSE for Logistic Regression: 0.6599101790040377

Similar to SVR, we experimented with the parameters to see if we could get better results. One of the parameters we played with was C; it has a similar regularization effect to the C value in SVR, so we set it to a similarly large value (10,000). To combat a "coef_ did not converge" warning, we tried raising max_iter until the warning was gone, but this took a lot of time and RAM, so we settled for max_iter = 1000.

However, it was to no avail: the model produced lackluster predictions. We were expecting predicted fives and eights to match the spread of the correct scores, but the R2 score is only about 0.07, signifying almost no relationship between the independent and dependent variables, and the MSE increased compared to the tuned SVR.
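One avenue we did not try: gradient-based solvers such as sag and saga usually converge much faster when the features are standardized first. A sketch, assuming the same X, y, and imports as above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale Year, Episodes, and Duration onto a comparable range before fitting;
# this typically helps convergence without a huge max_iter
clf_scaled = make_pipeline(StandardScaler(),
                           LogisticRegression(solver="sag", C=10000, max_iter=1000))
clf_scaled.fit(X, y)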

Conclusion¶

Since we were not able to get a good R2 score when predicting an anime's score from its year of release, season, number of episodes, studio, genres, and themes, we can say that these independent variables do not have a definite effect on the score of an anime. Both SVR and Logistic Regression produced predictions in the general range of the actual values, but with no definite trend.

We can see some relationships between Score and the variables mentioned above in the graphs we made, but when all variables are put together, they have little impact on score. There might be some other variable, not in this dataset, that has a stronger relationship with score; future work could look for such a variable, possibly in another dataset. Deeper testing with other models may also be required, as we only used the ones that made sense for our dataset.

In the end, it is safe to say that the surface-level features of an anime do not determine how it performs amongst fans. Instead, the community's perception of the animation, the soundtrack, the plot, and the overall quality is the driving factor in how good an anime is considered to be.

The Data Science Pipeline¶

To create this project, we traversed through the entire data science pipeline. Here is a summary of what exactly we did:

  1. Data collection/curation: We borrowed the anime dataset from Kaggle. We decided this was the best dataset to answer our hypothesis because it was curated from My Anime List, the premier anime ranking community.

  2. Data management/representation: There were many unnecessary values in the dataset, so we carefully cleaned unwanted data and handled the "Unknown" cases. We also implemented a one-hot encoding helper for columns with categorical data.

  3. Exploratory data analysis: To explore our dataset, we made a variety of graphs to see relationships. When we found a trend, we would continue to explore that data with more visuals. We can see this with the Score/Season/Year box plots.

  4. Hypothesis testing: We tested our hypothesis using the graphs we made earlier and finally with the machine learning models. When a model did not work the way we wanted, we adjusted hyperparameters and tried other model types.

  5. Communication of insights attained: Throughout the notebook, we included prose explaining our findings and why we took certain steps.

Works Cited¶

Callen, T., & Ostry, J. D. (2003). Japan's Lost Decade: Policies for Economic Revival. International Monetary Fund. https://www.imf.org/external/pubs/nft/2003/japan/index.htm