In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV

We have chosen the Pokémon dataset from https://www.kaggle.com/datasets/mariotormo/complete-pokemon-dataset-updated-090420?select=pokedex_%28Update_04.21%29.csv . Our objective is to classify Pokémon types based on the stats and find the model and hyperparameters with the best performance. There are multiple ways of going about this. But first, let's explore the dataset.

Exploring the dataset¶

In [ ]:
# import the dataset
df = pd.read_csv('data/pokedex.csv')
# Print head
df.head()
Out[ ]:
Unnamed: 0 pokedex_number name german_name japanese_name generation status species type_number type_1 ... against_ground against_flying against_psychic against_bug against_rock against_ghost against_dragon against_dark against_steel against_fairy
0 0 1 Bulbasaur Bisasam フシギダネ (Fushigidane) 1 Normal Seed Pokémon 2 Grass ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
1 1 2 Ivysaur Bisaknosp フシギソウ (Fushigisou) 1 Normal Seed Pokémon 2 Grass ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
2 2 3 Venusaur Bisaflor フシギバナ (Fushigibana) 1 Normal Seed Pokémon 2 Grass ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
3 3 3 Mega Venusaur Bisaflor フシギバナ (Fushigibana) 1 Normal Seed Pokémon 2 Grass ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
4 4 4 Charmander Glumanda ヒトカゲ (Hitokage) 1 Normal Lizard Pokémon 1 Fire ... 2.0 1.0 1.0 0.5 2.0 1.0 1.0 1.0 0.5 0.5

5 rows × 51 columns

In [ ]:
# Print info such as data types and number of non-null values
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045 entries, 0 to 1044
Data columns (total 51 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1045 non-null   int64  
 1   pokedex_number    1045 non-null   int64  
 2   name              1045 non-null   object 
 3   german_name       1045 non-null   object 
 4   japanese_name     1045 non-null   object 
 5   generation        1045 non-null   int64  
 6   status            1045 non-null   object 
 7   species           1045 non-null   object 
 8   type_number       1045 non-null   int64  
 9   type_1            1045 non-null   object 
 10  type_2            553 non-null    object 
 11  height_m          1045 non-null   float64
 12  weight_kg         1044 non-null   float64
 13  abilities_number  1045 non-null   int64  
 14  ability_1         1042 non-null   object 
 15  ability_2         516 non-null    object 
 16  ability_hidden    813 non-null    object 
 17  total_points      1045 non-null   int64  
 18  hp                1045 non-null   int64  
 19  attack            1045 non-null   int64  
 20  defense           1045 non-null   int64  
 21  sp_attack         1045 non-null   int64  
 22  sp_defense        1045 non-null   int64  
 23  speed             1045 non-null   int64  
 24  catch_rate        1027 non-null   float64
 25  base_friendship   930 non-null    float64
 26  base_experience   925 non-null    float64
 27  growth_rate       1044 non-null   object 
 28  egg_type_number   1045 non-null   int64  
 29  egg_type_1        1042 non-null   object 
 30  egg_type_2        285 non-null    object 
 31  percentage_male   872 non-null    float64
 32  egg_cycles        1044 non-null   float64
 33  against_normal    1045 non-null   float64
 34  against_fire      1045 non-null   float64
 35  against_water     1045 non-null   float64
 36  against_electric  1045 non-null   float64
 37  against_grass     1045 non-null   float64
 38  against_ice       1045 non-null   float64
 39  against_fight     1045 non-null   float64
 40  against_poison    1045 non-null   float64
 41  against_ground    1045 non-null   float64
 42  against_flying    1045 non-null   float64
 43  against_psychic   1045 non-null   float64
 44  against_bug       1045 non-null   float64
 45  against_rock      1045 non-null   float64
 46  against_ghost     1045 non-null   float64
 47  against_dragon    1045 non-null   float64
 48  against_dark      1045 non-null   float64
 49  against_steel     1045 non-null   float64
 50  against_fairy     1045 non-null   float64
dtypes: float64(25), int64(13), object(13)
memory usage: 416.5+ KB
In [ ]:
# Print summary statistics of numeric types
df.describe()
Out[ ]:
Unnamed: 0 pokedex_number generation type_number height_m weight_kg abilities_number total_points hp attack ... against_ground against_flying against_psychic against_bug against_rock against_ghost against_dragon against_dark against_steel against_fairy
count 1045.000000 1045.000000 1045.000000 1045.000000 1045.000000 1044.000000 1045.000000 1045.000000 1045.000000 1045.000000 ... 1045.000000 1045.000000 1045.000000 1045.000000 1045.000000 1045.000000 1045.000000 1045.000000 1045.000000 1045.000000
mean 522.000000 440.769378 4.098565 1.529187 1.374067 71.216571 2.268900 439.353110 70.067943 80.476555 ... 1.082297 1.168900 0.977273 0.998086 1.238278 1.018660 0.977033 1.071053 0.981579 1.091148
std 301.809819 262.517231 2.272788 0.499386 3.353349 132.259911 0.803154 121.992897 26.671411 32.432728 ... 0.782683 0.592145 0.501934 0.610411 0.696560 0.568056 0.375812 0.465178 0.501753 0.536285
min 0.000000 1.000000 1.000000 1.000000 0.100000 0.100000 0.000000 175.000000 1.000000 5.000000 ... 0.000000 0.250000 0.000000 0.000000 0.250000 0.000000 0.000000 0.250000 0.000000 0.000000
25% 261.000000 212.000000 2.000000 1.000000 0.600000 9.000000 2.000000 330.000000 50.000000 55.000000 ... 0.500000 1.000000 1.000000 0.500000 1.000000 1.000000 1.000000 1.000000 0.500000 1.000000
50% 522.000000 436.000000 4.000000 2.000000 1.000000 29.500000 2.000000 458.000000 68.000000 77.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
75% 783.000000 670.000000 6.000000 2.000000 1.600000 70.500000 3.000000 515.000000 82.000000 100.000000 ... 1.500000 1.000000 1.000000 1.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000
max 1044.000000 898.000000 8.000000 2.000000 100.000000 999.900000 3.000000 1125.000000 255.000000 190.000000 ... 4.000000 4.000000 4.000000 4.000000 4.000000 4.000000 2.000000 4.000000 4.000000 4.000000

8 rows × 38 columns

In [ ]:
# show all columns with non-numeric type
object_columns = df.select_dtypes(include=['object']).columns

# Print the selected columns
print(object_columns)
Index(['name', 'german_name', 'japanese_name', 'status', 'species', 'type_1',
       'type_2', 'ability_1', 'ability_2', 'ability_hidden', 'growth_rate',
       'egg_type_1', 'egg_type_2'],
      dtype='object')

We'll need to drop some of those columns and encode others to make classification possible.

Common preprocessing steps¶

Let's start by removing the columns that contain data which only serve to uniquely identify each Pokémon in the dataset. This includes the name of each Pokémon in 3 different languages, each Pokémon's Pokédex number and an unnamed column with the index of each row.

In [ ]:
# Drop the names and index numbers
df.drop(['name', 'Unnamed: 0', 'german_name', 'japanese_name', 'pokedex_number'], axis=1, inplace=True)
# Print the head of the dataframe
df.head()
Out[ ]:
generation status species type_number type_1 type_2 height_m weight_kg abilities_number ability_1 ... against_ground against_flying against_psychic against_bug against_rock against_ghost against_dragon against_dark against_steel against_fairy
0 1 Normal Seed Pokémon 2 Grass Poison 0.7 6.9 2 Overgrow ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
1 1 Normal Seed Pokémon 2 Grass Poison 1.0 13.0 2 Overgrow ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
2 1 Normal Seed Pokémon 2 Grass Poison 2.0 100.0 2 Overgrow ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
3 1 Normal Seed Pokémon 2 Grass Poison 2.4 155.5 1 Thick Fat ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
4 1 Normal Lizard Pokémon 1 Fire NaN 0.6 8.5 2 Blaze ... 2.0 1.0 1.0 0.5 2.0 1.0 1.0 1.0 0.5 0.5

5 rows × 46 columns

These aren't all the columns we'll need to remove, however. Next, while some Pokémon do share the same species, the number of Pokémon that share any given species is usually under 10. (The 22 Paradox Pokémon aren't included in this dataset. This makes Mouse Pokémon, of which there are 12 in the dataset, due to the inclusion of alternate forms that some Pokémon have, the only species shared by more than 10 Pokémon.) Most commonly, Pokémon that share the same species, are part of the same evolution family.

What this means is that species are sparsely represented in the dataset. Species are also often only ever tied to one specific type combination. The former would make it annoying to deal with, and the latter goes against the spirit of this project by being far too directly correlated. The species is also more a descriptor and not really a stat. As such, let's remove the species column.

In [ ]:
# Drop the species
df.drop(['species'], axis=1, inplace=True)
# Print the head of the dataframe
df.head()
Out[ ]:
generation status type_number type_1 type_2 height_m weight_kg abilities_number ability_1 ability_2 ... against_ground against_flying against_psychic against_bug against_rock against_ghost against_dragon against_dark against_steel against_fairy
0 1 Normal 2 Grass Poison 0.7 6.9 2 Overgrow NaN ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
1 1 Normal 2 Grass Poison 1.0 13.0 2 Overgrow NaN ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
2 1 Normal 2 Grass Poison 2.0 100.0 2 Overgrow NaN ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
3 1 Normal 2 Grass Poison 2.4 155.5 1 Thick Fat NaN ... 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5
4 1 Normal 1 Fire NaN 0.6 8.5 2 Blaze NaN ... 2.0 1.0 1.0 0.5 2.0 1.0 1.0 1.0 0.5 0.5

5 rows × 45 columns

Now we're going to remove all the columns that give a direct indication of the types. All the columns named against_[insert type here] indicate type (dis)advantage. Type advantage is the direct result of a Pokémon's type(s). (See Type Chart https://pokemondb.net/type) Again, these go against the spirit of this project by being far too direct.

We'll also remove the column type_number, which indicates how many types the Pokémon has (1 or 2), for the same reason.

In [ ]:
#Verify that the last 18 columns are the against_? data
print(df.columns[-18:])
# Drop the species
df.drop(df.columns[-18:], axis=1, inplace=True)
df.drop('type_number', axis=1, inplace=True)
# Print the head of the dataframe
df.head()
Index(['against_normal', 'against_fire', 'against_water', 'against_electric',
       'against_grass', 'against_ice', 'against_fight', 'against_poison',
       'against_ground', 'against_flying', 'against_psychic', 'against_bug',
       'against_rock', 'against_ghost', 'against_dragon', 'against_dark',
       'against_steel', 'against_fairy'],
      dtype='object')
Out[ ]:
generation status type_1 type_2 height_m weight_kg abilities_number ability_1 ability_2 ability_hidden ... speed catch_rate base_friendship base_experience growth_rate egg_type_number egg_type_1 egg_type_2 percentage_male egg_cycles
0 1 Normal Grass Poison 0.7 6.9 2 Overgrow NaN Chlorophyll ... 45 45.0 70.0 64.0 Medium Slow 2 Grass Monster 87.5 20.0
1 1 Normal Grass Poison 1.0 13.0 2 Overgrow NaN Chlorophyll ... 60 45.0 70.0 142.0 Medium Slow 2 Grass Monster 87.5 20.0
2 1 Normal Grass Poison 2.0 100.0 2 Overgrow NaN Chlorophyll ... 80 45.0 70.0 236.0 Medium Slow 2 Grass Monster 87.5 20.0
3 1 Normal Grass Poison 2.4 155.5 1 Thick Fat NaN NaN ... 80 45.0 70.0 281.0 Medium Slow 2 Grass Monster 87.5 20.0
4 1 Normal Fire NaN 0.6 8.5 2 Blaze NaN Solar Power ... 65 45.0 70.0 62.0 Medium Slow 2 Dragon Monster 87.5 20.0

5 rows × 26 columns

Now let's check for any missing values in the remaining dataframe.

In [ ]:
# Check for missing values
print(df.isnull().sum())
generation            0
status                0
type_1                0
type_2              492
height_m              0
weight_kg             1
abilities_number      0
ability_1             3
ability_2           529
ability_hidden      232
total_points          0
hp                    0
attack                0
defense               0
sp_attack             0
sp_defense            0
speed                 0
catch_rate           18
base_friendship     115
base_experience     120
growth_rate           1
egg_type_number       0
egg_type_1            3
egg_type_2          760
percentage_male     173
egg_cycles            1
dtype: int64

At first there appear to be a lot of missing values. However, for most missing values, their absence is actually correct. For example, plenty of Pokémon don't have a second or hidden ability.

That said, there are plenty of actual missing values, so let's deal with those. Let's start by going over what we're dealing with.

In [ ]:
print("Missing weight_kg: ",  np.where(df['weight_kg'].isnull()))
Missing weight_kg:  (array([1033], dtype=int64),)

Taking a quick look in the original dataset reveals this corresponds with Eternatus Eternamax.

In [ ]:
print("Missing ability_1: ",  np.where(df['ability_1'].isnull()))
Missing ability_1:  (array([  33,  172, 1033], dtype=int64),)

Taking a quick look in the original dataset reveals these correspond with Partner Pikachu, Partner Eevee and Eternatus Eternamax.

The columns catch_rate, base_friendship, base_experience, percentage_male just have too many missing values for it to be reasonable to go through the effort of salvaging them. As such, we'll be removing these columns from the dataframe.

In [ ]:
# Drop catch_rate, base_friendship and base_experience
df.drop(['catch_rate', 'base_friendship', 'base_experience', 'percentage_male'], axis=1, inplace=True)

# Print the head of the dataframe
df.head()
Out[ ]:
generation status type_1 type_2 height_m weight_kg abilities_number ability_1 ability_2 ability_hidden ... attack defense sp_attack sp_defense speed growth_rate egg_type_number egg_type_1 egg_type_2 egg_cycles
0 1 Normal Grass Poison 0.7 6.9 2 Overgrow NaN Chlorophyll ... 49 49 65 65 45 Medium Slow 2 Grass Monster 20.0
1 1 Normal Grass Poison 1.0 13.0 2 Overgrow NaN Chlorophyll ... 62 63 80 80 60 Medium Slow 2 Grass Monster 20.0
2 1 Normal Grass Poison 2.0 100.0 2 Overgrow NaN Chlorophyll ... 82 83 100 100 80 Medium Slow 2 Grass Monster 20.0
3 1 Normal Grass Poison 2.4 155.5 1 Thick Fat NaN NaN ... 100 123 122 120 80 Medium Slow 2 Grass Monster 20.0
4 1 Normal Fire NaN 0.6 8.5 2 Blaze NaN Solar Power ... 52 43 60 50 65 Medium Slow 2 Dragon Monster 20.0

5 rows × 22 columns

In [ ]:
print("Missing growth_rate: ",  np.where(df['growth_rate'].isnull()))
Missing growth_rate:  (array([658], dtype=int64),)

Taking a quick look in the original dataset reveals this corresponds with Galarian Darmanitan Zen Mode.

In [ ]:
print("Missing egg_type_1: ",  np.where(df['egg_type_1'].isnull()))
Missing egg_type_1:  (array([ 33, 172, 658], dtype=int64),)

Taking a quick look in the original dataset reveals these correspond with Partner Pikachu, Partner Eevee and Galarian Darmanitan Zen Mode.

It's worth noting that egg_type_number actually lists these as having 0 egg groups, which is incorrect. (It might have been based the total number of values across egg_type_1 and egg_type_2, thereby erroneously taking the missing values into account.)

This means egg_type_number, egg_type_1 and egg_type_2 will need to be corrected.

In [ ]:
print(np.where(df['egg_cycles'].isnull()))
(array([658], dtype=int64),)

Taking a quick look in the original dataset reveals this corresponds with Galarian Darmanitan Zen Mode.

Now that we know what we're dealing with, let's handle these missing values.

In total, there are 4 Pokémon (rows) in the dataset that are still actually missing values: Partner Pikachu, Partner Eevee, Galarian Darmanitan Zen Mode and Eternatus Eternamax.

For Partner Pikachu (row 33) and Partner Eevee (row 172), we need to fix the following: ability_1, egg_type_number, egg_type_1 and egg_type_2

In [ ]:
#Partner Pikachu
print("Partner Pikachu ability_1 : ", df['ability_1'][33]) #to show Partner Pikachu's ability_1 is nan
df['ability_1'][33] = df['ability_1'][32]                  #Parner Pikachu's ability_1 is the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu ability_1 : ", df['ability_1'][33]) #to show Partner Pikachu's ability_1 now has the correct value

print("Partner Pikachu egg_type_number : ", df['egg_type_number'][33]) #nan
df['egg_type_number'][33] = df['egg_type_number'][32]                  #it's the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu egg_type_number : ", df['egg_type_number'][33]) #now has the correct value

print("Partner Pikachu egg_type_1 : ", df['egg_type_1'][33]) #nan
df['egg_type_1'][33] = df['egg_type_1'][32]                  #it's the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu egg_type_1 : ", df['egg_type_1'][33]) #now has the correct value

#(Partner) Pikachu has two egg types
print("Partner Pikachu egg_type_2 : ", df['egg_type_2'][33]) #nan
df['egg_type_2'][33] = df['egg_type_2'][32]                  #it's the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu egg_type_2 : ", df['egg_type_2'][33]) #now has the correct value
Partner Pikachu ability_1 :  nan
(Updated) Partner Pikachu ability_1 :  Static
Partner Pikachu egg_type_number :  0
(Updated) Partner Pikachu egg_type_number :  2
Partner Pikachu egg_type_1 :  nan
(Updated) Partner Pikachu egg_type_1 :  Fairy
Partner Pikachu egg_type_2 :  nan
(Updated) Partner Pikachu egg_type_2 :  Field
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:3: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['ability_1'][33] = df['ability_1'][32]                  #Parner Pikachu's ability_1 is the same as regular Pikachu (row 32)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ability_1'][33] = df['ability_1'][32]                  #Parner Pikachu's ability_1 is the same as regular Pikachu (row 32)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:7: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_number'][33] = df['egg_type_number'][32]                  #it's the same as regular Pikachu (row 32)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_number'][33] = df['egg_type_number'][32]                  #it's the same as regular Pikachu (row 32)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:11: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_1'][33] = df['egg_type_1'][32]                  #it's the same as regular Pikachu (row 32)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_1'][33] = df['egg_type_1'][32]                  #it's the same as regular Pikachu (row 32)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:16: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_2'][33] = df['egg_type_2'][32]                  #it's the same as regular Pikachu (row 32)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_2'][33] = df['egg_type_2'][32]                  #it's the same as regular Pikachu (row 32)
In [ ]:
#Partner Eevee
print("Partner Eevee ability_1 : ", df['ability_1'][172])           #to show Partner Eevee's ability_1 is nan
df['ability_1'][172] = df['ability_1'][171]                         #Parner Eevee's ability_1 is the same as regular Eevee (row 171)
print("(Updated) Partner Eevee ability_1 : ", df['ability_1'][172]) #to show Partner Pikachu's ability_1 now has the correct value

print("Partner Eevee egg_type_number : ", df['egg_type_number'][172])           #nan
df['egg_type_number'][172] = df['egg_type_number'][171]                         #it's the same as regular Eevee (row 171)
print("(Updated) Partner Eevee egg_type_number : ", df['egg_type_number'][172]) #now has the correct value

print("Partner Eevee egg_type_1 : ", df['egg_type_1'][172])           #nan
df['egg_type_1'][172] = df['egg_type_1'][171]                         #it's the same as regular Eevee (row 171)
print("(Updated) Partner Eevee egg_type_1 : ", df['egg_type_1'][172]) #now has the correct value

#(Partner) Eevee only has one egg type, so this won't change anything
print("Partner Eevee egg_type_2 : ", df['egg_type_2'][172])           #nan
df['egg_type_2'][172] = df['egg_type_2'][171]                         #it's the same as regular Eevee (row 171)
print("(Updated) Partner Eevee egg_type_2 : ", df['egg_type_2'][172]) #now has the correct value, still nan
Partner Eevee ability_1 :  nan
(Updated) Partner Eevee ability_1 :  Run Away
Partner Eevee egg_type_number :  0
(Updated) Partner Eevee egg_type_number :  1
Partner Eevee egg_type_1 :  nan
(Updated) Partner Eevee egg_type_1 :  Field
Partner Eevee egg_type_2 :  nan
(Updated) Partner Eevee egg_type_2 :  nan
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:3: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['ability_1'][172] = df['ability_1'][171]                         #Parner Eevee's ability_1 is the same as regular Eevee (row 171)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ability_1'][172] = df['ability_1'][171]                         #Parner Eevee's ability_1 is the same as regular Eevee (row 171)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:7: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_number'][172] = df['egg_type_number'][171]                         #it's the same as regular Eevee (row 171)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_number'][172] = df['egg_type_number'][171]                         #it's the same as regular Eevee (row 171)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:11: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_1'][172] = df['egg_type_1'][171]                         #it's the same as regular Eevee (row 171)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_1'][172] = df['egg_type_1'][171]                         #it's the same as regular Eevee (row 171)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:16: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_2'][172] = df['egg_type_2'][171]                         #it's the same as regular Eevee (row 171)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_2'][172] = df['egg_type_2'][171]                         #it's the same as regular Eevee (row 171)

For Galarian Darmanitan Zen Mode (row 658), we need to fix the following: growth_rate, egg_type_number, egg_type_1, egg_type_2, egg_cycles

In [ ]:
#Galarian Darmanitan Zen Mode
print("Galarian Darmanitan Zen Mode growth_rate : ", df['growth_rate'][658]) #to show Galarian Darmanitan Zen Mode's growth_rate is nan
df['growth_rate'][658] = df['growth_rate'][656]                              #Galarian Darmanitan Zen Mode's growth_rate is the same as every 
                                                                             #other form of Darmanitan (rows 655, 656, 657), Medium Slow
print("(Updated) Galarian Darmanitan Zen Mode growth_rate : ", df['growth_rate'][658]) #to show Galarian Darmanitan Zen Mode's growth_rate 
                                                                                       #now has the correct value

print("Galarian Darmanitan Zen Mode egg_type_number : ", df['egg_type_number'][658])           #nan
df['egg_type_number'][658] = df['egg_type_number'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_type_number : ", df['egg_type_number'][658]) #now has the correct value

print("Galarian Darmanitan Zen Mode egg_type_1 : ", df['egg_type_1'][658])       #nan
df['egg_type_1'][658] = df['egg_type_1'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_type_1 : ", df['egg_type_1'][658]) #now has the correct value

#(Galarian) Darmanitan (Zen Mode) only has one egg type, so this won't change anything
print("Galarian Darmanitan Zen Mode egg_type_2 : ", df['egg_type_2'][658])        #nan
df['egg_type_2'][658] = df['egg_type_2'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_type_2 : ", df['egg_type_2'][658]) #now has the correct value, still nan

print("Galarian Darmanitan Zen Mode egg_cycles : ", df['egg_cycles'][658]) #nan
df['egg_cycles'][658] = df['egg_cycles'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_cycles : ", df['egg_cycles'][658]) #now has the correct value
Galarian Darmanitan Zen Mode growth_rate :  nan
(Updated) Galarian Darmanitan Zen Mode growth_rate :  Medium Slow
Galarian Darmanitan Zen Mode egg_type_number :  0
(Updated) Galarian Darmanitan Zen Mode egg_type_number :  1
Galarian Darmanitan Zen Mode egg_type_1 :  nan
(Updated) Galarian Darmanitan Zen Mode egg_type_1 :  Field
Galarian Darmanitan Zen Mode egg_type_2 :  nan
(Updated) Galarian Darmanitan Zen Mode egg_type_2 :  nan
Galarian Darmanitan Zen Mode egg_cycles :  nan
(Updated) Galarian Darmanitan Zen Mode egg_cycles :  20.0
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:3: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['growth_rate'][658] = df['growth_rate'][656]                              #Galarian Darmanitan Zen Mode's growth_rate is the same as every
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['growth_rate'][658] = df['growth_rate'][656]                              #Galarian Darmanitan Zen Mode's growth_rate is the same as every
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:9: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_number'][658] = df['egg_type_number'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_number'][658] = df['egg_type_number'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:13: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_1'][658] = df['egg_type_1'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_1'][658] = df['egg_type_1'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:18: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_type_2'][658] = df['egg_type_2'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_type_2'][658] = df['egg_type_2'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:22: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['egg_cycles'][658] = df['egg_cycles'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:22: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['egg_cycles'][658] = df['egg_cycles'][656]  #it's the same as any other form of Darmanitan (rows 655, 656, 657)

Lastly, there's Eternatus Eternamax (row 1033). For this one, we'd need to fix weight_kg and ability_1.

Now, ability_1 wouldn't be hard to fix, but it turns out that the missing value for weight_kg is actually correct. It's a trait of Gigantamax Pokémon. (Eternamax is a type of Gigantamax.) However, for some reason, Eternatus Eternamax is the only Gigantamax form that's in the dataset.

Because it's the only one in the dataset and for simplicity's sake, we're just going to remove row 1033 from the dataframe.

In [ ]:
df.drop(index=1033, axis=0, inplace=True)

Now let's check the number of missing values again.

In [ ]:
# Check for missing values
print(df.isnull().sum())
generation            0
status                0
type_1                0
type_2              492
height_m              0
weight_kg             0
abilities_number      0
ability_1             0
ability_2           528
ability_hidden      231
total_points          0
hp                    0
attack                0
defense               0
sp_attack             0
sp_defense            0
speed                 0
growth_rate           0
egg_type_number       0
egg_type_1            0
egg_type_2          758
egg_cycles            0
dtype: int64
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1044 entries, 0 to 1044
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   generation        1044 non-null   int64  
 1   status            1044 non-null   object 
 2   type_1            1044 non-null   object 
 3   type_2            552 non-null    object 
 4   height_m          1044 non-null   float64
 5   weight_kg         1044 non-null   float64
 6   abilities_number  1044 non-null   int64  
 7   ability_1         1044 non-null   object 
 8   ability_2         516 non-null    object 
 9   ability_hidden    813 non-null    object 
 10  total_points      1044 non-null   int64  
 11  hp                1044 non-null   int64  
 12  attack            1044 non-null   int64  
 13  defense           1044 non-null   int64  
 14  sp_attack         1044 non-null   int64  
 15  sp_defense        1044 non-null   int64  
 16  speed             1044 non-null   int64  
 17  growth_rate       1044 non-null   object 
 18  egg_type_number   1044 non-null   int64  
 19  egg_type_1        1044 non-null   object 
 20  egg_type_2        286 non-null    object 
 21  egg_cycles        1044 non-null   float64
dtypes: float64(3), int64(10), object(9)
memory usage: 187.6+ KB

We'll need to fill up these missing values

In [ ]:
# Plot the correlation matrix
sns.heatmap(df.select_dtypes(include=[np.number, bool]).corr(), square=True, cmap='RdYlGn');
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
No description has been provided for this image

In addition to the generation, the number of egg types also has little correlation with the other stats.

The number of abilities ranges between having no correlation and having a somewhat negative correlation with the other stats. (Interesting. Could this be because of balancing? Some other reason?)

The stat total once again correlates quite highly with the main stats. No surprises there. The lack of correlation between speed and defense is still visible.

There's also a lack of correlation between speed and weight. This goes against what one might initially expect (that heavier Pokémon are slower). However, Pokémon come in a wide range of sizes and vary in the materials they're composed of, so perhaps it shouldn't be surprising that speed and weight aren't correlated.

Dropping columns we don't need

We will handle the missing values in type_2, ability_2, ability_hidden, and egg_type_2 by filling it up by a 'None' value. It is normal that some pokémon don't have a second or hidden ability or (egg) type.

In [ ]:
# Handling missing values in 'Type 2' column
df['type_2'].fillna('None', inplace=True)
df['ability_2'].fillna('None', inplace=True)
df['ability_hidden'].fillna('None', inplace=True)
df['egg_type_2'].fillna('None', inplace=True)

print(df.isnull().sum())

df.head()
generation          0
status              0
type_1              0
type_2              0
height_m            0
weight_kg           0
abilities_number    0
ability_1           0
ability_2           0
ability_hidden      0
total_points        0
hp                  0
attack              0
defense             0
sp_attack           0
sp_defense          0
speed               0
growth_rate         0
egg_type_number     0
egg_type_1          0
egg_type_2          0
egg_cycles          0
dtype: int64
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['type_2'].fillna('None', inplace=True)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ability_2'].fillna('None', inplace=True)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ability_hidden'].fillna('None', inplace=True)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['egg_type_2'].fillna('None', inplace=True)
Out[ ]:
generation status type_1 type_2 height_m weight_kg abilities_number ability_1 ability_2 ability_hidden ... attack defense sp_attack sp_defense speed growth_rate egg_type_number egg_type_1 egg_type_2 egg_cycles
0 1 Normal Grass Poison 0.7 6.9 2 Overgrow None Chlorophyll ... 49 49 65 65 45 Medium Slow 2 Grass Monster 20.0
1 1 Normal Grass Poison 1.0 13.0 2 Overgrow None Chlorophyll ... 62 63 80 80 60 Medium Slow 2 Grass Monster 20.0
2 1 Normal Grass Poison 2.0 100.0 2 Overgrow None Chlorophyll ... 82 83 100 100 80 Medium Slow 2 Grass Monster 20.0
3 1 Normal Grass Poison 2.4 155.5 1 Thick Fat None None ... 100 123 122 120 80 Medium Slow 2 Grass Monster 20.0
4 1 Normal Fire None 0.6 8.5 2 Blaze None Solar Power ... 52 43 60 50 65 Medium Slow 2 Dragon Monster 20.0

5 rows × 22 columns

Because scikit-learn does not accept non-numerical features, we need to one-hot encode the categorical columns by creating dummy variables.

In [ ]:
df_one_hot = pd.get_dummies(df.drop(['type_1', 'type_2'], axis=1))
df_one_hot.head() # to check if it worked
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Field egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False False True False False False False
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False False True False False False False
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False False True False False False False
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False False True False False False False
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False False True False False False False

5 rows × 544 columns

In [ ]:
df = pd.concat([df_one_hot, df[['type_1', 'type_2']]], axis=1)
df.head()
preprocessed_df = df.copy()

Multi-class classification¶

Accounting for Order of Types¶

We make a new column 'Types' where the combination of type 1 and type 2 are stored in tuples, which is ordered by default

In [ ]:
df = preprocessed_df.copy()

# Separate features and labels
X = df.drop(columns=['type_1', 'type_2'])

# Combine Type 1 and Type 2 into a single column
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)
df.Types = df.Types.astype(str)

print(df['Types'][0])
print(len(df['Types'].unique()))


# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)

# print head
df.head()
('Grass', 'Poison')
192
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 Types
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False True False False False False ('Grass', 'Poison')
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False True False False False False ('Grass', 'Poison')
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False True False False False False ('Grass', 'Poison')
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False True False False False False ('Grass', 'Poison')
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False True False False False False ('Fire', 'None')

5 rows × 545 columns

In [ ]:
# show the distribution of pokemon types
sns.countplot(df, y='Types');
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
No description has been provided for this image
In [ ]:
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]

print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 41
number of unique type combinations 192
1044
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 Types
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False True False False False False ('Grass', 'Poison')
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False True False False False False ('Grass', 'Poison')
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False True False False False False ('Grass', 'Poison')
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False True False False False False ('Grass', 'Poison')
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False True False False False False ('Fire', 'None')

5 rows × 545 columns

Decision tree¶

The single classes are added to both the training and test sets after being stratified on the rest of the data. This makes the actual test size slightly larger.

In [ ]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
from sklearn.metrics import classification_report
X = df.drop(columns=['Types'])
y = df['Types']

X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=['Types']), other_data['Types'], test_size=0.2, stratify=other_data['Types'], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=['Types'])])
y_train = pd.concat([y_train, singleton_data['Types']])
X_test = pd.concat([X_test, singleton_data.drop(columns=['Types'])])
y_test = pd.concat([y_test, singleton_data['Types']])

print("actual test size:",len(X_test)/(len(X_train)+len(X_test)))
actual test size: 0.22304147465437787
In [ ]:
# Initialize and train the decision tree classifier
from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.4834710743801653
Accuracy: 0.4834710743801653
                         precision    recall  f1-score   support

    ('Bug', 'Electric')       0.00      0.00      0.00         1
    ('Bug', 'Fighting')       0.00      0.00      0.00         1
      ('Bug', 'Flying')       0.00      0.00      0.00         3
       ('Bug', 'Ghost')       0.50      1.00      0.67         1
       ('Bug', 'Grass')       0.00      0.00      0.00         1
        ('Bug', 'None')       0.40      0.50      0.44         4
      ('Bug', 'Poison')       0.00      0.00      0.00         3
        ('Bug', 'Rock')       0.00      0.00      0.00         1
       ('Bug', 'Steel')       0.00      0.00      0.00         1
       ('Bug', 'Water')       0.00      0.00      0.00         1
     ('Dark', 'Dragon')       0.00      0.00      0.00         1
      ('Dark', 'Fairy')       1.00      1.00      1.00         1
       ('Dark', 'Fire')       0.00      0.00      0.00         1
     ('Dark', 'Flying')       0.00      0.00      0.00         1
      ('Dark', 'Ghost')       0.00      0.00      0.00         0
      ('Dark', 'Grass')       1.00      1.00      1.00         1
       ('Dark', 'None')       0.00      0.00      0.00         3
     ('Dark', 'Normal')       0.00      0.00      0.00         1
      ('Dark', 'Steel')       0.00      0.00      0.00         0
 ('Dragon', 'Electric')       1.00      1.00      1.00         1
    ('Dragon', 'Fairy')       1.00      1.00      1.00         1
     ('Dragon', 'Fire')       1.00      1.00      1.00         1
   ('Dragon', 'Flying')       0.00      0.00      0.00         1
    ('Dragon', 'Ghost')       1.00      1.00      1.00         1
   ('Dragon', 'Ground')       0.00      0.00      0.00         2
      ('Dragon', 'Ice')       0.50      1.00      0.67         1
     ('Dragon', 'None')       0.00      0.00      0.00         3
  ('Dragon', 'Psychic')       1.00      1.00      1.00         1
   ('Electric', 'Dark')       0.00      0.00      0.00         0
  ('Electric', 'Fairy')       0.00      0.00      0.00         0
   ('Electric', 'Fire')       0.33      1.00      0.50         1
 ('Electric', 'Flying')       0.00      0.00      0.00         1
  ('Electric', 'Ghost')       1.00      1.00      1.00         1
  ('Electric', 'Grass')       0.00      0.00      0.00         1
   ('Electric', 'None')       0.50      0.43      0.46         7
 ('Electric', 'Poison')       0.00      0.00      0.00         1
('Electric', 'Psychic')       1.00      1.00      1.00         1
  ('Electric', 'Steel')       0.00      0.00      0.00         1
  ('Electric', 'Water')       0.00      0.00      0.00         1
    ('Fairy', 'Flying')       0.00      0.00      0.00         0
      ('Fairy', 'None')       0.33      0.25      0.29         4
     ('Fairy', 'Steel')       0.50      1.00      0.67         1
   ('Fighting', 'Dark')       0.00      0.00      0.00         0
 ('Fighting', 'Flying')       0.00      0.00      0.00         0
  ('Fighting', 'Ghost')       1.00      1.00      1.00         1
    ('Fighting', 'Ice')       1.00      1.00      1.00         1
   ('Fighting', 'None')       0.67      0.33      0.44         6
('Fighting', 'Psychic')       0.00      0.00      0.00         1
  ('Fighting', 'Steel')       0.00      0.00      0.00         1
  ('Fighting', 'Water')       0.00      0.00      0.00         1
        ('Fire', 'Bug')       0.00      0.00      0.00         0
       ('Fire', 'Dark')       1.00      1.00      1.00         1
   ('Fire', 'Fighting')       1.00      1.00      1.00         1
     ('Fire', 'Flying')       0.00      0.00      0.00         2
     ('Fire', 'Ground')       0.00      0.00      0.00         1
       ('Fire', 'None')       0.33      0.57      0.42         7
       ('Fire', 'Rock')       1.00      1.00      1.00         1
      ('Fire', 'Steel')       1.00      1.00      1.00         1
      ('Fire', 'Water')       1.00      1.00      1.00         1
     ('Flying', 'None')       1.00      1.00      1.00         1
    ('Flying', 'Steel')       1.00      1.00      1.00         1
    ('Flying', 'Water')       1.00      1.00      1.00         1
      ('Ghost', 'Dark')       1.00      1.00      1.00         1
     ('Ghost', 'Fairy')       1.00      1.00      1.00         1
      ('Ghost', 'Fire')       1.00      1.00      1.00         1
    ('Ghost', 'Flying')       0.00      0.00      0.00         1
     ('Ghost', 'Grass')       0.67      1.00      0.80         2
      ('Ghost', 'None')       0.25      0.33      0.29         3
    ('Ghost', 'Poison')       1.00      1.00      1.00         1
      ('Grass', 'Dark')       0.00      0.00      0.00         1
    ('Grass', 'Dragon')       0.00      0.00      0.00         1
     ('Grass', 'Fairy')       0.00      0.00      0.00         1
  ('Grass', 'Fighting')       0.00      0.00      0.00         1
    ('Grass', 'Flying')       1.00      0.50      0.67         2
     ('Grass', 'Ghost')       1.00      1.00      1.00         1
    ('Grass', 'Ground')       1.00      1.00      1.00         1
       ('Grass', 'Ice')       1.00      1.00      1.00         1
      ('Grass', 'None')       0.58      0.78      0.67         9
    ('Grass', 'Poison')       0.75      1.00      0.86         3
     ('Grass', 'Steel')       0.00      0.00      0.00         1
     ('Ground', 'Dark')       0.00      0.00      0.00         1
 ('Ground', 'Electric')       1.00      1.00      1.00         1
     ('Ground', 'Fire')       1.00      1.00      1.00         1
   ('Ground', 'Flying')       0.00      0.00      0.00         1
    ('Ground', 'Ghost')       0.00      0.00      0.00         1
     ('Ground', 'None')       0.00      0.00      0.00         4
     ('Ground', 'Rock')       0.00      0.00      0.00         1
    ('Ground', 'Steel')       1.00      1.00      1.00         1
         ('Ice', 'Bug')       0.00      0.00      0.00         0
       ('Ice', 'Fairy')       1.00      1.00      1.00         1
        ('Ice', 'Fire')       1.00      1.00      1.00         1
      ('Ice', 'Flying')       0.00      0.00      0.00         0
       ('Ice', 'Ghost')       1.00      1.00      1.00         1
      ('Ice', 'Ground')       0.00      0.00      0.00         1
        ('Ice', 'None')       0.50      0.50      0.50         4
       ('Ice', 'Water')       1.00      1.00      1.00         1
   ('Normal', 'Dragon')       1.00      1.00      1.00         1
    ('Normal', 'Fairy')       0.00      0.00      0.00         1
 ('Normal', 'Fighting')       0.00      0.00      0.00         1
   ('Normal', 'Flying')       0.83      0.83      0.83         6
    ('Normal', 'Grass')       0.00      0.00      0.00         0
   ('Normal', 'Ground')       1.00      1.00      1.00         1
     ('Normal', 'None')       0.38      0.43      0.40        14
  ('Normal', 'Psychic')       0.00      0.00      0.00         1
    ('Normal', 'Water')       1.00      1.00      1.00         1
      ('Poison', 'Bug')       1.00      1.00      1.00         1
     ('Poison', 'Dark')       0.00      0.00      0.00         1
   ('Poison', 'Dragon')       0.00      0.00      0.00         1
    ('Poison', 'Fairy')       0.50      1.00      0.67         1
   ('Poison', 'Flying')       1.00      1.00      1.00         1
   ('Poison', 'Ground')       0.00      0.00      0.00         0
     ('Poison', 'None')       1.00      0.33      0.50         3
    ('Poison', 'Water')       0.00      0.00      0.00         1
    ('Psychic', 'Dark')       1.00      1.00      1.00         1
  ('Psychic', 'Dragon')       1.00      1.00      1.00         1
   ('Psychic', 'Fairy')       0.00      0.00      0.00         2
('Psychic', 'Fighting')       0.00      0.00      0.00         1
    ('Psychic', 'Fire')       1.00      1.00      1.00         1
  ('Psychic', 'Flying')       0.33      0.50      0.40         2
   ('Psychic', 'Ghost')       1.00      1.00      1.00         1
   ('Psychic', 'Grass')       0.00      0.00      0.00         0
     ('Psychic', 'Ice')       0.00      0.00      0.00         1
    ('Psychic', 'None')       0.57      0.44      0.50         9
   ('Rock', 'Electric')       1.00      1.00      1.00         1
      ('Rock', 'Fairy')       0.00      0.00      0.00         1
   ('Rock', 'Fighting')       1.00      1.00      1.00         1
     ('Rock', 'Flying')       1.00      1.00      1.00         1
     ('Rock', 'Ground')       0.00      0.00      0.00         1
       ('Rock', 'None')       0.50      0.67      0.57         3
     ('Rock', 'Poison')       0.50      1.00      0.67         1
      ('Rock', 'Steel')       0.00      0.00      0.00         1
      ('Rock', 'Water')       1.00      1.00      1.00         1
     ('Steel', 'Fairy')       0.00      0.00      0.00         1
  ('Steel', 'Fighting')       1.00      1.00      1.00         1
     ('Steel', 'Ghost')       0.00      0.00      0.00         1
      ('Steel', 'None')       0.00      0.00      0.00         2
   ('Steel', 'Psychic')       1.00      0.50      0.67         2
      ('Steel', 'Rock')       0.00      0.00      0.00         1
      ('Water', 'Dark')       0.00      0.00      0.00         2
    ('Water', 'Dragon')       0.00      0.00      0.00         1
     ('Water', 'Fairy')       1.00      1.00      1.00         1
  ('Water', 'Fighting')       1.00      1.00      1.00         1
    ('Water', 'Flying')       0.67      1.00      0.80         2
     ('Water', 'Grass')       1.00      1.00      1.00         1
    ('Water', 'Ground')       1.00      0.50      0.67         2
       ('Water', 'Ice')       0.00      0.00      0.00         1
      ('Water', 'None')       0.60      0.60      0.60        15
    ('Water', 'Poison')       0.00      0.00      0.00         1
   ('Water', 'Psychic')       0.00      0.00      0.00         1
      ('Water', 'Rock')       0.00      0.00      0.00         1
     ('Water', 'Steel')       1.00      1.00      1.00         1

               accuracy                           0.48       242
              macro avg       0.44      0.46      0.44       242
           weighted avg       0.48      0.48      0.47       242

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter Tuning¶

For hyperparameter tuning we use GridSearchCV and a pipeline with a StandardScaler to make the data more uniform.

In [ ]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Setup the parameters and distributions to sample from: param_dist
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())

param_dist = {
    "decisiontreeclassifier__max_depth": [15, 30, None],  
    "decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}

# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)

# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train) 
y_pred = grid_search_cv.predict(X_test)

# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1}
Accuracy: 0.49173553719008267

Random forest¶

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
# Calculate accuracy
print("Score :", score )
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score : 0.6487603305785123
Accuracy: 0.6487603305785123

Hyperparameter Tuning¶

In [ ]:
# Import GridSearchCV
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())

# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "randomforestclassifier__max_depth": np.arange(5, 25),  
    "randomforestclassifier__min_samples_leaf": np.arange(1, 10),
    "randomforestclassifier__n_estimators": np.arange(60, 140, 5)
}

# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)

# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)

# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=3.
  warnings.warn(
Best score is 0.6900826446280992
Best parameters are {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 136}

Support Vector Machine¶

Radial base function¶

In [ ]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.06198347107438017
Hyperparameter Tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf'))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1,5, 10],        # Regularization parameter
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best score is {}".format(grid_search.best_estimator_.score(X_test, y_test)))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 0.0}
Best score is 0.6074380165289256

Linear¶

In [ ]:
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.3677685950413223
Hyperparameter tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1 , 5, 10],        # Regularization parameter    # Degree of the polynomial kernel
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 0.5, 'svc__coef0': 0.0}
Best Score: 0.6528925619834711

Polynomial¶

In [ ]:
model = SVC(kernel='poly', random_state=42)

# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.0743801652892562
Hyperparameter Tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='poly'))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1, 5, 10],        # Regularization parameter
    'svc__degree': [2, 3, 4, 5, 6],      # Degree of the polynomial kernel
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 2.0, 'svc__degree': 2}
Best Score: 0.6198347107438017

Sigmoid¶

In [ ]:
model = SVC(kernel='sigmoid', random_state=42)

# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.05785123966942149
Accuracy: 0.05785123966942149
Hyperparameter Tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1,5, 10],        # Regularization parameter
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 0.0}
Best Score: 0.6033057851239669

k Nearest neighbors¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ",model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.06611570247933884
Accuracy: 0.06611570247933884

Hyperparameter Tuning¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9]  # List of k values to try
}

pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'kneighborsclassifier__n_neighbors': 3}
Best Score: 0.2809917355371901

Logistic regression¶

In [ ]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.1446280991735537
Accuracy: 0.1446280991735537
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]:
model = LogisticRegression( random_state=42, multi_class='auto', solver='liblinear', max_iter=1000) 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.5950413223140496
Accuracy: 0.5950413223140496

Hyperparameter tuning

In [ ]:
param_grid = {
    'logisticregression__C': np.logspace(-5, 5, 5),
    'logisticregression__penalty': ['l1', 'l2']
}

pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l2'}
Best Score: 0.6776859504132231
In [ ]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression( penalty='elasticnet',l1_ratio=0.5, random_state=42, multi_class='auto', solver='saga', max_iter=100) 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.1115702479338843
Accuracy: 0.1115702479338843
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(

Hyperparameter tuning

In [ ]:
param_grid = {
    'logisticregression__C': np.logspace(-3, 3, 3),
    'logisticregression__l1_ratio': np.linspace(0, 1, 5)
}

pipeline = make_pipeline(StandardScaler(), LogisticRegression(penalty='elasticnet', solver='saga'))

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__l1_ratio': 0.25}
Best Score: 0.6611570247933884
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(

Ignoring Order of Types¶

Preprocessing¶

We do the same as before, but we sort types in the tuples alphabetically to ignore order.

In [ ]:
df = preprocessed_df.copy()

df['Types'] = df[['type_1', 'type_2']].apply(lambda x: sorted(tuple(filter(lambda y: pd.notna(y), x))), axis=1)

df.Types = df.Types.astype(str)

# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)

# print head
df.head()
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 Types
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False True False False False False ['Grass', 'Poison']
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False True False False False False ['Grass', 'Poison']
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False True False False False False ['Grass', 'Poison']
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False True False False False False ['Grass', 'Poison']
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False True False False False False ['Fire', 'None']

5 rows × 545 columns

In [ ]:
sns.countplot(df, y='Types');
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
No description has been provided for this image
In [ ]:
# Check if there are any pokemon with only one type
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]


print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 23
number of unique type combinations 154
1044
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 Types
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False True False False False False ['Grass', 'Poison']
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False True False False False False ['Grass', 'Poison']
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False True False False False False ['Grass', 'Poison']
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False True False False False False ['Grass', 'Poison']
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False True False False False False ['Fire', 'None']

5 rows × 545 columns

Decision tree¶

In [ ]:
# Split the data into training and testing sets
X = df.drop(columns=['Types'])
y = df['Types']

X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=['Types']), other_data['Types'], test_size=0.2, stratify=other_data['Types'], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=['Types'])])
y_train = pd.concat([y_train, singleton_data['Types']])
X_test = pd.concat([X_test, singleton_data.drop(columns=['Types'])])
y_test = pd.concat([y_test, singleton_data['Types']])

# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.4298245614035088
Accuracy: 0.4298245614035088

Hyperparameter tuning¶

In [ ]:
# Setup the parameters and distributions to sample from: param_dist
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())

param_dist = {
    "decisiontreeclassifier__max_depth": [15, 30, None],  
    "decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}

# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)

# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train) 
y_pred = grid_search_cv.predict(X_test)

# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1}
Accuracy: 0.4649122807017544

Random forest¶

In [ ]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.5964912280701754
Accuracy: 0.5964912280701754

Hyperparameter Tuning¶

There's an issue with cross-validation because some type combinations occur only once in the training data.

In [ ]:
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())

# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "randomforestclassifier__max_depth": np.arange(30, 36),  
    "randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
    "randomforestclassifier__n_estimators": np.arange(100, 140, 4)
}

# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)

# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)

# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=3.
  warnings.warn(
Best score is 0.5964912280701754
Best parameters are {'randomforestclassifier__max_depth': 33, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 136}

Support vector machine¶

Radial base function¶

In [ ]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.05263157894736842
Hyperparameter Tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf'))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1,5, 10],        # Regularization parameter
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 0.0}
Best Score: 0.5482456140350878

Linear¶

In [ ]:
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.34210526315789475
Hyperparameter tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1 , 5, 10],        # Regularization parameter    # Degree of the polynomial kernel
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 0.5, 'svc__coef0': 0.0}
Best Score: 0.5921052631578947

Polynomial¶

In [ ]:
model = SVC(kernel='poly', random_state=42)

# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.07017543859649122
Hyperparameter Tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='poly'))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1, 5, 10],        # Regularization parameter
    'svc__degree': [2, 3, 4, 5, 6],      # Degree of the polynomial kernel
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 2.0, 'svc__degree': 3}
Best Score: 0.5789473684210527

Sigmoid¶

In [ ]:
model = SVC(kernel='sigmoid', random_state=42)

# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.07456140350877193
Accuracy: 0.07456140350877193
Hyperparameter Tuning¶
In [ ]:
from sklearn.svm import SVC

# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))

# Define the parameter grid
param_grid = {
    'svc__C': [0.1,0.5, 1,5, 10, 50],        # Regularization parameter
    'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'svc__C': 50, 'svc__coef0': 0.0}
Best Score: 0.5614035087719298

k Nearest Neighbors¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.10087719298245613
Accuracy: 0.10087719298245613

Hyperparameter Tuning¶

In [ ]:
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9]  # List of k values to try
}

pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
Best Parameters: {'kneighborsclassifier__n_neighbors': 1}
Best Score: 0.5921052631578947

Logistic regression¶

In [ ]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.14473684210526316
Accuracy: 0.14473684210526316
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]:
model = LogisticRegression( random_state=42, multi_class='auto', solver='liblinear', max_iter=1000) 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.5657894736842105
Accuracy: 0.5657894736842105

Hyperparameter tuning

In [ ]:
param_grid = {
    'logisticregression__C': np.logspace(-4, 4, 3),
    'logisticregression__penalty': ['l1', 'l2']
}

pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l2'}
Best Score: 0.6776859504132231
In [ ]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression( penalty='elasticnet',l1_ratio=0.5, random_state=42, multi_class='auto', solver='saga', max_iter=100) 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score:  0.1115702479338843
Accuracy: 0.1115702479338843
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(

Hyperparameter tuning

In [ ]:
param_grid = {
    'logisticregression__C': [1.0],
    'logisticregression__l1_ratio': np.linspace(0, 1, 5)
}

pipeline = make_pipeline(StandardScaler(), LogisticRegression(penalty='elasticnet', solver='saga'))

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2.
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__l1_ratio': 0.25}
Best Score: 0.6611570247933884
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(

Multi-label Classification¶

Accounting for Order of Types¶

Preprocessing¶

In [ ]:
df = preprocessed_df.copy()

df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)
df.Types = df.Types.astype(str)

# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)

# print head
df.head()
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 Types
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False True False False False False ('Grass', 'Poison')
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False True False False False False ('Grass', 'Poison')
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False True False False False False ('Grass', 'Poison')
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False True False False False False ('Grass', 'Poison')
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False True False False False False ('Fire', 'None')

5 rows × 545 columns

In [ ]:
# Find classes with only one type
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()

To account for order of types, we create binary labels for each type combination.

In [ ]:
# Create binary labels for each Pokémon type combination
type_combinations = df['Types'].unique()
for type in type_combinations:
    df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)

singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]

print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 41
number of unique type combinations 192
1044
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... ('Electric', 'Poison') ('Fire', 'Bug') ('Dark', 'Fairy') ('Ice', 'Bug') ('Psychic', 'Normal') ('Electric', 'Dark') ('Dragon', 'Ghost') ('Fairy', 'Steel') ('Fighting', 'Water') ('Dark', 'Grass')
0 1 0.7 6.9 2 318 45 49 49 65 65 ... 0 0 0 0 0 0 0 0 0 0
1 1 1.0 13.0 2 405 60 62 63 80 80 ... 0 0 0 0 0 0 0 0 0 0
2 1 2.0 100.0 2 525 80 82 83 100 100 ... 0 0 0 0 0 0 0 0 0 0
3 1 2.4 155.5 1 625 80 100 123 122 120 ... 0 0 0 0 0 0 0 0 0 0
4 1 0.6 8.5 2 309 39 52 43 60 50 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 737 columns

In [ ]:
# Drop the 'Types' column
df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)

Decision Tree¶

In [ ]:
# Split the data into training and testing sets
y = df[type_combinations]

X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=type_combinations), other_data[type_combinations], test_size=0.2, stratify=other_data[type_combinations], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=type_combinations)])
y_train = pd.concat([y_train, singleton_data[type_combinations]])
X_test = pd.concat([X_test, singleton_data.drop(columns=type_combinations)])
y_test = pd.concat([y_test, singleton_data[type_combinations]])

# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)

# Calculate accuracy
print("Score: ", score)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.47107438016528924
Accuracy:  0.47107438016528924
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.60      0.86      0.71         7
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         0
           4       0.45      0.33      0.38        15
           5       0.43      0.75      0.55         4
           6       0.33      0.33      0.33         3
           7       0.25      0.33      0.29         3
           8       0.83      0.83      0.83         6
           9       0.33      0.50      0.40        14
          10       0.00      0.00      0.00         1
          11       0.25      0.67      0.36         3
          12       1.00      0.14      0.25         7
          13       1.00      1.00      1.00         1
          14       0.00      0.00      0.00         4
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.33      0.25      0.29         4
          18       1.00      0.25      0.40         4
          19       1.00      1.00      1.00         1
          20       1.00      1.00      1.00         1
          21       1.00      1.00      1.00         1
          22       1.00      1.00      1.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         3
          25       0.00      0.00      0.00         2
          26       0.57      0.67      0.62         6
          27       1.00      1.00      1.00         1
          28       0.38      0.33      0.35         9
          29       0.00      0.00      0.00         1
          30       1.00      1.00      1.00         1
          31       1.00      1.00      1.00         1
          32       0.50      0.50      0.50         2
          33       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         0
          35       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         0
          40       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         0
          42       1.00      1.00      1.00         1
          43       1.00      1.00      1.00         1
          44       0.62      0.89      0.73         9
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         0
          47       1.00      0.50      0.67         2
          48       0.00      0.00      0.00         2
          49       0.00      0.00      0.00         1
          50       1.00      1.00      1.00         1
          51       0.00      0.00      0.00         0
          52       0.00      0.00      0.00         2
          53       0.00      0.00      0.00         1
          54       0.00      0.00      0.00         0
          55       0.00      0.00      0.00         1
          56       0.50      0.67      0.57         3
          57       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         1
          59       0.00      0.00      0.00         0
          60       0.00      0.00      0.00         0
          61       0.00      0.00      0.00         0
          62       1.00      1.00      1.00         1
          63       0.33      0.33      0.33         3
          64       1.00      0.50      0.67         2
          65       0.00      0.00      0.00         2
          66       0.20      0.33      0.25         3
          67       0.00      0.00      0.00         1
          68       0.00      0.00      0.00         2
          69       0.00      0.00      0.00         1
          70       0.00      0.00      0.00         0
          71       0.00      0.00      0.00         1
          72       0.00      0.00      0.00         1
          73       0.00      0.00      0.00         0
          74       1.00      1.00      1.00         1
          75       0.00      0.00      0.00         1
          76       0.00      0.00      0.00         1
          77       0.00      0.00      0.00         0
          78       0.00      0.00      0.00         1
          79       0.00      0.00      0.00         1
          80       0.00      0.00      0.00         0
          81       0.00      0.00      0.00         0
          82       0.00      0.00      0.00         1
          83       1.00      1.00      1.00         1
          84       0.50      1.00      0.67         1
          85       0.00      0.00      0.00         1
          86       0.00      0.00      0.00         1
          87       0.00      0.00      0.00         0
          88       1.00      1.00      1.00         1
          89       0.00      0.00      0.00         0
          90       0.00      0.00      0.00         1
          91       1.00      1.00      1.00         1
          92       0.00      0.00      0.00         1
          93       0.00      0.00      0.00         1
          94       0.00      0.00      0.00         0
          95       0.50      1.00      0.67         1
          96       0.00      0.00      0.00         0
          97       0.00      0.00      0.00         0
          98       0.00      0.00      0.00         0
          99       0.00      0.00      0.00         0
         100       0.00      0.00      0.00         1
         101       0.00      0.00      0.00         2
         102       0.50      1.00      0.67         1
         103       1.00      1.00      1.00         1
         104       1.00      1.00      1.00         1
         105       1.00      1.00      1.00         1
         106       0.33      1.00      0.50         1
         107       0.00      0.00      0.00         1
         108       0.00      0.00      0.00         1
         109       0.00      0.00      0.00         1
         110       0.50      1.00      0.67         1
         111       0.00      0.00      0.00         1
         112       0.00      0.00      0.00         1
         113       1.00      1.00      1.00         1
         114       0.00      0.00      0.00         0
         115       0.00      0.00      0.00         1
         116       1.00      1.00      1.00         1
         117       0.50      1.00      0.67         1
         118       0.00      0.00      0.00         1
         119       0.00      0.00      0.00         1
         120       0.00      0.00      0.00         0
         121       0.00      0.00      0.00         1
         122       0.00      0.00      0.00         0
         123       1.00      1.00      1.00         1
         124       0.00      0.00      0.00         0
         125       0.33      1.00      0.50         1
         126       1.00      1.00      1.00         1
         127       1.00      1.00      1.00         1
         128       0.00      0.00      0.00         0
         129       1.00      1.00      1.00         1
         130       0.00      0.00      0.00         0
         131       0.00      0.00      0.00         1
         132       0.00      0.00      0.00         0
         133       0.00      0.00      0.00         0
         134       0.00      0.00      0.00         1
         135       0.00      0.00      0.00         1
         136       1.00      1.00      1.00         1
         137       1.00      1.00      1.00         1
         138       0.00      0.00      0.00         0
         139       0.00      0.00      0.00         1
         140       0.00      0.00      0.00         0
         141       1.00      1.00      1.00         1
         142       1.00      1.00      1.00         1
         143       0.50      1.00      0.67         1
         144       0.50      1.00      0.67         1
         145       1.00      1.00      1.00         1
         146       0.00      0.00      0.00         1
         147       1.00      1.00      1.00         1
         148       0.00      0.00      0.00         0
         149       0.00      0.00      0.00         0
         150       0.00      0.00      0.00         1
         151       0.00      0.00      0.00         0
         152       0.00      0.00      0.00         1
         153       0.00      0.00      0.00         1
         154       0.00      0.00      0.00         0
         155       0.00      0.00      0.00         0
         156       0.00      0.00      0.00         0
         157       0.00      0.00      0.00         0
         158       0.00      0.00      0.00         1
         159       0.50      0.50      0.50         2
         160       0.00      0.00      0.00         0
         161       0.00      0.00      0.00         1
         162       1.00      1.00      1.00         1
         163       1.00      1.00      1.00         1
         164       1.00      1.00      1.00         1
         165       1.00      1.00      1.00         1
         166       1.00      1.00      1.00         1
         167       0.00      0.00      0.00         0
         168       0.00      0.00      0.00         0
         169       0.00      0.00      0.00         0
         170       0.00      0.00      0.00         0
         171       1.00      1.00      1.00         1
         172       0.50      1.00      0.67         1
         173       0.00      0.00      0.00         0
         174       0.00      0.00      0.00         0
         175       1.00      1.00      1.00         1
         176       1.00      1.00      1.00         1
         177       1.00      1.00      1.00         1
         178       1.00      1.00      1.00         1
         179       0.00      0.00      0.00         0
         180       0.00      0.00      0.00         0
         181       1.00      1.00      1.00         1
         182       0.00      0.00      0.00         1
         183       0.00      0.00      0.00         0
         184       1.00      1.00      1.00         1
         185       0.00      0.00      0.00         0
         186       0.00      0.00      0.00         0
         187       0.00      0.00      0.00         0
         188       1.00      1.00      1.00         1
         189       1.00      1.00      1.00         1
         190       0.00      0.00      0.00         1
         191       1.00      1.00      1.00         1

   micro avg       0.48      0.47      0.48       242
   macro avg       0.33      0.35      0.33       242
weighted avg       0.46      0.47      0.44       242
 samples avg       0.47      0.47      0.47       242

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter tuning¶

In [ ]:
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Setup the parameters
param_dist = {
    "decisiontreeclassifier__max_depth": [15, 30, None],  
    "decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}

# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)

# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train) 
y_pred = grid_search_cv.predict(X_test)

# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1}
Accuracy: 0.47520661157024796
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.57      0.57      0.57         7
           2       0.33      0.50      0.40         2
           3       0.00      0.00      0.00         0
           4       0.45      0.33      0.38        15
           5       0.43      0.75      0.55         4
           6       0.50      0.67      0.57         3
           7       0.50      0.33      0.40         3
           8       0.83      0.83      0.83         6
           9       0.35      0.43      0.39        14
          10       0.00      0.00      0.00         1
          11       0.25      0.67      0.36         3
          12       0.50      0.14      0.22         7
          13       1.00      1.00      1.00         1
          14       1.00      0.25      0.40         4
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.50      0.25      0.33         4
          18       0.50      0.25      0.33         4
          19       1.00      1.00      1.00         1
          20       1.00      1.00      1.00         1
          21       1.00      1.00      1.00         1
          22       1.00      1.00      1.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         3
          25       0.00      0.00      0.00         2
          26       0.80      0.67      0.73         6
          27       0.25      1.00      0.40         1
          28       0.33      0.22      0.27         9
          29       0.00      0.00      0.00         1
          30       1.00      1.00      1.00         1
          31       1.00      1.00      1.00         1
          32       0.50      0.50      0.50         2
          33       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         0
          35       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         0
          40       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         0
          42       1.00      1.00      1.00         1
          43       1.00      1.00      1.00         1
          44       0.73      0.89      0.80         9
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         0
          47       1.00      0.50      0.67         2
          48       0.00      0.00      0.00         2
          49       0.00      0.00      0.00         1
          50       1.00      1.00      1.00         1
          51       0.00      0.00      0.00         0
          52       0.00      0.00      0.00         2
          53       0.00      0.00      0.00         1
          54       0.00      0.00      0.00         0
          55       0.00      0.00      0.00         1
          56       1.00      0.67      0.80         3
          57       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         1
          59       0.00      0.00      0.00         0
          60       0.00      0.00      0.00         0
          61       0.00      0.00      0.00         0
          62       0.00      0.00      0.00         1
          63       0.50      0.33      0.40         3
          64       1.00      0.50      0.67         2
          65       0.00      0.00      0.00         2
          66       0.67      0.67      0.67         3
          67       0.00      0.00      0.00         1
          68       0.50      0.50      0.50         2
          69       0.00      0.00      0.00         1
          70       0.00      0.00      0.00         0
          71       0.00      0.00      0.00         1
          72       0.00      0.00      0.00         1
          73       0.00      0.00      0.00         0
          74       1.00      1.00      1.00         1
          75       0.00      0.00      0.00         1
          76       0.00      0.00      0.00         1
          77       0.00      0.00      0.00         0
          78       0.00      0.00      0.00         1
          79       0.00      0.00      0.00         1
          80       0.00      0.00      0.00         0
          81       0.00      0.00      0.00         0
          82       0.00      0.00      0.00         1
          83       1.00      1.00      1.00         1
          84       0.00      0.00      0.00         1
          85       0.00      0.00      0.00         1
          86       0.00      0.00      0.00         1
          87       0.00      0.00      0.00         0
          88       1.00      1.00      1.00         1
          89       0.00      0.00      0.00         0
          90       0.00      0.00      0.00         1
          91       1.00      1.00      1.00         1
          92       0.00      0.00      0.00         1
          93       0.00      0.00      0.00         1
          94       0.00      0.00      0.00         0
          95       1.00      1.00      1.00         1
          96       0.00      0.00      0.00         0
          97       0.00      0.00      0.00         0
          98       0.00      0.00      0.00         0
          99       0.00      0.00      0.00         0
         100       0.00      0.00      0.00         1
         101       1.00      0.50      0.67         2
         102       0.50      1.00      0.67         1
         103       1.00      1.00      1.00         1
         104       1.00      1.00      1.00         1
         105       1.00      1.00      1.00         1
         106       0.25      1.00      0.40         1
         107       0.00      0.00      0.00         1
         108       0.00      0.00      0.00         1
         109       0.00      0.00      0.00         1
         110       0.50      1.00      0.67         1
         111       0.00      0.00      0.00         1
         112       0.00      0.00      0.00         1
         113       1.00      1.00      1.00         1
         114       0.00      0.00      0.00         0
         115       0.50      1.00      0.67         1
         116       1.00      1.00      1.00         1
         117       0.50      1.00      0.67         1
         118       0.00      0.00      0.00         1
         119       0.00      0.00      0.00         1
         120       0.00      0.00      0.00         0
         121       0.00      0.00      0.00         1
         122       0.00      0.00      0.00         0
         123       1.00      1.00      1.00         1
         124       0.00      0.00      0.00         0
         125       1.00      1.00      1.00         1
         126       1.00      1.00      1.00         1
         127       1.00      1.00      1.00         1
         128       0.00      0.00      0.00         0
         129       1.00      1.00      1.00         1
         130       0.00      0.00      0.00         0
         131       0.00      0.00      0.00         1
         132       0.00      0.00      0.00         0
         133       0.00      0.00      0.00         0
         134       0.00      0.00      0.00         1
         135       0.00      0.00      0.00         1
         136       1.00      1.00      1.00         1
         137       1.00      1.00      1.00         1
         138       0.00      0.00      0.00         0
         139       0.00      0.00      0.00         1
         140       0.00      0.00      0.00         0
         141       1.00      1.00      1.00         1
         142       1.00      1.00      1.00         1
         143       0.50      1.00      0.67         1
         144       1.00      1.00      1.00         1
         145       1.00      1.00      1.00         1
         146       0.00      0.00      0.00         1
         147       0.50      1.00      0.67         1
         148       0.00      0.00      0.00         0
         149       0.00      0.00      0.00         0
         150       0.00      0.00      0.00         1
         151       0.00      0.00      0.00         0
         152       0.00      0.00      0.00         1
         153       0.00      0.00      0.00         1
         154       0.00      0.00      0.00         0
         155       0.00      0.00      0.00         0
         156       0.00      0.00      0.00         0
         157       0.00      0.00      0.00         0
         158       0.00      0.00      0.00         1
         159       0.50      0.50      0.50         2
         160       0.00      0.00      0.00         0
         161       0.00      0.00      0.00         1
         162       1.00      1.00      1.00         1
         163       0.50      1.00      0.67         1
         164       1.00      1.00      1.00         1
         165       0.50      1.00      0.67         1
         166       0.50      1.00      0.67         1
         167       0.00      0.00      0.00         0
         168       0.00      0.00      0.00         0
         169       0.00      0.00      0.00         0
         170       0.00      0.00      0.00         0
         171       0.50      1.00      0.67         1
         172       0.25      1.00      0.40         1
         173       0.00      0.00      0.00         0
         174       0.00      0.00      0.00         0
         175       1.00      1.00      1.00         1
         176       1.00      1.00      1.00         1
         177       1.00      1.00      1.00         1
         178       1.00      1.00      1.00         1
         179       0.00      0.00      0.00         0
         180       0.00      0.00      0.00         0
         181       1.00      1.00      1.00         1
         182       0.00      0.00      0.00         1
         183       0.00      0.00      0.00         0
         184       1.00      1.00      1.00         1
         185       0.00      0.00      0.00         0
         186       0.00      0.00      0.00         0
         187       0.00      0.00      0.00         0
         188       1.00      1.00      1.00         1
         189       1.00      1.00      1.00         1
         190       0.00      0.00      0.00         1
         191       1.00      1.00      1.00         1

   micro avg       0.49      0.48      0.48       242
   macro avg       0.33      0.35      0.33       242
weighted avg       0.49      0.48      0.46       242
 samples avg       0.48      0.48      0.48       242

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Random Forest¶

In [ ]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
# Calculate accuracy
print("Score: ", score)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.25206611570247933
Accuracy: 0.25206611570247933
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       1.00      0.43      0.60         7
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         0
           4       0.80      0.27      0.40        15
           5       1.00      0.25      0.40         4
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         3
           8       0.33      0.17      0.22         6
           9       0.50      0.14      0.22        14
          10       0.00      0.00      0.00         1
          11       1.00      0.67      0.80         3
          12       0.00      0.00      0.00         7
          13       1.00      1.00      1.00         1
          14       0.00      0.00      0.00         4
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       1.00      0.25      0.40         4
          18       0.00      0.00      0.00         4
          19       1.00      1.00      1.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         3
          25       0.00      0.00      0.00         2
          26       1.00      0.33      0.50         6
          27       1.00      1.00      1.00         1
          28       0.00      0.00      0.00         9
          29       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         1
          31       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         2
          33       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         0
          35       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         0
          40       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         0
          42       1.00      1.00      1.00         1
          43       1.00      1.00      1.00         1
          44       0.67      0.22      0.33         9
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         0
          47       0.00      0.00      0.00         2
          48       0.00      0.00      0.00         2
          49       0.00      0.00      0.00         1
          50       0.00      0.00      0.00         1
          51       0.00      0.00      0.00         0
          52       0.00      0.00      0.00         2
          53       0.00      0.00      0.00         1
          54       0.00      0.00      0.00         0
          55       0.00      0.00      0.00         1
          56       1.00      0.33      0.50         3
          57       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         1
          59       0.00      0.00      0.00         0
          60       0.00      0.00      0.00         0
          61       0.00      0.00      0.00         0
          62       0.00      0.00      0.00         1
          63       0.00      0.00      0.00         3
          64       0.00      0.00      0.00         2
          65       0.00      0.00      0.00         2
          66       0.00      0.00      0.00         3
          67       0.00      0.00      0.00         1
          68       0.00      0.00      0.00         2
          69       0.00      0.00      0.00         1
          70       0.00      0.00      0.00         0
          71       0.00      0.00      0.00         1
          72       0.00      0.00      0.00         1
          73       0.00      0.00      0.00         0
          74       1.00      1.00      1.00         1
          75       0.00      0.00      0.00         1
          76       0.00      0.00      0.00         1
          77       0.00      0.00      0.00         0
          78       0.00      0.00      0.00         1
          79       0.00      0.00      0.00         1
          80       0.00      0.00      0.00         0
          81       0.00      0.00      0.00         0
          82       0.00      0.00      0.00         1
          83       0.00      0.00      0.00         1
          84       0.00      0.00      0.00         1
          85       0.00      0.00      0.00         1
          86       0.00      0.00      0.00         1
          87       0.00      0.00      0.00         0
          88       1.00      1.00      1.00         1
          89       0.00      0.00      0.00         0
          90       0.00      0.00      0.00         1
          91       1.00      1.00      1.00         1
          92       0.00      0.00      0.00         1
          93       0.00      0.00      0.00         1
          94       0.00      0.00      0.00         0
          95       1.00      1.00      1.00         1
          96       0.00      0.00      0.00         0
          97       0.00      0.00      0.00         0
          98       0.00      0.00      0.00         0
          99       0.00      0.00      0.00         0
         100       0.00      0.00      0.00         1
         101       0.00      0.00      0.00         2
         102       0.00      0.00      0.00         1
         103       1.00      1.00      1.00         1
         104       1.00      1.00      1.00         1
         105       1.00      1.00      1.00         1
         106       1.00      1.00      1.00         1
         107       0.00      0.00      0.00         1
         108       0.00      0.00      0.00         1
         109       0.00      0.00      0.00         1
         110       1.00      1.00      1.00         1
         111       0.00      0.00      0.00         1
         112       0.00      0.00      0.00         1
         113       1.00      1.00      1.00         1
         114       0.00      0.00      0.00         0
         115       0.00      0.00      0.00         1
         116       1.00      1.00      1.00         1
         117       1.00      1.00      1.00         1
         118       0.00      0.00      0.00         1
         119       0.00      0.00      0.00         1
         120       0.00      0.00      0.00         0
         121       0.00      0.00      0.00         1
         122       0.00      0.00      0.00         0
         123       1.00      1.00      1.00         1
         124       0.00      0.00      0.00         0
         125       1.00      1.00      1.00         1
         126       0.00      0.00      0.00         1
         127       0.00      0.00      0.00         1
         128       0.00      0.00      0.00         0
         129       1.00      1.00      1.00         1
         130       0.00      0.00      0.00         0
         131       0.00      0.00      0.00         1
         132       0.00      0.00      0.00         0
         133       0.00      0.00      0.00         0
         134       0.00      0.00      0.00         1
         135       0.00      0.00      0.00         1
         136       0.00      0.00      0.00         1
         137       1.00      1.00      1.00         1
         138       0.00      0.00      0.00         0
         139       0.00      0.00      0.00         1
         140       0.00      0.00      0.00         0
         141       1.00      1.00      1.00         1
         142       1.00      1.00      1.00         1
         143       0.00      0.00      0.00         1
         144       1.00      1.00      1.00         1
         145       1.00      1.00      1.00         1
         146       0.00      0.00      0.00         1
         147       1.00      1.00      1.00         1
         148       0.00      0.00      0.00         0
         149       0.00      0.00      0.00         0
         150       0.00      0.00      0.00         1
         151       0.00      0.00      0.00         0
         152       0.00      0.00      0.00         1
         153       0.00      0.00      0.00         1
         154       0.00      0.00      0.00         0
         155       0.00      0.00      0.00         0
         156       0.00      0.00      0.00         0
         157       0.00      0.00      0.00         0
         158       0.00      0.00      0.00         1
         159       1.00      0.50      0.67         2
         160       0.00      0.00      0.00         0
         161       0.00      0.00      0.00         1
         162       1.00      1.00      1.00         1
         163       1.00      1.00      1.00         1
         164       1.00      1.00      1.00         1
         165       1.00      1.00      1.00         1
         166       1.00      1.00      1.00         1
         167       0.00      0.00      0.00         0
         168       0.00      0.00      0.00         0
         169       0.00      0.00      0.00         0
         170       0.00      0.00      0.00         0
         171       1.00      1.00      1.00         1
         172       1.00      1.00      1.00         1
         173       0.00      0.00      0.00         0
         174       0.00      0.00      0.00         0
         175       1.00      1.00      1.00         1
         176       1.00      1.00      1.00         1
         177       1.00      1.00      1.00         1
         178       1.00      1.00      1.00         1
         179       0.00      0.00      0.00         0
         180       0.00      0.00      0.00         0
         181       1.00      1.00      1.00         1
         182       0.00      0.00      0.00         1
         183       0.00      0.00      0.00         0
         184       0.00      0.00      0.00         1
         185       0.00      0.00      0.00         0
         186       0.00      0.00      0.00         0
         187       0.00      0.00      0.00         0
         188       0.00      0.00      0.00         1
         189       1.00      1.00      1.00         1
         190       0.00      0.00      0.00         1
         191       1.00      1.00      1.00         1

   micro avg       0.86      0.25      0.39       242
   macro avg       0.26      0.23      0.24       242
weighted avg       0.41      0.25      0.29       242
 samples avg       0.25      0.25      0.25       242

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter Tuning¶

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())

# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "randomforestclassifier__max_depth": [15, 30, None],  
    "randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
    "randomforestclassifier__n_estimators": np.arange(80, 120, 5)
}

# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)

# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)

# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
print(classification_report(y_test, y_pred))
Best score is 0.256198347107438
Best parameters are {'randomforestclassifier__n_estimators': 85, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__max_depth': None}
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       1.00      0.43      0.60         7
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         0
           4       0.80      0.27      0.40        15
           5       1.00      0.25      0.40         4
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         3
           8       0.33      0.17      0.22         6
           9       0.50      0.14      0.22        14
          10       0.00      0.00      0.00         1
          11       1.00      0.67      0.80         3
          12       0.00      0.00      0.00         7
          13       1.00      1.00      1.00         1
          14       0.00      0.00      0.00         4
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       1.00      0.25      0.40         4
          18       0.00      0.00      0.00         4
          19       1.00      1.00      1.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         3
          25       0.00      0.00      0.00         2
          26       1.00      0.33      0.50         6
          27       1.00      1.00      1.00         1
          28       0.00      0.00      0.00         9
          29       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         1
          31       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         2
          33       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         0
          35       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         0
          40       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         0
          42       1.00      1.00      1.00         1
          43       1.00      1.00      1.00         1
          44       0.67      0.22      0.33         9
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         0
          47       0.00      0.00      0.00         2
          48       0.00      0.00      0.00         2
          49       0.00      0.00      0.00         1
          50       0.00      0.00      0.00         1
          51       0.00      0.00      0.00         0
          52       0.00      0.00      0.00         2
          53       0.00      0.00      0.00         1
          54       0.00      0.00      0.00         0
          55       0.00      0.00      0.00         1
          56       1.00      0.33      0.50         3
          57       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         1
          59       0.00      0.00      0.00         0
          60       0.00      0.00      0.00         0
          61       0.00      0.00      0.00         0
          62       0.00      0.00      0.00         1
          63       0.00      0.00      0.00         3
          64       0.00      0.00      0.00         2
          65       0.00      0.00      0.00         2
          66       0.00      0.00      0.00         3
          67       0.00      0.00      0.00         1
          68       0.00      0.00      0.00         2
          69       0.00      0.00      0.00         1
          70       0.00      0.00      0.00         0
          71       0.00      0.00      0.00         1
          72       0.00      0.00      0.00         1
          73       0.00      0.00      0.00         0
          74       1.00      1.00      1.00         1
          75       0.00      0.00      0.00         1
          76       0.00      0.00      0.00         1
          77       0.00      0.00      0.00         0
          78       0.00      0.00      0.00         1
          79       0.00      0.00      0.00         1
          80       0.00      0.00      0.00         0
          81       0.00      0.00      0.00         0
          82       0.00      0.00      0.00         1
          83       0.00      0.00      0.00         1
          84       0.00      0.00      0.00         1
          85       0.00      0.00      0.00         1
          86       0.00      0.00      0.00         1
          87       0.00      0.00      0.00         0
          88       1.00      1.00      1.00         1
          89       0.00      0.00      0.00         0
          90       0.00      0.00      0.00         1
          91       1.00      1.00      1.00         1
          92       0.00      0.00      0.00         1
          93       0.00      0.00      0.00         1
          94       0.00      0.00      0.00         0
          95       1.00      1.00      1.00         1
          96       0.00      0.00      0.00         0
          97       0.00      0.00      0.00         0
          98       0.00      0.00      0.00         0
          99       0.00      0.00      0.00         0
         100       0.00      0.00      0.00         1
         101       0.00      0.00      0.00         2
         102       0.00      0.00      0.00         1
         103       1.00      1.00      1.00         1
         104       1.00      1.00      1.00         1
         105       1.00      1.00      1.00         1
         106       1.00      1.00      1.00         1
         107       0.00      0.00      0.00         1
         108       0.00      0.00      0.00         1
         109       0.00      0.00      0.00         1
         110       1.00      1.00      1.00         1
         111       0.00      0.00      0.00         1
         112       0.00      0.00      0.00         1
         113       1.00      1.00      1.00         1
         114       0.00      0.00      0.00         0
         115       0.00      0.00      0.00         1
         116       1.00      1.00      1.00         1
         117       1.00      1.00      1.00         1
         118       0.00      0.00      0.00         1
         119       0.00      0.00      0.00         1
         120       0.00      0.00      0.00         0
         121       0.00      0.00      0.00         1
         122       0.00      0.00      0.00         0
         123       1.00      1.00      1.00         1
         124       0.00      0.00      0.00         0
         125       1.00      1.00      1.00         1
         126       0.00      0.00      0.00         1
         127       0.00      0.00      0.00         1
         128       0.00      0.00      0.00         0
         129       1.00      1.00      1.00         1
         130       0.00      0.00      0.00         0
         131       0.00      0.00      0.00         1
         132       0.00      0.00      0.00         0
         133       0.00      0.00      0.00         0
         134       0.00      0.00      0.00         1
         135       0.00      0.00      0.00         1
         136       0.00      0.00      0.00         1
         137       1.00      1.00      1.00         1
         138       0.00      0.00      0.00         0
         139       0.00      0.00      0.00         1
         140       0.00      0.00      0.00         0
         141       1.00      1.00      1.00         1
         142       1.00      1.00      1.00         1
         143       0.00      0.00      0.00         1
         144       1.00      1.00      1.00         1
         145       1.00      1.00      1.00         1
         146       0.00      0.00      0.00         1
         147       1.00      1.00      1.00         1
         148       0.00      0.00      0.00         0
         149       0.00      0.00      0.00         0
         150       0.00      0.00      0.00         1
         151       0.00      0.00      0.00         0
         152       0.00      0.00      0.00         1
         153       0.00      0.00      0.00         1
         154       0.00      0.00      0.00         0
         155       0.00      0.00      0.00         0
         156       0.00      0.00      0.00         0
         157       0.00      0.00      0.00         0
         158       0.00      0.00      0.00         1
         159       1.00      0.50      0.67         2
         160       0.00      0.00      0.00         0
         161       0.00      0.00      0.00         1
         162       1.00      1.00      1.00         1
         163       1.00      1.00      1.00         1
         164       1.00      1.00      1.00         1
         165       1.00      1.00      1.00         1
         166       1.00      1.00      1.00         1
         167       0.00      0.00      0.00         0
         168       0.00      0.00      0.00         0
         169       0.00      0.00      0.00         0
         170       0.00      0.00      0.00         0
         171       1.00      1.00      1.00         1
         172       1.00      1.00      1.00         1
         173       0.00      0.00      0.00         0
         174       0.00      0.00      0.00         0
         175       1.00      1.00      1.00         1
         176       1.00      1.00      1.00         1
         177       1.00      1.00      1.00         1
         178       1.00      1.00      1.00         1
         179       0.00      0.00      0.00         0
         180       0.00      0.00      0.00         0
         181       1.00      1.00      1.00         1
         182       0.00      0.00      0.00         1
         183       0.00      0.00      0.00         0
         184       0.00      0.00      0.00         1
         185       0.00      0.00      0.00         0
         186       0.00      0.00      0.00         0
         187       0.00      0.00      0.00         0
         188       0.00      0.00      0.00         1
         189       1.00      1.00      1.00         1
         190       0.00      0.00      0.00         1
         191       1.00      1.00      1.00         1

   micro avg       0.86      0.25      0.39       242
   macro avg       0.26      0.23      0.24       242
weighted avg       0.41      0.25      0.29       242
 samples avg       0.25      0.25      0.25       242

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

K Nearest neighbors¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.03305785123966942
Accuracy: 0.03305785123966942
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       1.00      0.29      0.44         7
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00        15
           5       1.00      0.50      0.67         4
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         6
           9       0.75      0.21      0.33        14
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         4
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         4
          18       0.00      0.00      0.00         4
          19       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         3
          25       0.00      0.00      0.00         2
          26       0.00      0.00      0.00         6
          27       0.00      0.00      0.00         1
          28       0.00      0.00      0.00         9
          29       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         1
          31       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         2
          33       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         0
          35       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         0
          40       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         0
          42       0.00      0.00      0.00         1
          43       0.00      0.00      0.00         1
          44       0.00      0.00      0.00         9
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         0
          47       0.00      0.00      0.00         2
          48       0.00      0.00      0.00         2
          49       0.00      0.00      0.00         1
          50       0.00      0.00      0.00         1
          51       0.00      0.00      0.00         0
          52       0.00      0.00      0.00         2
          53       0.00      0.00      0.00         1
          54       0.00      0.00      0.00         0
          55       0.00      0.00      0.00         1
          56       0.00      0.00      0.00         3
          57       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         1
          59       0.00      0.00      0.00         0
          60       0.00      0.00      0.00         0
          61       0.00      0.00      0.00         0
          62       0.00      0.00      0.00         1
          63       0.00      0.00      0.00         3
          64       0.00      0.00      0.00         2
          65       0.00      0.00      0.00         2
          66       0.00      0.00      0.00         3
          67       0.00      0.00      0.00         1
          68       0.00      0.00      0.00         2
          69       0.00      0.00      0.00         1
          70       0.00      0.00      0.00         0
          71       0.00      0.00      0.00         1
          72       0.00      0.00      0.00         1
          73       0.00      0.00      0.00         0
          74       0.00      0.00      0.00         1
          75       0.00      0.00      0.00         1
          76       0.00      0.00      0.00         1
          77       0.00      0.00      0.00         0
          78       0.00      0.00      0.00         1
          79       0.00      0.00      0.00         1
          80       0.00      0.00      0.00         0
          81       0.00      0.00      0.00         0
          82       0.00      0.00      0.00         1
          83       0.00      0.00      0.00         1
          84       0.00      0.00      0.00         1
          85       0.00      0.00      0.00         1
          86       0.00      0.00      0.00         1
          87       0.00      0.00      0.00         0
          88       0.00      0.00      0.00         1
          89       0.00      0.00      0.00         0
          90       0.00      0.00      0.00         1
          91       0.00      0.00      0.00         1
          92       0.00      0.00      0.00         1
          93       0.00      0.00      0.00         1
          94       0.00      0.00      0.00         0
          95       0.00      0.00      0.00         1
          96       0.00      0.00      0.00         0
          97       0.00      0.00      0.00         0
          98       0.00      0.00      0.00         0
          99       0.00      0.00      0.00         0
         100       0.00      0.00      0.00         1
         101       0.00      0.00      0.00         2
         102       0.00      0.00      0.00         1
         103       0.00      0.00      0.00         1
         104       0.00      0.00      0.00         1
         105       0.00      0.00      0.00         1
         106       0.00      0.00      0.00         1
         107       0.00      0.00      0.00         1
         108       0.00      0.00      0.00         1
         109       0.00      0.00      0.00         1
         110       0.00      0.00      0.00         1
         111       0.00      0.00      0.00         1
         112       0.00      0.00      0.00         1
         113       0.00      0.00      0.00         1
         114       0.00      0.00      0.00         0
         115       0.00      0.00      0.00         1
         116       0.00      0.00      0.00         1
         117       0.00      0.00      0.00         1
         118       0.00      0.00      0.00         1
         119       0.00      0.00      0.00         1
         120       0.00      0.00      0.00         0
         121       0.00      0.00      0.00         1
         122       0.00      0.00      0.00         0
         123       0.00      0.00      0.00         1
         124       0.00      0.00      0.00         0
         125       0.00      0.00      0.00         1
         126       0.00      0.00      0.00         1
         127       0.00      0.00      0.00         1
         128       0.00      0.00      0.00         0
         129       0.00      0.00      0.00         1
         130       0.00      0.00      0.00         0
         131       0.00      0.00      0.00         1
         132       0.00      0.00      0.00         0
         133       0.00      0.00      0.00         0
         134       0.00      0.00      0.00         1
         135       0.00      0.00      0.00         1
         136       0.00      0.00      0.00         1
         137       0.00      0.00      0.00         1
         138       0.00      0.00      0.00         0
         139       0.00      0.00      0.00         1
         140       0.00      0.00      0.00         0
         141       0.00      0.00      0.00         1
         142       0.00      0.00      0.00         1
         143       0.00      0.00      0.00         1
         144       0.00      0.00      0.00         1
         145       0.00      0.00      0.00         1
         146       0.00      0.00      0.00         1
         147       0.00      0.00      0.00         1
         148       0.00      0.00      0.00         0
         149       0.00      0.00      0.00         0
         150       0.00      0.00      0.00         1
         151       0.00      0.00      0.00         0
         152       0.00      0.00      0.00         1
         153       0.00      0.00      0.00         1
         154       0.00      0.00      0.00         0
         155       0.00      0.00      0.00         0
         156       0.00      0.00      0.00         0
         157       0.00      0.00      0.00         0
         158       0.00      0.00      0.00         1
         159       0.50      0.50      0.50         2
         160       0.00      0.00      0.00         0
         161       0.00      0.00      0.00         1
         162       0.00      0.00      0.00         1
         163       0.00      0.00      0.00         1
         164       0.00      0.00      0.00         1
         165       0.00      0.00      0.00         1
         166       0.00      0.00      0.00         1
         167       0.00      0.00      0.00         0
         168       0.00      0.00      0.00         0
         169       0.00      0.00      0.00         0
         170       0.00      0.00      0.00         0
         171       0.00      0.00      0.00         1
         172       0.00      0.00      0.00         1
         173       0.00      0.00      0.00         0
         174       0.00      0.00      0.00         0
         175       0.00      0.00      0.00         1
         176       0.00      0.00      0.00         1
         177       0.00      0.00      0.00         1
         178       0.00      0.00      0.00         1
         179       0.00      0.00      0.00         0
         180       0.00      0.00      0.00         0
         181       0.00      0.00      0.00         1
         182       0.00      0.00      0.00         1
         183       0.00      0.00      0.00         0
         184       0.00      0.00      0.00         1
         185       0.00      0.00      0.00         0
         186       0.00      0.00      0.00         0
         187       0.00      0.00      0.00         0
         188       0.00      0.00      0.00         1
         189       0.00      0.00      0.00         1
         190       0.00      0.00      0.00         1
         191       0.00      0.00      0.00         1

   micro avg       0.44      0.03      0.06       242
   macro avg       0.02      0.01      0.01       242
weighted avg       0.09      0.03      0.05       242
 samples avg       0.03      0.03      0.03       242

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter Tuning¶

In [ ]:
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9]  # List of k values to try
}

pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
print(classification_report(y_test, y_pred))
Best Parameters: {'kneighborsclassifier__n_neighbors': 3}
Best Score: 0.2231404958677686

Ignoring Order of Types 1¶

The first way to ignore order of types is by making a label for each type and matching the data with two labels .

Preprocessing¶

In [ ]:
df  = preprocessed_df.copy()

# Combine Type 1 and Type 2 into a single column
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)
print(df['Types'][0])

# Get unique Pokémon types
unique_types = np.unique(df['Types'].explode())
df.drop(['type_1', 'type_2'], axis=1, inplace=True)

df.head()
('Grass', 'Poison')
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 Types
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False True False False False False (Grass, Poison)
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False True False False False False (Grass, Poison)
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False True False False False False (Grass, Poison)
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False True False False False False (Grass, Poison)
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False True False False False False (Fire, None)

5 rows × 545 columns

For each unique type, we create a binary label. The label is 1 if the Pokemon has that type, (so was present in the type combination tuple) and 0 if it doesn't.

In [ ]:
# Create binary labels for each Pokémon type
for type in unique_types:
    df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)

df.head()
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... Grass Ground Ice None Normal Poison Psychic Rock Steel Water
0 1 0.7 6.9 2 318 45 49 49 65 65 ... 1 0 0 0 0 1 0 0 0 0
1 1 1.0 13.0 2 405 60 62 63 80 80 ... 1 0 0 0 0 1 0 0 0 0
2 1 2.0 100.0 2 525 80 82 83 100 100 ... 1 0 0 0 0 1 0 0 0 0
3 1 2.4 155.5 1 625 80 100 123 122 120 ... 1 0 0 0 0 1 0 0 0 0
4 1 0.6 8.5 2 309 39 52 43 60 50 ... 0 0 0 1 0 0 0 0 0 0

5 rows × 564 columns

In [ ]:
# Some type combinations only occur once so we double them to stratify the data better
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]

df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)

Decision tree¶

In [ ]:
# Split the data into training and testing sets
y = df[unique_types]
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=unique_types), other_data[unique_types], test_size=0.2, stratify=other_data[unique_types], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=unique_types)])
y_train = pd.concat([y_train, singleton_data[unique_types]])
X_test = pd.concat([X_test, singleton_data.drop(columns=unique_types)])
y_test = pd.concat([y_test, singleton_data[unique_types]])

# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Score: ", model.score(X_test, y_test))
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.49173553719008267
Accuracy: 0.49173553719008267
              precision    recall  f1-score   support

           0       0.94      0.89      0.92        19
           1       0.60      0.50      0.55        18
           2       0.71      0.79      0.75        19
           3       0.63      0.63      0.63        19
           4       0.54      0.82      0.65        17
           5       0.79      0.58      0.67        19
           6       0.76      0.83      0.79        23
           7       0.66      0.68      0.67        28
           8       0.68      0.68      0.68        19
           9       1.00      0.72      0.84        29
          10       0.52      0.55      0.54        20
          11       0.29      0.38      0.33        13
          12       0.69      0.69      0.69       101
          13       0.74      0.74      0.74        27
          14       0.62      0.56      0.59        18
          15       0.50      0.50      0.50        26
          16       0.80      0.53      0.64        15
          17       0.43      0.53      0.47        17
          18       0.84      0.73      0.78        37

   micro avg       0.68      0.67      0.67       484
   macro avg       0.67      0.65      0.65       484
weighted avg       0.69      0.67      0.68       484
 samples avg       0.68      0.67      0.67       484

Hyperparameter tuning¶

In [ ]:
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())

param_dist = {
    "decisiontreeclassifier__max_depth": [15, 30, None],  
    "decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}

# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)

# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train) 
y_pred = grid_search_cv.predict(X_test)

# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1}
Accuracy: 0.5165289256198347
              precision    recall  f1-score   support

           0       0.94      0.89      0.92        19
           1       0.52      0.61      0.56        18
           2       0.74      0.74      0.74        19
           3       0.60      0.63      0.62        19
           4       0.60      0.88      0.71        17
           5       0.79      0.58      0.67        19
           6       0.76      0.83      0.79        23
           7       0.66      0.68      0.67        28
           8       0.75      0.79      0.77        19
           9       0.92      0.76      0.83        29
          10       0.73      0.55      0.63        20
          11       0.50      0.46      0.48        13
          12       0.70      0.68      0.69       101
          13       0.70      0.78      0.74        27
          14       0.63      0.67      0.65        18
          15       0.48      0.38      0.43        26
          16       0.64      0.60      0.62        15
          17       0.45      0.53      0.49        17
          18       0.79      0.73      0.76        37

   micro avg       0.69      0.68      0.68       484
   macro avg       0.68      0.67      0.67       484
weighted avg       0.69      0.68      0.68       484
 samples avg       0.69      0.68      0.68       484

Hyperparameter tuning can sometimes lead to worse results than using default settings. This can occur when the tuning process, typically done via cross-validation on the training data, inadvertently overfits the model. This issue is exacerbated when the dataset has classes with very few members, leading to unreliable splits during cross-validation. However, this is much less the case in this dataset because the dataset is much larger.

Random Forest¶

In [ ]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.29338842975206614
Accuracy: 0.29338842975206614
              precision    recall  f1-score   support

           0       1.00      0.89      0.94        19
           1       0.80      0.22      0.35        18
           2       1.00      0.58      0.73        19
           3       1.00      0.37      0.54        19
           4       1.00      0.41      0.58        17
           5       1.00      0.42      0.59        19
           6       1.00      0.61      0.76        23
           7       1.00      0.46      0.63        28
           8       1.00      0.58      0.73        19
           9       1.00      0.69      0.82        29
          10       1.00      0.30      0.46        20
          11       1.00      0.31      0.47        13
          12       0.77      0.73      0.75       101
          13       0.82      0.33      0.47        27
          14       1.00      0.22      0.36        18
          15       1.00      0.35      0.51        26
          16       1.00      0.33      0.50        15
          17       1.00      0.29      0.45        17
          18       1.00      0.51      0.68        37

   micro avg       0.91      0.51      0.65       484
   macro avg       0.97      0.45      0.60       484
weighted avg       0.93      0.51      0.63       484
 samples avg       0.69      0.51      0.57       484

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter tuning¶

In [ ]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint  
# Define the parameter grid
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())

# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "randomforestclassifier__max_depth": [15, 30, None],  
    "randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
    "randomforestclassifier__n_estimators": np.arange(120, 140, 4)
}

# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)

# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)

# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
print(classification_report(y_test, y_pred))
Best score is 0.3305785123966942
Best parameters are {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 120}

K Nearest neigbors¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.02066115702479339
Accuracy: 0.02066115702479339
              precision    recall  f1-score   support

           0       0.50      0.16      0.24        19
           1       0.00      0.00      0.00        18
           2       0.67      0.32      0.43        19
           3       1.00      0.21      0.35        19
           4       1.00      0.18      0.30        17
           5       0.67      0.11      0.18        19
           6       0.40      0.09      0.14        23
           7       0.38      0.18      0.24        28
           8       0.00      0.00      0.00        19
           9       0.40      0.07      0.12        29
          10       0.50      0.10      0.17        20
          11       0.33      0.08      0.12        13
          12       0.48      0.46      0.47       101
          13       0.67      0.37      0.48        27
          14       0.00      0.00      0.00        18
          15       1.00      0.08      0.14        26
          16       0.00      0.00      0.00        15
          17       0.00      0.00      0.00        17
          18       0.24      0.11      0.15        37

   micro avg       0.49      0.19      0.27       484
   macro avg       0.43      0.13      0.19       484
weighted avg       0.45      0.19      0.24       484
 samples avg       0.32      0.19      0.23       484

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter tuning¶

In [ ]:
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9]  # List of k values to try
}

pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
print(classification_report(y_test, y_pred))
Best Parameters: {'kneighborsclassifier__n_neighbors': 3}
Best Score: 0.2727272727272727
              precision    recall  f1-score   support

           0       0.50      0.16      0.24        19
           1       0.00      0.00      0.00        18
           2       0.67      0.32      0.43        19
           3       1.00      0.21      0.35        19
           4       1.00      0.18      0.30        17
           5       0.67      0.11      0.18        19
           6       0.40      0.09      0.14        23
           7       0.38      0.18      0.24        28
           8       0.00      0.00      0.00        19
           9       0.40      0.07      0.12        29
          10       0.50      0.10      0.17        20
          11       0.33      0.08      0.12        13
          12       0.48      0.46      0.47       101
          13       0.67      0.37      0.48        27
          14       0.00      0.00      0.00        18
          15       1.00      0.08      0.14        26
          16       0.00      0.00      0.00        15
          17       0.00      0.00      0.00        17
          18       0.24      0.11      0.15        37

   micro avg       0.49      0.19      0.27       484
   macro avg       0.43      0.13      0.19       484
weighted avg       0.45      0.19      0.24       484
 samples avg       0.32      0.19      0.23       484

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Ignoring Order of Types 2¶

The second way of ignoring order of types is to use binary labels for each sorted type combination.

Preprocessing¶

We again combine the Type 1 and Type 2 columns into a single column that contains a list of type combination tuples that are sorted.

In [ ]:
df = preprocessed_df.copy()

df['Types'] = df[['type_1', 'type_2']].apply(lambda x: sorted(tuple(filter(lambda y: pd.notna(y), x))), axis=1)

df['Types'] = df['Types'].astype(str)

# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)
# print head
df.head()
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Flying egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 Types
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False False True False False False False ['Grass', 'Poison']
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False False True False False False False ['Grass', 'Poison']
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False False True False False False False ['Grass', 'Poison']
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False False True False False False False ['Grass', 'Poison']
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False False True False False False False ['Fire', 'None']

5 rows × 545 columns

In [ ]:
# Find classes with only one type
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()

Make binary labels for each sorted type combination

In [ ]:
# Create binary labels for each Pokémon type combination
unique_type_combinations = df['Types'].unique()
for type_combination in unique_type_combinations:
    df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)

singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]

print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 23
number of unique type combinations 154
1044
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... ['Fairy', 'Ghost'] ['Dragon', 'Normal'] ['Dragon', 'Fighting'] ['Poison', 'Rock'] ['Fighting', 'Ghost'] ['Bug', 'Psychic'] ['Electric', 'Poison'] ['Dark', 'Fairy'] ['Bug', 'Ice'] ['Dark', 'Electric']
0 1 0.7 6.9 2 318 45 49 49 65 65 ... 0 0 0 0 0 0 0 0 0 0
1 1 1.0 13.0 2 405 60 62 63 80 80 ... 0 0 0 0 0 0 0 0 0 0
2 1 2.0 100.0 2 525 80 82 83 100 100 ... 0 0 0 0 0 0 0 0 0 0
3 1 2.4 155.5 1 625 80 100 123 122 120 ... 0 0 0 0 0 0 0 0 0 0
4 1 0.6 8.5 2 309 39 52 43 60 50 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 699 columns

In [ ]:
# Drop the 'Types' column
df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)

Decision Tree¶

In [ ]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=unique_type_combinations), other_data[unique_type_combinations], test_size=0.2, stratify=other_data[unique_type_combinations], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=unique_type_combinations)])
y_train = pd.concat([y_train, singleton_data[unique_type_combinations]])
X_test = pd.concat([X_test, singleton_data.drop(columns=unique_type_combinations)])
y_test = pd.concat([y_test, singleton_data[unique_type_combinations]])


# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.9692982456140351
Accuracy: 0.9692982456140351
              precision    recall  f1-score   support

           0       0.88      0.91      0.90        33
           1       0.88      0.91      0.90        33
           2       0.88      0.91      0.90        33
           3       0.88      0.91      0.90        33
           4       0.88      0.91      0.90        33
           5       0.88      0.91      0.90        33
           6       0.88      0.91      0.90        33
           7       0.88      0.91      0.90        33
           8       0.88      0.91      0.90        33
           9       0.88      0.91      0.90        33
          10       0.88      0.91      0.90        33
          11       0.88      0.91      0.90        33
          12       0.88      0.91      0.90        33
          13       0.88      0.91      0.90        33
          14       0.88      0.91      0.90        33
          15       0.88      0.91      0.90        33
          16       0.88      0.91      0.90        33
          17       0.88      0.91      0.90        33
          18       0.88      0.91      0.90        33
          19       0.88      0.91      0.90        33
          20       0.88      0.91      0.90        33
          21       0.88      0.91      0.90        33
          22       0.88      0.91      0.90        33
          23       0.88      0.91      0.90        33
          24       0.88      0.91      0.90        33
          25       0.88      0.91      0.90        33
          26       0.88      0.91      0.90        33
          27       0.88      0.91      0.90        33
          28       0.88      0.91      0.90        33
          29       0.88      0.91      0.90        33
          30       0.88      0.91      0.90        33
          31       0.88      0.91      0.90        33
          32       0.88      0.91      0.90        33
          33       0.88      0.91      0.90        33
          34       0.88      0.91      0.90        33
          35       0.88      0.91      0.90        33
          36       0.88      0.91      0.90        33
          37       0.88      0.91      0.90        33
          38       0.88      0.91      0.90        33
          39       0.88      0.91      0.90        33
          40       0.88      0.91      0.90        33
          41       0.88      0.91      0.90        33
          42       0.88      0.91      0.90        33
          43       0.88      0.91      0.90        33
          44       0.88      0.91      0.90        33
          45       0.88      0.91      0.90        33
          46       0.88      0.91      0.90        33
          47       0.88      0.91      0.90        33
          48       0.88      0.91      0.90        33
          49       0.88      0.91      0.90        33
          50       0.88      0.91      0.90        33
          51       0.88      0.91      0.90        33
          52       0.88      0.91      0.90        33
          53       0.88      0.91      0.90        33
          54       0.88      0.91      0.90        33
          55       0.88      0.91      0.90        33
          56       0.88      0.91      0.90        33
          57       0.88      0.91      0.90        33
          58       0.88      0.91      0.90        33
          59       0.88      0.91      0.90        33
          60       0.88      0.91      0.90        33
          61       0.88      0.91      0.90        33
          62       0.88      0.91      0.90        33
          63       0.88      0.91      0.90        33
          64       0.88      0.91      0.90        33
          65       0.88      0.91      0.90        33
          66       0.88      0.91      0.90        33
          67       0.88      0.91      0.90        33
          68       0.88      0.91      0.90        33
          69       0.88      0.91      0.90        33
          70       0.88      0.91      0.90        33
          71       0.88      0.91      0.90        33
          72       0.88      0.91      0.90        33
          73       0.88      0.91      0.90        33
          74       0.88      0.91      0.90        33
          75       0.88      0.91      0.90        33
          76       0.88      0.91      0.90        33
          77       0.88      0.91      0.90        33
          78       0.88      0.91      0.90        33
          79       0.88      0.91      0.90        33
          80       0.88      0.91      0.90        33
          81       0.88      0.91      0.90        33
          82       0.88      0.91      0.90        33
          83       0.88      0.91      0.90        33
          84       0.88      0.91      0.90        33
          85       0.88      0.91      0.90        33
          86       0.88      0.91      0.90        33
          87       0.88      0.91      0.90        33
          88       0.88      0.91      0.90        33
          89       0.88      0.91      0.90        33
          90       0.88      0.91      0.90        33
          91       0.88      0.91      0.90        33
          92       0.88      0.91      0.90        33
          93       0.88      0.91      0.90        33
          94       0.88      0.91      0.90        33
          95       0.88      0.91      0.90        33
          96       0.88      0.91      0.90        33
          97       0.88      0.91      0.90        33
          98       0.88      0.91      0.90        33
          99       0.88      0.91      0.90        33
         100       0.88      0.91      0.90        33
         101       0.88      0.91      0.90        33
         102       0.88      0.91      0.90        33
         103       0.88      0.91      0.90        33
         104       0.88      0.91      0.90        33
         105       0.88      0.91      0.90        33
         106       0.88      0.91      0.90        33
         107       0.88      0.91      0.90        33
         108       0.88      0.91      0.90        33
         109       0.88      0.91      0.90        33
         110       0.88      0.91      0.90        33
         111       0.88      0.91      0.90        33
         112       0.88      0.91      0.90        33
         113       0.88      0.91      0.90        33
         114       0.88      0.91      0.90        33
         115       0.88      0.91      0.90        33
         116       0.88      0.91      0.90        33
         117       0.88      0.91      0.90        33
         118       0.88      0.91      0.90        33
         119       0.88      0.91      0.90        33
         120       0.88      0.91      0.90        33
         121       0.88      0.91      0.90        33
         122       0.88      0.91      0.90        33
         123       0.88      0.91      0.90        33
         124       0.88      0.91      0.90        33
         125       0.88      0.91      0.90        33
         126       0.88      0.91      0.90        33
         127       0.88      0.91      0.90        33
         128       0.88      0.91      0.90        33
         129       0.88      0.91      0.90        33
         130       0.88      0.91      0.90        33
         131       0.88      0.91      0.90        33
         132       0.88      0.91      0.90        33
         133       0.88      0.91      0.90        33
         134       0.88      0.91      0.90        33
         135       0.88      0.91      0.90        33
         136       0.88      0.91      0.90        33
         137       0.88      0.91      0.90        33
         138       0.88      0.91      0.90        33
         139       0.88      0.91      0.90        33
         140       0.88      0.91      0.90        33
         141       0.88      0.91      0.90        33
         142       0.88      0.91      0.90        33
         143       0.88      0.91      0.90        33
         144       0.88      0.91      0.90        33
         145       0.88      0.91      0.90        33
         146       0.88      0.91      0.90        33
         147       0.88      0.91      0.90        33
         148       0.88      0.91      0.90        33
         149       0.88      0.91      0.90        33
         150       0.88      0.91      0.90        33
         151       0.88      0.91      0.90        33
         152       0.88      0.91      0.90        33
         153       0.88      0.91      0.90        33

   micro avg       0.88      0.91      0.90      5082
   macro avg       0.88      0.91      0.90      5082
weighted avg       0.88      0.91      0.90      5082
 samples avg       0.13      0.13      0.13      5082

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Here we have the highest accuracy score of all the models.

Hyperparameter Tuning¶

In [ ]:
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())

param_dist = {
    "decisiontreeclassifier__max_depth": [15, 30, None],  
    "decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}

# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)

# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train) 
y_pred = grid_search_cv.predict(X_test)

# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': 30, 'decisiontreeclassifier__min_samples_leaf': 1}
Accuracy: 0.9692982456140351
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        33
           1       0.86      0.94      0.90        33
           2       0.86      0.94      0.90        33
           3       0.86      0.94      0.90        33
           4       0.86      0.94      0.90        33
           5       0.86      0.94      0.90        33
           6       0.86      0.94      0.90        33
           7       0.86      0.94      0.90        33
           8       0.86      0.94      0.90        33
           9       0.86      0.94      0.90        33
          10       0.86      0.94      0.90        33
          11       0.86      0.94      0.90        33
          12       0.86      0.94      0.90        33
          13       0.86      0.94      0.90        33
          14       0.86      0.94      0.90        33
          15       0.86      0.94      0.90        33
          16       0.86      0.94      0.90        33
          17       0.86      0.94      0.90        33
          18       0.86      0.94      0.90        33
          19       0.86      0.94      0.90        33
          20       0.86      0.94      0.90        33
          21       0.86      0.94      0.90        33
          22       0.86      0.94      0.90        33
          23       0.86      0.94      0.90        33
          24       0.86      0.94      0.90        33
          25       0.86      0.94      0.90        33
          26       0.86      0.94      0.90        33
          27       0.86      0.94      0.90        33
          28       0.86      0.94      0.90        33
          29       0.86      0.94      0.90        33
          30       0.86      0.94      0.90        33
          31       0.86      0.94      0.90        33
          32       0.86      0.94      0.90        33
          33       0.86      0.94      0.90        33
          34       0.86      0.94      0.90        33
          35       0.86      0.94      0.90        33
          36       0.86      0.94      0.90        33
          37       0.86      0.94      0.90        33
          38       0.86      0.94      0.90        33
          39       0.86      0.94      0.90        33
          40       0.86      0.94      0.90        33
          41       0.86      0.94      0.90        33
          42       0.86      0.94      0.90        33
          43       0.86      0.94      0.90        33
          44       0.86      0.94      0.90        33
          45       0.86      0.94      0.90        33
          46       0.86      0.94      0.90        33
          47       0.86      0.94      0.90        33
          48       0.86      0.94      0.90        33
          49       0.86      0.94      0.90        33
          50       0.86      0.94      0.90        33
          51       0.86      0.94      0.90        33
          52       0.86      0.94      0.90        33
          53       0.86      0.94      0.90        33
          54       0.86      0.94      0.90        33
          55       0.86      0.94      0.90        33
          56       0.86      0.94      0.90        33
          57       0.86      0.94      0.90        33
          58       0.86      0.94      0.90        33
          59       0.86      0.94      0.90        33
          60       0.86      0.94      0.90        33
          61       0.86      0.94      0.90        33
          62       0.86      0.94      0.90        33
          63       0.86      0.94      0.90        33
          64       0.86      0.94      0.90        33
          65       0.86      0.94      0.90        33
          66       0.86      0.94      0.90        33
          67       0.86      0.94      0.90        33
          68       0.86      0.94      0.90        33
          69       0.86      0.94      0.90        33
          70       0.86      0.94      0.90        33
          71       0.86      0.94      0.90        33
          72       0.86      0.94      0.90        33
          73       0.86      0.94      0.90        33
          74       0.86      0.94      0.90        33
          75       0.86      0.94      0.90        33
          76       0.86      0.94      0.90        33
          77       0.86      0.94      0.90        33
          78       0.86      0.94      0.90        33
          79       0.86      0.94      0.90        33
          80       0.86      0.94      0.90        33
          81       0.86      0.94      0.90        33
          82       0.86      0.94      0.90        33
          83       0.86      0.94      0.90        33
          84       0.86      0.94      0.90        33
          85       0.86      0.94      0.90        33
          86       0.86      0.94      0.90        33
          87       0.86      0.94      0.90        33
          88       0.86      0.94      0.90        33
          89       0.86      0.94      0.90        33
          90       0.86      0.94      0.90        33
          91       0.86      0.94      0.90        33
          92       0.86      0.94      0.90        33
          93       0.86      0.94      0.90        33
          94       0.86      0.94      0.90        33
          95       0.86      0.94      0.90        33
          96       0.86      0.94      0.90        33
          97       0.86      0.94      0.90        33
          98       0.86      0.94      0.90        33
          99       0.86      0.94      0.90        33
         100       0.86      0.94      0.90        33
         101       0.86      0.94      0.90        33
         102       0.86      0.94      0.90        33
         103       0.86      0.94      0.90        33
         104       0.86      0.94      0.90        33
         105       0.86      0.94      0.90        33
         106       0.86      0.94      0.90        33
         107       0.86      0.94      0.90        33
         108       0.86      0.94      0.90        33
         109       0.86      0.94      0.90        33
         110       0.86      0.94      0.90        33
         111       0.86      0.94      0.90        33
         112       0.86      0.94      0.90        33
         113       0.86      0.94      0.90        33
         114       0.86      0.94      0.90        33
         115       0.86      0.94      0.90        33
         116       0.86      0.94      0.90        33
         117       0.86      0.94      0.90        33
         118       0.86      0.94      0.90        33
         119       0.86      0.94      0.90        33
         120       0.86      0.94      0.90        33
         121       0.86      0.94      0.90        33
         122       0.86      0.94      0.90        33
         123       0.86      0.94      0.90        33
         124       0.86      0.94      0.90        33
         125       0.86      0.94      0.90        33
         126       0.86      0.94      0.90        33
         127       0.86      0.94      0.90        33
         128       0.86      0.94      0.90        33
         129       0.86      0.94      0.90        33
         130       0.86      0.94      0.90        33
         131       0.86      0.94      0.90        33
         132       0.86      0.94      0.90        33
         133       0.86      0.94      0.90        33
         134       0.86      0.94      0.90        33
         135       0.86      0.94      0.90        33
         136       0.86      0.94      0.90        33
         137       0.86      0.94      0.90        33
         138       0.86      0.94      0.90        33
         139       0.86      0.94      0.90        33
         140       0.86      0.94      0.90        33
         141       0.86      0.94      0.90        33
         142       0.86      0.94      0.90        33
         143       0.86      0.94      0.90        33
         144       0.86      0.94      0.90        33
         145       0.86      0.94      0.90        33
         146       0.86      0.94      0.90        33
         147       0.86      0.94      0.90        33
         148       0.86      0.94      0.90        33
         149       0.86      0.94      0.90        33
         150       0.86      0.94      0.90        33
         151       0.86      0.94      0.90        33
         152       0.86      0.94      0.90        33
         153       0.86      0.94      0.90        33

   micro avg       0.86      0.94      0.90      5082
   macro avg       0.86      0.94      0.90      5082
weighted avg       0.86      0.94      0.90      5082
 samples avg       0.14      0.14      0.14      5082

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Random forests¶

In [ ]:
# Initialize and train the decision tree classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test,y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.9385964912280702
Accuracy: 0.9385964912280702
              precision    recall  f1-score   support

           0       0.95      0.61      0.74        33
           1       0.95      0.61      0.74        33
           2       0.95      0.61      0.74        33
           3       0.95      0.61      0.74        33
           4       0.95      0.61      0.74        33
           5       0.95      0.61      0.74        33
           6       0.95      0.61      0.74        33
           7       0.95      0.61      0.74        33
           8       0.95      0.61      0.74        33
           9       0.95      0.61      0.74        33
          10       0.95      0.61      0.74        33
          11       0.95      0.61      0.74        33
          12       0.95      0.61      0.74        33
          13       0.95      0.61      0.74        33
          14       0.95      0.61      0.74        33
          15       0.95      0.61      0.74        33
          16       0.95      0.61      0.74        33
          17       0.95      0.61      0.74        33
          18       0.95      0.61      0.74        33
          19       0.95      0.61      0.74        33
          20       0.95      0.61      0.74        33
          21       0.95      0.61      0.74        33
          22       0.95      0.61      0.74        33
          23       0.95      0.61      0.74        33
          24       0.95      0.61      0.74        33
          25       0.95      0.61      0.74        33
          26       0.95      0.61      0.74        33
          27       0.95      0.61      0.74        33
          28       0.95      0.61      0.74        33
          29       0.95      0.61      0.74        33
          30       0.95      0.61      0.74        33
          31       0.95      0.61      0.74        33
          32       0.95      0.61      0.74        33
          33       0.95      0.61      0.74        33
          34       0.95      0.61      0.74        33
          35       0.95      0.61      0.74        33
          36       0.95      0.61      0.74        33
          37       0.95      0.61      0.74        33
          38       0.95      0.61      0.74        33
          39       0.95      0.61      0.74        33
          40       0.95      0.61      0.74        33
          41       0.95      0.61      0.74        33
          42       0.95      0.61      0.74        33
          43       0.95      0.61      0.74        33
          44       0.95      0.61      0.74        33
          45       0.95      0.61      0.74        33
          46       0.95      0.61      0.74        33
          47       0.95      0.61      0.74        33
          48       0.95      0.61      0.74        33
          49       0.95      0.61      0.74        33
          50       0.95      0.61      0.74        33
          51       0.95      0.61      0.74        33
          52       0.95      0.61      0.74        33
          53       0.95      0.61      0.74        33
          54       0.95      0.61      0.74        33
          55       0.95      0.61      0.74        33
          56       0.95      0.61      0.74        33
          57       0.95      0.61      0.74        33
          58       0.95      0.61      0.74        33
          59       0.95      0.61      0.74        33
          60       0.95      0.61      0.74        33
          61       0.95      0.61      0.74        33
          62       0.95      0.61      0.74        33
          63       0.95      0.61      0.74        33
          64       0.95      0.61      0.74        33
          65       0.95      0.61      0.74        33
          66       0.95      0.61      0.74        33
          67       0.95      0.61      0.74        33
          68       0.95      0.61      0.74        33
          69       0.95      0.61      0.74        33
          70       0.95      0.61      0.74        33
          71       0.95      0.61      0.74        33
          72       0.95      0.61      0.74        33
          73       0.95      0.61      0.74        33
          74       0.95      0.61      0.74        33
          75       0.95      0.61      0.74        33
          76       0.95      0.61      0.74        33
          77       0.95      0.61      0.74        33
          78       0.95      0.61      0.74        33
          79       0.95      0.61      0.74        33
          80       0.95      0.61      0.74        33
          81       0.95      0.61      0.74        33
          82       0.95      0.61      0.74        33
          83       0.95      0.61      0.74        33
          84       0.95      0.61      0.74        33
          85       0.95      0.61      0.74        33
          86       0.95      0.61      0.74        33
          87       0.95      0.61      0.74        33
          88       0.95      0.61      0.74        33
          89       0.95      0.61      0.74        33
          90       0.95      0.61      0.74        33
          91       0.95      0.61      0.74        33
          92       0.95      0.61      0.74        33
          93       0.95      0.61      0.74        33
          94       0.95      0.61      0.74        33
          95       0.95      0.61      0.74        33
          96       0.95      0.61      0.74        33
          97       0.95      0.61      0.74        33
          98       0.95      0.61      0.74        33
          99       0.95      0.61      0.74        33
         100       0.95      0.61      0.74        33
         101       0.95      0.61      0.74        33
         102       0.95      0.61      0.74        33
         103       0.95      0.61      0.74        33
         104       0.95      0.61      0.74        33
         105       0.95      0.61      0.74        33
         106       0.95      0.61      0.74        33
         107       0.95      0.61      0.74        33
         108       0.95      0.61      0.74        33
         109       0.95      0.61      0.74        33
         110       0.95      0.61      0.74        33
         111       0.95      0.61      0.74        33
         112       0.95      0.61      0.74        33
         113       0.95      0.61      0.74        33
         114       0.95      0.61      0.74        33
         115       0.95      0.61      0.74        33
         116       0.95      0.61      0.74        33
         117       0.95      0.61      0.74        33
         118       0.95      0.61      0.74        33
         119       0.95      0.61      0.74        33
         120       0.95      0.61      0.74        33
         121       0.95      0.61      0.74        33
         122       0.95      0.61      0.74        33
         123       0.95      0.61      0.74        33
         124       0.95      0.61      0.74        33
         125       0.95      0.61      0.74        33
         126       0.95      0.61      0.74        33
         127       0.95      0.61      0.74        33
         128       0.95      0.61      0.74        33
         129       0.95      0.61      0.74        33
         130       0.95      0.61      0.74        33
         131       0.95      0.61      0.74        33
         132       0.95      0.61      0.74        33
         133       0.95      0.61      0.74        33
         134       0.95      0.61      0.74        33
         135       0.95      0.61      0.74        33
         136       0.95      0.61      0.74        33
         137       0.95      0.61      0.74        33
         138       0.95      0.61      0.74        33
         139       0.95      0.61      0.74        33
         140       0.95      0.61      0.74        33
         141       0.95      0.61      0.74        33
         142       0.95      0.61      0.74        33
         143       0.95      0.61      0.74        33
         144       0.95      0.61      0.74        33
         145       0.95      0.61      0.74        33
         146       0.95      0.61      0.74        33
         147       0.95      0.61      0.74        33
         148       0.95      0.61      0.74        33
         149       0.95      0.61      0.74        33
         150       0.95      0.61      0.74        33
         151       0.95      0.61      0.74        33
         152       0.95      0.61      0.74        33
         153       0.95      0.61      0.74        33

   micro avg       0.95      0.61      0.74      5082
   macro avg       0.95      0.61      0.74      5082
weighted avg       0.95      0.61      0.74      5082
 samples avg       0.09      0.09      0.09      5082

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter Tuning¶

In [ ]:
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())

# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "randomforestclassifier__max_depth": [28, 30, 32, None],  
    "randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
    "randomforestclassifier__n_estimators": np.arange(60, 100, 4)
}

# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)

# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)

# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
print(classification_report(y_test, y_pred))
Best score is 0.956140350877193
Best parameters are {'randomforestclassifier__max_depth': 28, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 68}

K Nearest Neighbors¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score:  0.8333333333333334
Accuracy: 0.8333333333333334
              precision    recall  f1-score   support

           0       0.14      0.03      0.05        33
           1       0.14      0.03      0.05        33
           2       0.14      0.03      0.05        33
           3       0.14      0.03      0.05        33
           4       0.14      0.03      0.05        33
           5       0.14      0.03      0.05        33
           6       0.14      0.03      0.05        33
           7       0.14      0.03      0.05        33
           8       0.14      0.03      0.05        33
           9       0.14      0.03      0.05        33
          10       0.14      0.03      0.05        33
          11       0.14      0.03      0.05        33
          12       0.14      0.03      0.05        33
          13       0.14      0.03      0.05        33
          14       0.14      0.03      0.05        33
          15       0.14      0.03      0.05        33
          16       0.14      0.03      0.05        33
          17       0.14      0.03      0.05        33
          18       0.14      0.03      0.05        33
          19       0.14      0.03      0.05        33
          20       0.14      0.03      0.05        33
          21       0.14      0.03      0.05        33
          22       0.14      0.03      0.05        33
          23       0.14      0.03      0.05        33
          24       0.14      0.03      0.05        33
          25       0.14      0.03      0.05        33
          26       0.14      0.03      0.05        33
          27       0.14      0.03      0.05        33
          28       0.14      0.03      0.05        33
          29       0.14      0.03      0.05        33
          30       0.14      0.03      0.05        33
          31       0.14      0.03      0.05        33
          32       0.14      0.03      0.05        33
          33       0.14      0.03      0.05        33
          34       0.14      0.03      0.05        33
          35       0.14      0.03      0.05        33
          36       0.14      0.03      0.05        33
          37       0.14      0.03      0.05        33
          38       0.14      0.03      0.05        33
          39       0.14      0.03      0.05        33
          40       0.14      0.03      0.05        33
          41       0.14      0.03      0.05        33
          42       0.14      0.03      0.05        33
          43       0.14      0.03      0.05        33
          44       0.14      0.03      0.05        33
          45       0.14      0.03      0.05        33
          46       0.14      0.03      0.05        33
          47       0.14      0.03      0.05        33
          48       0.14      0.03      0.05        33
          49       0.14      0.03      0.05        33
          50       0.14      0.03      0.05        33
          51       0.14      0.03      0.05        33
          52       0.14      0.03      0.05        33
          53       0.14      0.03      0.05        33
          54       0.14      0.03      0.05        33
          55       0.14      0.03      0.05        33
          56       0.14      0.03      0.05        33
          57       0.14      0.03      0.05        33
          58       0.14      0.03      0.05        33
          59       0.14      0.03      0.05        33
          60       0.14      0.03      0.05        33
          61       0.14      0.03      0.05        33
          62       0.14      0.03      0.05        33
          63       0.14      0.03      0.05        33
          64       0.14      0.03      0.05        33
          65       0.14      0.03      0.05        33
          66       0.14      0.03      0.05        33
          67       0.14      0.03      0.05        33
          68       0.14      0.03      0.05        33
          69       0.14      0.03      0.05        33
          70       0.14      0.03      0.05        33
          71       0.14      0.03      0.05        33
          72       0.14      0.03      0.05        33
          73       0.14      0.03      0.05        33
          74       0.14      0.03      0.05        33
          75       0.14      0.03      0.05        33
          76       0.14      0.03      0.05        33
          77       0.14      0.03      0.05        33
          78       0.14      0.03      0.05        33
          79       0.14      0.03      0.05        33
          80       0.14      0.03      0.05        33
          81       0.14      0.03      0.05        33
          82       0.14      0.03      0.05        33
          83       0.14      0.03      0.05        33
          84       0.14      0.03      0.05        33
          85       0.14      0.03      0.05        33
          86       0.14      0.03      0.05        33
          87       0.14      0.03      0.05        33
          88       0.14      0.03      0.05        33
          89       0.14      0.03      0.05        33
          90       0.14      0.03      0.05        33
          91       0.14      0.03      0.05        33
          92       0.14      0.03      0.05        33
          93       0.14      0.03      0.05        33
          94       0.14      0.03      0.05        33
          95       0.14      0.03      0.05        33
          96       0.14      0.03      0.05        33
          97       0.14      0.03      0.05        33
          98       0.14      0.03      0.05        33
          99       0.14      0.03      0.05        33
         100       0.14      0.03      0.05        33
         101       0.14      0.03      0.05        33
         102       0.14      0.03      0.05        33
         103       0.14      0.03      0.05        33
         104       0.14      0.03      0.05        33
         105       0.14      0.03      0.05        33
         106       0.14      0.03      0.05        33
         107       0.14      0.03      0.05        33
         108       0.14      0.03      0.05        33
         109       0.14      0.03      0.05        33
         110       0.14      0.03      0.05        33
         111       0.14      0.03      0.05        33
         112       0.14      0.03      0.05        33
         113       0.14      0.03      0.05        33
         114       0.14      0.03      0.05        33
         115       0.14      0.03      0.05        33
         116       0.14      0.03      0.05        33
         117       0.14      0.03      0.05        33
         118       0.14      0.03      0.05        33
         119       0.14      0.03      0.05        33
         120       0.14      0.03      0.05        33
         121       0.14      0.03      0.05        33
         122       0.14      0.03      0.05        33
         123       0.14      0.03      0.05        33
         124       0.14      0.03      0.05        33
         125       0.14      0.03      0.05        33
         126       0.14      0.03      0.05        33
         127       0.14      0.03      0.05        33
         128       0.14      0.03      0.05        33
         129       0.14      0.03      0.05        33
         130       0.14      0.03      0.05        33
         131       0.14      0.03      0.05        33
         132       0.14      0.03      0.05        33
         133       0.14      0.03      0.05        33
         134       0.14      0.03      0.05        33
         135       0.14      0.03      0.05        33
         136       0.14      0.03      0.05        33
         137       0.14      0.03      0.05        33
         138       0.14      0.03      0.05        33
         139       0.14      0.03      0.05        33
         140       0.14      0.03      0.05        33
         141       0.14      0.03      0.05        33
         142       0.14      0.03      0.05        33
         143       0.14      0.03      0.05        33
         144       0.14      0.03      0.05        33
         145       0.14      0.03      0.05        33
         146       0.14      0.03      0.05        33
         147       0.14      0.03      0.05        33
         148       0.14      0.03      0.05        33
         149       0.14      0.03      0.05        33
         150       0.14      0.03      0.05        33
         151       0.14      0.03      0.05        33
         152       0.14      0.03      0.05        33
         153       0.14      0.03      0.05        33

   micro avg       0.14      0.03      0.05      5082
   macro avg       0.14      0.03      0.05      5082
weighted avg       0.14      0.03      0.05      5082
 samples avg       0.00      0.00      0.00      5082

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Hyperparameter Tuning¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9]  # List of k values to try
}

pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
print(classification_report(y_test, y_pred))
Best Parameters: {'kneighborsclassifier__n_neighbors': 3}
Best Score: 0.9035087719298246
              precision    recall  f1-score   support

           0       0.14      0.03      0.05        33
           1       0.14      0.03      0.05        33
           2       0.14      0.03      0.05        33
           3       0.14      0.03      0.05        33
           4       0.14      0.03      0.05        33
           5       0.14      0.03      0.05        33
           6       0.14      0.03      0.05        33
           7       0.14      0.03      0.05        33
           8       0.14      0.03      0.05        33
           9       0.14      0.03      0.05        33
          10       0.14      0.03      0.05        33
          11       0.14      0.03      0.05        33
          12       0.14      0.03      0.05        33
          13       0.14      0.03      0.05        33
          14       0.14      0.03      0.05        33
          15       0.14      0.03      0.05        33
          16       0.14      0.03      0.05        33
          17       0.14      0.03      0.05        33
          18       0.14      0.03      0.05        33
          19       0.14      0.03      0.05        33
          20       0.14      0.03      0.05        33
          21       0.14      0.03      0.05        33
          22       0.14      0.03      0.05        33
          23       0.14      0.03      0.05        33
          24       0.14      0.03      0.05        33
          25       0.14      0.03      0.05        33
          26       0.14      0.03      0.05        33
          27       0.14      0.03      0.05        33
          28       0.14      0.03      0.05        33
          29       0.14      0.03      0.05        33
          30       0.14      0.03      0.05        33
          31       0.14      0.03      0.05        33
          32       0.14      0.03      0.05        33
          33       0.14      0.03      0.05        33
          34       0.14      0.03      0.05        33
          35       0.14      0.03      0.05        33
          36       0.14      0.03      0.05        33
          37       0.14      0.03      0.05        33
          38       0.14      0.03      0.05        33
          39       0.14      0.03      0.05        33
          40       0.14      0.03      0.05        33
          41       0.14      0.03      0.05        33
          42       0.14      0.03      0.05        33
          43       0.14      0.03      0.05        33
          44       0.14      0.03      0.05        33
          45       0.14      0.03      0.05        33
          46       0.14      0.03      0.05        33
          47       0.14      0.03      0.05        33
          48       0.14      0.03      0.05        33
          49       0.14      0.03      0.05        33
          50       0.14      0.03      0.05        33
          51       0.14      0.03      0.05        33
          52       0.14      0.03      0.05        33
          53       0.14      0.03      0.05        33
          54       0.14      0.03      0.05        33
          55       0.14      0.03      0.05        33
          56       0.14      0.03      0.05        33
          57       0.14      0.03      0.05        33
          58       0.14      0.03      0.05        33
          59       0.14      0.03      0.05        33
          60       0.14      0.03      0.05        33
          61       0.14      0.03      0.05        33
          62       0.14      0.03      0.05        33
          63       0.14      0.03      0.05        33
          64       0.14      0.03      0.05        33
          65       0.14      0.03      0.05        33
          66       0.14      0.03      0.05        33
          67       0.14      0.03      0.05        33
          68       0.14      0.03      0.05        33
          69       0.14      0.03      0.05        33
          70       0.14      0.03      0.05        33
          71       0.14      0.03      0.05        33
          72       0.14      0.03      0.05        33
          73       0.14      0.03      0.05        33
          74       0.14      0.03      0.05        33
          75       0.14      0.03      0.05        33
          76       0.14      0.03      0.05        33
          77       0.14      0.03      0.05        33
          78       0.14      0.03      0.05        33
          79       0.14      0.03      0.05        33
          80       0.14      0.03      0.05        33
          81       0.14      0.03      0.05        33
          82       0.14      0.03      0.05        33
          83       0.14      0.03      0.05        33
          84       0.14      0.03      0.05        33
          85       0.14      0.03      0.05        33
          86       0.14      0.03      0.05        33
          87       0.14      0.03      0.05        33
          88       0.14      0.03      0.05        33
          89       0.14      0.03      0.05        33
          90       0.14      0.03      0.05        33
          91       0.14      0.03      0.05        33
          92       0.14      0.03      0.05        33
          93       0.14      0.03      0.05        33
          94       0.14      0.03      0.05        33
          95       0.14      0.03      0.05        33
          96       0.14      0.03      0.05        33
          97       0.14      0.03      0.05        33
          98       0.14      0.03      0.05        33
          99       0.14      0.03      0.05        33
         100       0.14      0.03      0.05        33
         101       0.14      0.03      0.05        33
         102       0.14      0.03      0.05        33
         103       0.14      0.03      0.05        33
         104       0.14      0.03      0.05        33
         105       0.14      0.03      0.05        33
         106       0.14      0.03      0.05        33
         107       0.14      0.03      0.05        33
         108       0.14      0.03      0.05        33
         109       0.14      0.03      0.05        33
         110       0.14      0.03      0.05        33
         111       0.14      0.03      0.05        33
         112       0.14      0.03      0.05        33
         113       0.14      0.03      0.05        33
         114       0.14      0.03      0.05        33
         115       0.14      0.03      0.05        33
         116       0.14      0.03      0.05        33
         117       0.14      0.03      0.05        33
         118       0.14      0.03      0.05        33
         119       0.14      0.03      0.05        33
         120       0.14      0.03      0.05        33
         121       0.14      0.03      0.05        33
         122       0.14      0.03      0.05        33
         123       0.14      0.03      0.05        33
         124       0.14      0.03      0.05        33
         125       0.14      0.03      0.05        33
         126       0.14      0.03      0.05        33
         127       0.14      0.03      0.05        33
         128       0.14      0.03      0.05        33
         129       0.14      0.03      0.05        33
         130       0.14      0.03      0.05        33
         131       0.14      0.03      0.05        33
         132       0.14      0.03      0.05        33
         133       0.14      0.03      0.05        33
         134       0.14      0.03      0.05        33
         135       0.14      0.03      0.05        33
         136       0.14      0.03      0.05        33
         137       0.14      0.03      0.05        33
         138       0.14      0.03      0.05        33
         139       0.14      0.03      0.05        33
         140       0.14      0.03      0.05        33
         141       0.14      0.03      0.05        33
         142       0.14      0.03      0.05        33
         143       0.14      0.03      0.05        33
         144       0.14      0.03      0.05        33
         145       0.14      0.03      0.05        33
         146       0.14      0.03      0.05        33
         147       0.14      0.03      0.05        33
         148       0.14      0.03      0.05        33
         149       0.14      0.03      0.05        33
         150       0.14      0.03      0.05        33
         151       0.14      0.03      0.05        33
         152       0.14      0.03      0.05        33
         153       0.14      0.03      0.05        33

   micro avg       0.14      0.03      0.05      5082
   macro avg       0.14      0.03      0.05      5082
weighted avg       0.14      0.03      0.05      5082
 samples avg       0.00      0.00      0.00      5082

C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

The accuracy scores generally appear to be higher for multilabel classifiers than for the multiclass classifiers. However, we should take these accuracy scores for the multilabel classifiers with a grain of salt. In a multilabel setting, accuracy can be deceptively high when the majority of labels are negative. Since only one type combination (or two types) can be true for each Pokémon (row), all others are false, meaning we are definitely dealing with an imbalanced dataset. Recall is a mathematical measure of how many of the actual positive cases were correctly identified by a model. As we can see, recall scores are generally pretty low for the multilabel classifiers.

Multiclass multioutput Classification¶

Finally, there is Multiclass multioutput Classification, which is supported natively by some scikit-learn models.

Preprocessing¶

Here, we don't want to drop the Type 1 and Type 2 columns because we want to use them as the y vector.

In [ ]:
df = preprocessed_df.copy()

# Some type combinations only occur once so we extract them and add them to the test data
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)

singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]
df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)
y = df[['type_1', 'type_2']]
df.head()
Out[ ]:
generation height_m weight_kg abilities_number total_points hp attack defense sp_attack sp_defense ... egg_type_2_Grass egg_type_2_Human-Like egg_type_2_Mineral egg_type_2_Monster egg_type_2_None egg_type_2_Water 1 egg_type_2_Water 2 egg_type_2_Water 3 type_1 type_2
0 1 0.7 6.9 2 318 45 49 49 65 65 ... False False False True False False False False Grass Poison
1 1 1.0 13.0 2 405 60 62 63 80 80 ... False False False True False False False False Grass Poison
2 1 2.0 100.0 2 525 80 82 83 100 100 ... False False False True False False False False Grass Poison
3 1 2.4 155.5 1 625 80 100 123 122 120 ... False False False True False False False False Grass Poison
4 1 0.6 8.5 2 309 39 52 43 60 50 ... False False False True False False False False Fire None

5 rows × 546 columns

In [ ]:
y.head()
Out[ ]:
type_1 type_2
0 Grass Poison
1 Grass Poison
2 Grass Poison
3 Grass Poison
4 Fire None

Decision Tree¶

We used a multioutput classifier to measure the accuracy score.

We calculated the accuracy for each type individually and together (where both types are correctly predicted).

We made our own method to calculate the accuracy for both types together and concluded it was similar to .score.

In [ ]:
from sklearn.multioutput import MultiOutputClassifier

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=['type_1', 'type_2']), other_data[['type_1', 'type_2']], test_size=0.2, stratify=other_data[['type_1', 'type_2']], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=['type_1', 'type_2'])])
y_train = pd.concat([y_train, singleton_data[['type_1', 'type_2']]])
X_test = pd.concat([X_test, singleton_data.drop(columns=['type_1', 'type_2'])])
y_test = pd.concat([y_test, singleton_data[['type_1', 'type_2']]])


base_classifier = DecisionTreeClassifier()

multi_output_classifier = MultiOutputClassifier(base_classifier)
multi_output_classifier.fit(X_train, y_train)
base_classifier.fit(X_train, y_train)

# Step 4: Model Evaluation
y_pred = multi_output_classifier.predict(X_test)

# Evaluate

print("Score: ", multi_output_classifier.score(X_test, y_test))

y_pred = base_classifier.predict(X_test)

# our own score function
a = (y_test == y_pred)
b = []

for i, j in enumerate(a.iterrows()):
    b.append(j[1]['type_1'] and j[1]['type_2'])

nb_correct = 0
for i in b:
    if i:
        nb_correct += 1

score_ratio = nb_correct/len(b)
print("score ratio: ",score_ratio)

# accuracy score for each type
accuracy_list=[]
y_test = np.asarray(y_test)
y_pred = np.asarray(y_pred)
for i in range(2):
    accuracy = accuracy_score(y_test[:, i], y_pred[:, i])
    accuracy_list.append(accuracy)
    print("Accuracy type ", i+1, ": ", accuracy )
print("Averaged Accuracy for types: ",np.mean(accuracy_list))
Score:  0.4049586776859504
score ratio:  0.45867768595041325
Accuracy type  1 :  0.6487603305785123
Accuracy type  2 :  0.6074380165289256
Averaged Accuracy for types:  0.6280991735537189

Hyperparameter Tuning¶

In [ ]:
pipeline = make_pipeline(StandardScaler(), MultiOutputClassifier(DecisionTreeClassifier()))

param_dist = {
    "multioutputclassifier__estimator__max_depth": [5, 6, 7, 8, 9, 10, 15, 30, None], 
    "multioutputclassifier__estimator__min_samples_leaf": np.arange(1, 10)
}

# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)

# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train) 
y_pred = grid_search_cv.predict(X_test)

# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Best score is {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
Tuned Model Parameters: {'multioutputclassifier__estimator__max_depth': 30, 'multioutputclassifier__estimator__min_samples_leaf': 1}
Best score is 0.4049586776859504

Random Forest¶

In [ ]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
base_classifier = RandomForestClassifier()

#base_classifier.fit(X_train, y_train)

multi_output_classifier = MultiOutputClassifier(base_classifier)
multi_output_classifier.fit(X_train, y_train)

# Step 4: Model Evaluation
accuracy_list=[]
y_pred = multi_output_classifier.predict(X_test)
print("score: ", multi_output_classifier.score(X_test, y_test))
y_test = np.asarray(y_test)
y_pred = np.asarray(y_pred)
for i in range(2):
    accuracy = accuracy_score(y_test[:, i], y_pred[:, i])
    print("Accuracy type ", i+1, ": ", accuracy )
    accuracy_list.append(accuracy)
print("Averaged Accuracy for types: ",np.mean(accuracy_list))
score:  0.5619834710743802
Accuracy type  1 :  0.7933884297520661
Accuracy type  2 :  0.6818181818181818
Averaged Accuracy for types:  0.7376033057851239

Hyperparameter Tuning¶

In [ ]:
pipeline = make_pipeline(StandardScaler(), MultiOutputClassifier(RandomForestClassifier()))

# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "multioutputclassifier__estimator__max_depth": [5, 10, 15, 30, None],  
    "multioutputclassifier__estimator__min_samples_leaf": np.arange(1, 10, 2),
    "multioutputclassifier__estimator__n_estimators": np.arange(60, 140, 8)
}

# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)

# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)

# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
Best score is 0.5578512396694215
Best parameters are {'multioutputclassifier__estimator__max_depth': None, 'multioutputclassifier__estimator__min_samples_leaf': 1, 'multioutputclassifier__estimator__n_estimators': 116}

KNeighborsClassifier¶

In [ ]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
base_classifier = KNeighborsClassifier()


multi_output_classifier = MultiOutputClassifier(base_classifier)
multi_output_classifier.fit(X_train, y_train)

# Step 4: Model Evaluation
y_pred = multi_output_classifier.predict(X_test)
print("score: ", multi_output_classifier.score(X_test, y_test))
y_test = np.asarray(y_test)
y_pred = np.asarray(y_pred)

accuracy_list=[]
for i in range(2):
    accuracy = accuracy_score(y_test[:, i], y_pred[:, i])
    print("Accuracy type ", i+1, ": ", accuracy )
    accuracy_list.append(accuracy)
print("Averaged Accuracy for types: ",np.mean(accuracy_list))
score:  0.08264462809917356
Accuracy type  1 :  0.24793388429752067
Accuracy type  2 :  0.35537190082644626
Averaged Accuracy for types:  0.30165289256198347

Hyperparameter Tuning¶

In [ ]:
param_grid = {
    'multioutputclassifier__estimator__n_neighbors': [3, 5, 7, 9]  # List of k values to try
}

pipeline = make_pipeline(StandardScaler(), MultiOutputClassifier(KNeighborsClassifier()))

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
Best Parameters: {'multioutputclassifier__estimator__n_neighbors': 3}
Best Score: 0.2892561983471074

We can clearly state that this extended dataset performs much better on most of the models, especially the Decision Tree for multilabel classification in Ignoring Order of Types 2.