import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
We have chosen the Pokémon dataset from https://www.kaggle.com/datasets/mariotormo/complete-pokemon-dataset-updated-090420?select=pokedex_%28Update_04.21%29.csv . Our objective is to classify Pokémon types based on the stats and find the model and hyperparameters with the best performance. There are multiple ways of going about this. But first, let's explore the dataset.
Exploring the dataset¶
# import the dataset
df = pd.read_csv('data/pokedex.csv')
# Print head
df.head()
Unnamed: 0 | pokedex_number | name | german_name | japanese_name | generation | status | species | type_number | type_1 | ... | against_ground | against_flying | against_psychic | against_bug | against_rock | against_ghost | against_dragon | against_dark | against_steel | against_fairy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | Bulbasaur | Bisasam | フシギダネ (Fushigidane) | 1 | Normal | Seed Pokémon | 2 | Grass | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
1 | 1 | 2 | Ivysaur | Bisaknosp | フシギソウ (Fushigisou) | 1 | Normal | Seed Pokémon | 2 | Grass | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
2 | 2 | 3 | Venusaur | Bisaflor | フシギバナ (Fushigibana) | 1 | Normal | Seed Pokémon | 2 | Grass | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
3 | 3 | 3 | Mega Venusaur | Bisaflor | フシギバナ (Fushigibana) | 1 | Normal | Seed Pokémon | 2 | Grass | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
4 | 4 | 4 | Charmander | Glumanda | ヒトカゲ (Hitokage) | 1 | Normal | Lizard Pokémon | 1 | Fire | ... | 2.0 | 1.0 | 1.0 | 0.5 | 2.0 | 1.0 | 1.0 | 1.0 | 0.5 | 0.5 |
5 rows × 51 columns
# Print info such as data types and number of non-null values
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1045 entries, 0 to 1044 Data columns (total 51 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 1045 non-null int64 1 pokedex_number 1045 non-null int64 2 name 1045 non-null object 3 german_name 1045 non-null object 4 japanese_name 1045 non-null object 5 generation 1045 non-null int64 6 status 1045 non-null object 7 species 1045 non-null object 8 type_number 1045 non-null int64 9 type_1 1045 non-null object 10 type_2 553 non-null object 11 height_m 1045 non-null float64 12 weight_kg 1044 non-null float64 13 abilities_number 1045 non-null int64 14 ability_1 1042 non-null object 15 ability_2 516 non-null object 16 ability_hidden 813 non-null object 17 total_points 1045 non-null int64 18 hp 1045 non-null int64 19 attack 1045 non-null int64 20 defense 1045 non-null int64 21 sp_attack 1045 non-null int64 22 sp_defense 1045 non-null int64 23 speed 1045 non-null int64 24 catch_rate 1027 non-null float64 25 base_friendship 930 non-null float64 26 base_experience 925 non-null float64 27 growth_rate 1044 non-null object 28 egg_type_number 1045 non-null int64 29 egg_type_1 1042 non-null object 30 egg_type_2 285 non-null object 31 percentage_male 872 non-null float64 32 egg_cycles 1044 non-null float64 33 against_normal 1045 non-null float64 34 against_fire 1045 non-null float64 35 against_water 1045 non-null float64 36 against_electric 1045 non-null float64 37 against_grass 1045 non-null float64 38 against_ice 1045 non-null float64 39 against_fight 1045 non-null float64 40 against_poison 1045 non-null float64 41 against_ground 1045 non-null float64 42 against_flying 1045 non-null float64 43 against_psychic 1045 non-null float64 44 against_bug 1045 non-null float64 45 against_rock 1045 non-null float64 46 against_ghost 1045 non-null float64 47 against_dragon 1045 non-null float64 48 against_dark 1045 non-null float64 49 against_steel 1045 non-null float64 50 against_fairy 1045 non-null float64 dtypes: float64(25), int64(13), object(13) memory usage: 416.5+ KB
# Print summary statistics of numeric types
df.describe()
Unnamed: 0 | pokedex_number | generation | type_number | height_m | weight_kg | abilities_number | total_points | hp | attack | ... | against_ground | against_flying | against_psychic | against_bug | against_rock | against_ghost | against_dragon | against_dark | against_steel | against_fairy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1044.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | ... | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 | 1045.000000 |
mean | 522.000000 | 440.769378 | 4.098565 | 1.529187 | 1.374067 | 71.216571 | 2.268900 | 439.353110 | 70.067943 | 80.476555 | ... | 1.082297 | 1.168900 | 0.977273 | 0.998086 | 1.238278 | 1.018660 | 0.977033 | 1.071053 | 0.981579 | 1.091148 |
std | 301.809819 | 262.517231 | 2.272788 | 0.499386 | 3.353349 | 132.259911 | 0.803154 | 121.992897 | 26.671411 | 32.432728 | ... | 0.782683 | 0.592145 | 0.501934 | 0.610411 | 0.696560 | 0.568056 | 0.375812 | 0.465178 | 0.501753 | 0.536285 |
min | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.100000 | 0.100000 | 0.000000 | 175.000000 | 1.000000 | 5.000000 | ... | 0.000000 | 0.250000 | 0.000000 | 0.000000 | 0.250000 | 0.000000 | 0.000000 | 0.250000 | 0.000000 | 0.000000 |
25% | 261.000000 | 212.000000 | 2.000000 | 1.000000 | 0.600000 | 9.000000 | 2.000000 | 330.000000 | 50.000000 | 55.000000 | ... | 0.500000 | 1.000000 | 1.000000 | 0.500000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.500000 | 1.000000 |
50% | 522.000000 | 436.000000 | 4.000000 | 2.000000 | 1.000000 | 29.500000 | 2.000000 | 458.000000 | 68.000000 | 77.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
75% | 783.000000 | 670.000000 | 6.000000 | 2.000000 | 1.600000 | 70.500000 | 3.000000 | 515.000000 | 82.000000 | 100.000000 | ... | 1.500000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
max | 1044.000000 | 898.000000 | 8.000000 | 2.000000 | 100.000000 | 999.900000 | 3.000000 | 1125.000000 | 255.000000 | 190.000000 | ... | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 2.000000 | 4.000000 | 4.000000 | 4.000000 |
8 rows × 38 columns
# show all columns with non-numeric type
object_columns = df.select_dtypes(include=['object']).columns
# Print the selected columns
print(object_columns)
Index(['name', 'german_name', 'japanese_name', 'status', 'species', 'type_1', 'type_2', 'ability_1', 'ability_2', 'ability_hidden', 'growth_rate', 'egg_type_1', 'egg_type_2'], dtype='object')
We'll need to drop some of those columns and encode others to make classification possible.
Common preprocessing steps¶
Let's start by removing the columns that contain data which only serve to uniquely identify each Pokémon in the dataset. This includes the name of each Pokémon in 3 different languages, each Pokémon's Pokédex number and an unnamed column with the index of each row.
# Drop the names and index numbers
df.drop(['name', 'Unnamed: 0', 'german_name', 'japanese_name', 'pokedex_number'], axis=1, inplace=True)
# Print the head of the dataframe
df.head()
generation | status | species | type_number | type_1 | type_2 | height_m | weight_kg | abilities_number | ability_1 | ... | against_ground | against_flying | against_psychic | against_bug | against_rock | against_ghost | against_dragon | against_dark | against_steel | against_fairy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Normal | Seed Pokémon | 2 | Grass | Poison | 0.7 | 6.9 | 2 | Overgrow | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
1 | 1 | Normal | Seed Pokémon | 2 | Grass | Poison | 1.0 | 13.0 | 2 | Overgrow | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
2 | 1 | Normal | Seed Pokémon | 2 | Grass | Poison | 2.0 | 100.0 | 2 | Overgrow | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
3 | 1 | Normal | Seed Pokémon | 2 | Grass | Poison | 2.4 | 155.5 | 1 | Thick Fat | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
4 | 1 | Normal | Lizard Pokémon | 1 | Fire | NaN | 0.6 | 8.5 | 2 | Blaze | ... | 2.0 | 1.0 | 1.0 | 0.5 | 2.0 | 1.0 | 1.0 | 1.0 | 0.5 | 0.5 |
5 rows × 46 columns
These aren't all the columns we'll need to remove, however. Next, while some Pokémon do share the same species, the number of Pokémon that share any given species is usually under 10. (The 22 Paradox Pokémon aren't included in this dataset. This makes Mouse Pokémon, of which there are 12 in the dataset, due to the inclusion of alternate forms that some Pokémon have, the only species shared by more than 10 Pokémon.) Most commonly, Pokémon that share the same species, are part of the same evolution family.
What this means is that species are sparsely represented in the dataset. Species are also often only ever tied to one specific type combination. The former would make it annoying to deal with, and the latter goes against the spirit of this project by being far too directly correlated. The species is also more a descriptor and not really a stat. As such, let's remove the species column.
# Drop the species
df.drop(['species'], axis=1, inplace=True)
# Print the head of the dataframe
df.head()
generation | status | type_number | type_1 | type_2 | height_m | weight_kg | abilities_number | ability_1 | ability_2 | ... | against_ground | against_flying | against_psychic | against_bug | against_rock | against_ghost | against_dragon | against_dark | against_steel | against_fairy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Normal | 2 | Grass | Poison | 0.7 | 6.9 | 2 | Overgrow | NaN | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
1 | 1 | Normal | 2 | Grass | Poison | 1.0 | 13.0 | 2 | Overgrow | NaN | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
2 | 1 | Normal | 2 | Grass | Poison | 2.0 | 100.0 | 2 | Overgrow | NaN | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
3 | 1 | Normal | 2 | Grass | Poison | 2.4 | 155.5 | 1 | Thick Fat | NaN | ... | 1.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.5 |
4 | 1 | Normal | 1 | Fire | NaN | 0.6 | 8.5 | 2 | Blaze | NaN | ... | 2.0 | 1.0 | 1.0 | 0.5 | 2.0 | 1.0 | 1.0 | 1.0 | 0.5 | 0.5 |
5 rows × 45 columns
Now we're going to remove all the columns that give a direct indication of the types. All the columns named against_[insert type here] indicate type (dis)advantage. Type advantage is the direct result of a Pokémon's type(s). (See Type Chart https://pokemondb.net/type) Again, these go against the spirit of this project by being far too direct.
We'll also remove the column type_number, which indicates how many types the Pokémon has (1 or 2), for the same reason.
#Verify that the last 18 columns are the against_? data
print(df.columns[-18:])
# Drop the species
df.drop(df.columns[-18:], axis=1, inplace=True)
df.drop('type_number', axis=1, inplace=True)
# Print the head of the dataframe
df.head()
Index(['against_normal', 'against_fire', 'against_water', 'against_electric', 'against_grass', 'against_ice', 'against_fight', 'against_poison', 'against_ground', 'against_flying', 'against_psychic', 'against_bug', 'against_rock', 'against_ghost', 'against_dragon', 'against_dark', 'against_steel', 'against_fairy'], dtype='object')
generation | status | type_1 | type_2 | height_m | weight_kg | abilities_number | ability_1 | ability_2 | ability_hidden | ... | speed | catch_rate | base_friendship | base_experience | growth_rate | egg_type_number | egg_type_1 | egg_type_2 | percentage_male | egg_cycles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Normal | Grass | Poison | 0.7 | 6.9 | 2 | Overgrow | NaN | Chlorophyll | ... | 45 | 45.0 | 70.0 | 64.0 | Medium Slow | 2 | Grass | Monster | 87.5 | 20.0 |
1 | 1 | Normal | Grass | Poison | 1.0 | 13.0 | 2 | Overgrow | NaN | Chlorophyll | ... | 60 | 45.0 | 70.0 | 142.0 | Medium Slow | 2 | Grass | Monster | 87.5 | 20.0 |
2 | 1 | Normal | Grass | Poison | 2.0 | 100.0 | 2 | Overgrow | NaN | Chlorophyll | ... | 80 | 45.0 | 70.0 | 236.0 | Medium Slow | 2 | Grass | Monster | 87.5 | 20.0 |
3 | 1 | Normal | Grass | Poison | 2.4 | 155.5 | 1 | Thick Fat | NaN | NaN | ... | 80 | 45.0 | 70.0 | 281.0 | Medium Slow | 2 | Grass | Monster | 87.5 | 20.0 |
4 | 1 | Normal | Fire | NaN | 0.6 | 8.5 | 2 | Blaze | NaN | Solar Power | ... | 65 | 45.0 | 70.0 | 62.0 | Medium Slow | 2 | Dragon | Monster | 87.5 | 20.0 |
5 rows × 26 columns
Now let's check for any missing values in the remaining dataframe.
# Check for missing values
print(df.isnull().sum())
generation 0 status 0 type_1 0 type_2 492 height_m 0 weight_kg 1 abilities_number 0 ability_1 3 ability_2 529 ability_hidden 232 total_points 0 hp 0 attack 0 defense 0 sp_attack 0 sp_defense 0 speed 0 catch_rate 18 base_friendship 115 base_experience 120 growth_rate 1 egg_type_number 0 egg_type_1 3 egg_type_2 760 percentage_male 173 egg_cycles 1 dtype: int64
At first there appear to be a lot of missing values. However, for most missing values, their absence is actually correct. For example, plenty of Pokémon don't have a second or hidden ability.
That said, there are plenty of actual missing values, so let's deal with those. Let's start by going over what we're dealing with.
print("Missing weight_kg: ", np.where(df['weight_kg'].isnull()))
Missing weight_kg: (array([1033], dtype=int64),)
Taking a quick look in the original dataset reveals this corresponds with Eternatus Eternamax.
print("Missing ability_1: ", np.where(df['ability_1'].isnull()))
Missing ability_1: (array([ 33, 172, 1033], dtype=int64),)
Taking a quick look in the original dataset reveals these correspond with Partner Pikachu, Partner Eevee and Eternatus Eternamax.
The columns catch_rate, base_friendship, base_experience, percentage_male just have too many missing values for it to be reasonable to go through the effort of salvaging them. As such, we'll be removing these columns from the dataframe.
# Drop catch_rate, base_friendship and base_experience
df.drop(['catch_rate', 'base_friendship', 'base_experience', 'percentage_male'], axis=1, inplace=True)
# Print the head of the dataframe
df.head()
generation | status | type_1 | type_2 | height_m | weight_kg | abilities_number | ability_1 | ability_2 | ability_hidden | ... | attack | defense | sp_attack | sp_defense | speed | growth_rate | egg_type_number | egg_type_1 | egg_type_2 | egg_cycles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Normal | Grass | Poison | 0.7 | 6.9 | 2 | Overgrow | NaN | Chlorophyll | ... | 49 | 49 | 65 | 65 | 45 | Medium Slow | 2 | Grass | Monster | 20.0 |
1 | 1 | Normal | Grass | Poison | 1.0 | 13.0 | 2 | Overgrow | NaN | Chlorophyll | ... | 62 | 63 | 80 | 80 | 60 | Medium Slow | 2 | Grass | Monster | 20.0 |
2 | 1 | Normal | Grass | Poison | 2.0 | 100.0 | 2 | Overgrow | NaN | Chlorophyll | ... | 82 | 83 | 100 | 100 | 80 | Medium Slow | 2 | Grass | Monster | 20.0 |
3 | 1 | Normal | Grass | Poison | 2.4 | 155.5 | 1 | Thick Fat | NaN | NaN | ... | 100 | 123 | 122 | 120 | 80 | Medium Slow | 2 | Grass | Monster | 20.0 |
4 | 1 | Normal | Fire | NaN | 0.6 | 8.5 | 2 | Blaze | NaN | Solar Power | ... | 52 | 43 | 60 | 50 | 65 | Medium Slow | 2 | Dragon | Monster | 20.0 |
5 rows × 22 columns
print("Missing growth_rate: ", np.where(df['growth_rate'].isnull()))
Missing growth_rate: (array([658], dtype=int64),)
Taking a quick look in the original dataset reveals this corresponds with Galarian Darmanitan Zen Mode.
print("Missing egg_type_1: ", np.where(df['egg_type_1'].isnull()))
Missing egg_type_1: (array([ 33, 172, 658], dtype=int64),)
Taking a quick look in the original dataset reveals these correspond with Partner Pikachu, Partner Eevee and Galarian Darmanitan Zen Mode.
It's worth noting that egg_type_number actually lists these as having 0 egg groups, which is incorrect. (It might have been based the total number of values across egg_type_1 and egg_type_2, thereby erroneously taking the missing values into account.)
This means egg_type_number, egg_type_1 and egg_type_2 will need to be corrected.
print(np.where(df['egg_cycles'].isnull()))
(array([658], dtype=int64),)
Taking a quick look in the original dataset reveals this corresponds with Galarian Darmanitan Zen Mode.
Now that we know what we're dealing with, let's handle these missing values.
In total, there are 4 Pokémon (rows) in the dataset that are still actually missing values: Partner Pikachu, Partner Eevee, Galarian Darmanitan Zen Mode and Eternatus Eternamax.
For Partner Pikachu (row 33) and Partner Eevee (row 172), we need to fix the following: ability_1, egg_type_number, egg_type_1 and egg_type_2
#Partner Pikachu
print("Partner Pikachu ability_1 : ", df['ability_1'][33]) #to show Partner Pikachu's ability_1 is nan
df['ability_1'][33] = df['ability_1'][32] #Parner Pikachu's ability_1 is the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu ability_1 : ", df['ability_1'][33]) #to show Partner Pikachu's ability_1 now has the correct value
print("Partner Pikachu egg_type_number : ", df['egg_type_number'][33]) #nan
df['egg_type_number'][33] = df['egg_type_number'][32] #it's the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu egg_type_number : ", df['egg_type_number'][33]) #now has the correct value
print("Partner Pikachu egg_type_1 : ", df['egg_type_1'][33]) #nan
df['egg_type_1'][33] = df['egg_type_1'][32] #it's the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu egg_type_1 : ", df['egg_type_1'][33]) #now has the correct value
#(Partner) Pikachu has two egg types
print("Partner Pikachu egg_type_2 : ", df['egg_type_2'][33]) #nan
df['egg_type_2'][33] = df['egg_type_2'][32] #it's the same as regular Pikachu (row 32)
print("(Updated) Partner Pikachu egg_type_2 : ", df['egg_type_2'][33]) #now has the correct value
Partner Pikachu ability_1 : nan (Updated) Partner Pikachu ability_1 : Static Partner Pikachu egg_type_number : 0 (Updated) Partner Pikachu egg_type_number : 2 Partner Pikachu egg_type_1 : nan (Updated) Partner Pikachu egg_type_1 : Fairy Partner Pikachu egg_type_2 : nan (Updated) Partner Pikachu egg_type_2 : Field
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:3: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['ability_1'][33] = df['ability_1'][32] #Parner Pikachu's ability_1 is the same as regular Pikachu (row 32) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['ability_1'][33] = df['ability_1'][32] #Parner Pikachu's ability_1 is the same as regular Pikachu (row 32) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:7: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_number'][33] = df['egg_type_number'][32] #it's the same as regular Pikachu (row 32) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_number'][33] = df['egg_type_number'][32] #it's the same as regular Pikachu (row 32) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:11: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_1'][33] = df['egg_type_1'][32] #it's the same as regular Pikachu (row 32) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_1'][33] = df['egg_type_1'][32] #it's the same as regular Pikachu (row 32) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:16: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_2'][33] = df['egg_type_2'][32] #it's the same as regular Pikachu (row 32) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1639035027.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_2'][33] = df['egg_type_2'][32] #it's the same as regular Pikachu (row 32)
#Partner Eevee
print("Partner Eevee ability_1 : ", df['ability_1'][172]) #to show Partner Eevee's ability_1 is nan
df['ability_1'][172] = df['ability_1'][171] #Parner Eevee's ability_1 is the same as regular Eevee (row 171)
print("(Updated) Partner Eevee ability_1 : ", df['ability_1'][172]) #to show Partner Pikachu's ability_1 now has the correct value
print("Partner Eevee egg_type_number : ", df['egg_type_number'][172]) #nan
df['egg_type_number'][172] = df['egg_type_number'][171] #it's the same as regular Eevee (row 171)
print("(Updated) Partner Eevee egg_type_number : ", df['egg_type_number'][172]) #now has the correct value
print("Partner Eevee egg_type_1 : ", df['egg_type_1'][172]) #nan
df['egg_type_1'][172] = df['egg_type_1'][171] #it's the same as regular Eevee (row 171)
print("(Updated) Partner Eevee egg_type_1 : ", df['egg_type_1'][172]) #now has the correct value
#(Partner) Eevee only has one egg type, so this won't change anything
print("Partner Eevee egg_type_2 : ", df['egg_type_2'][172]) #nan
df['egg_type_2'][172] = df['egg_type_2'][171] #it's the same as regular Eevee (row 171)
print("(Updated) Partner Eevee egg_type_2 : ", df['egg_type_2'][172]) #now has the correct value, still nan
Partner Eevee ability_1 : nan (Updated) Partner Eevee ability_1 : Run Away Partner Eevee egg_type_number : 0 (Updated) Partner Eevee egg_type_number : 1 Partner Eevee egg_type_1 : nan (Updated) Partner Eevee egg_type_1 : Field Partner Eevee egg_type_2 : nan (Updated) Partner Eevee egg_type_2 : nan
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:3: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['ability_1'][172] = df['ability_1'][171] #Parner Eevee's ability_1 is the same as regular Eevee (row 171) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['ability_1'][172] = df['ability_1'][171] #Parner Eevee's ability_1 is the same as regular Eevee (row 171) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:7: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_number'][172] = df['egg_type_number'][171] #it's the same as regular Eevee (row 171) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_number'][172] = df['egg_type_number'][171] #it's the same as regular Eevee (row 171) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:11: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_1'][172] = df['egg_type_1'][171] #it's the same as regular Eevee (row 171) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_1'][172] = df['egg_type_1'][171] #it's the same as regular Eevee (row 171) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:16: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_2'][172] = df['egg_type_2'][171] #it's the same as regular Eevee (row 171) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3308900266.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_2'][172] = df['egg_type_2'][171] #it's the same as regular Eevee (row 171)
For Galarian Darmanitan Zen Mode (row 658), we need to fix the following: growth_rate, egg_type_number, egg_type_1, egg_type_2, egg_cycles
#Galarian Darmanitan Zen Mode
print("Galarian Darmanitan Zen Mode growth_rate : ", df['growth_rate'][658]) #to show Galarian Darmanitan Zen Mode's growth_rate is nan
df['growth_rate'][658] = df['growth_rate'][656] #Galarian Darmanitan Zen Mode's growth_rate is the same as every
#other form of Darmanitan (rows 655, 656, 657), Medium Slow
print("(Updated) Galarian Darmanitan Zen Mode growth_rate : ", df['growth_rate'][658]) #to show Galarian Darmanitan Zen Mode's growth_rate
#now has the correct value
print("Galarian Darmanitan Zen Mode egg_type_number : ", df['egg_type_number'][658]) #nan
df['egg_type_number'][658] = df['egg_type_number'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_type_number : ", df['egg_type_number'][658]) #now has the correct value
print("Galarian Darmanitan Zen Mode egg_type_1 : ", df['egg_type_1'][658]) #nan
df['egg_type_1'][658] = df['egg_type_1'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_type_1 : ", df['egg_type_1'][658]) #now has the correct value
#(Galarian) Darmanitan (Zen Mode) only has one egg type, so this won't change anything
print("Galarian Darmanitan Zen Mode egg_type_2 : ", df['egg_type_2'][658]) #nan
df['egg_type_2'][658] = df['egg_type_2'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_type_2 : ", df['egg_type_2'][658]) #now has the correct value, still nan
print("Galarian Darmanitan Zen Mode egg_cycles : ", df['egg_cycles'][658]) #nan
df['egg_cycles'][658] = df['egg_cycles'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657)
print("(Updated) Galarian Darmanitan Zen Mode egg_cycles : ", df['egg_cycles'][658]) #now has the correct value
Galarian Darmanitan Zen Mode growth_rate : nan (Updated) Galarian Darmanitan Zen Mode growth_rate : Medium Slow Galarian Darmanitan Zen Mode egg_type_number : 0 (Updated) Galarian Darmanitan Zen Mode egg_type_number : 1 Galarian Darmanitan Zen Mode egg_type_1 : nan (Updated) Galarian Darmanitan Zen Mode egg_type_1 : Field Galarian Darmanitan Zen Mode egg_type_2 : nan (Updated) Galarian Darmanitan Zen Mode egg_type_2 : nan Galarian Darmanitan Zen Mode egg_cycles : nan (Updated) Galarian Darmanitan Zen Mode egg_cycles : 20.0
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:3: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['growth_rate'][658] = df['growth_rate'][656] #Galarian Darmanitan Zen Mode's growth_rate is the same as every C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['growth_rate'][658] = df['growth_rate'][656] #Galarian Darmanitan Zen Mode's growth_rate is the same as every C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:9: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_number'][658] = df['egg_type_number'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_number'][658] = df['egg_type_number'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:13: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_1'][658] = df['egg_type_1'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_1'][658] = df['egg_type_1'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:18: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_2'][658] = df['egg_type_2'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_type_2'][658] = df['egg_type_2'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:22: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. A typical example is when you are setting values in a column of a DataFrame, like: df["col"][row_indexer] = value Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_cycles'][658] = df['egg_cycles'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1102097238.py:22: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['egg_cycles'][658] = df['egg_cycles'][656] #it's the same as any other form of Darmanitan (rows 655, 656, 657)
Lastly, there's Eternatus Eternamax (row 1033). For this one, we'd need to fix weight_kg and ability_1.
Now, ability_1 wouldn't be hard to fix, but it turns out that the missing value for weight_kg is actually correct. It's a trait of Gigantamax Pokémon. (Eternamax is a type of Gigantamax.) However, for some reason, Eternatus Eternamax is the only Gigantamax form that's in the dataset.
Because it's the only one in the dataset and for simplicity's sake, we're just going to remove row 1033 from the dataframe.
df.drop(index=1033, axis=0, inplace=True)
Now let's check the number of missing values again.
# Check for missing values
print(df.isnull().sum())
generation 0 status 0 type_1 0 type_2 492 height_m 0 weight_kg 0 abilities_number 0 ability_1 0 ability_2 528 ability_hidden 231 total_points 0 hp 0 attack 0 defense 0 sp_attack 0 sp_defense 0 speed 0 growth_rate 0 egg_type_number 0 egg_type_1 0 egg_type_2 758 egg_cycles 0 dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 1044 entries, 0 to 1044 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 generation 1044 non-null int64 1 status 1044 non-null object 2 type_1 1044 non-null object 3 type_2 552 non-null object 4 height_m 1044 non-null float64 5 weight_kg 1044 non-null float64 6 abilities_number 1044 non-null int64 7 ability_1 1044 non-null object 8 ability_2 516 non-null object 9 ability_hidden 813 non-null object 10 total_points 1044 non-null int64 11 hp 1044 non-null int64 12 attack 1044 non-null int64 13 defense 1044 non-null int64 14 sp_attack 1044 non-null int64 15 sp_defense 1044 non-null int64 16 speed 1044 non-null int64 17 growth_rate 1044 non-null object 18 egg_type_number 1044 non-null int64 19 egg_type_1 1044 non-null object 20 egg_type_2 286 non-null object 21 egg_cycles 1044 non-null float64 dtypes: float64(3), int64(10), object(9) memory usage: 187.6+ KB
We'll need to fill up these missing values
# Plot the correlation matrix
sns.heatmap(df.select_dtypes(include=[np.number, bool]).corr(), square=True, cmap='RdYlGn');
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
In addition to the generation, the number of egg types also has little correlation with the other stats.
The number of abilities ranges between having no correlation and having a somewhat negative correlation with the other stats. (Interesting. Could this be because of balancing? Some other reason?)
The stat total once again correlates quite highly with the main stats. No surprises there. The lack of correlation between speed and defense is still visible.
There's also a lack of correlation between speed and weight. This goes against what one might initially expect (that heavier Pokémon are slower). However, Pokémon come in a wide range of sizes and vary in the materials they're composed of, so perhaps it shouldn't be surprising that speed and weight aren't correlated.
Dropping columns we don't need
We will handle the missing values in type_2, ability_2, ability_hidden, and egg_type_2 by filling it up by a 'None' value. It is normal that some pokémon don't have a second or hidden ability or (egg) type.
# Handling missing values in 'Type 2' column
df['type_2'].fillna('None', inplace=True)
df['ability_2'].fillna('None', inplace=True)
df['ability_hidden'].fillna('None', inplace=True)
df['egg_type_2'].fillna('None', inplace=True)
print(df.isnull().sum())
df.head()
generation 0 status 0 type_1 0 type_2 0 height_m 0 weight_kg 0 abilities_number 0 ability_1 0 ability_2 0 ability_hidden 0 total_points 0 hp 0 attack 0 defense 0 sp_attack 0 sp_defense 0 speed 0 growth_rate 0 egg_type_number 0 egg_type_1 0 egg_type_2 0 egg_cycles 0 dtype: int64
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. df['type_2'].fillna('None', inplace=True) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. df['ability_2'].fillna('None', inplace=True) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. df['ability_hidden'].fillna('None', inplace=True) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1302662185.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. df['egg_type_2'].fillna('None', inplace=True)
generation | status | type_1 | type_2 | height_m | weight_kg | abilities_number | ability_1 | ability_2 | ability_hidden | ... | attack | defense | sp_attack | sp_defense | speed | growth_rate | egg_type_number | egg_type_1 | egg_type_2 | egg_cycles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Normal | Grass | Poison | 0.7 | 6.9 | 2 | Overgrow | None | Chlorophyll | ... | 49 | 49 | 65 | 65 | 45 | Medium Slow | 2 | Grass | Monster | 20.0 |
1 | 1 | Normal | Grass | Poison | 1.0 | 13.0 | 2 | Overgrow | None | Chlorophyll | ... | 62 | 63 | 80 | 80 | 60 | Medium Slow | 2 | Grass | Monster | 20.0 |
2 | 1 | Normal | Grass | Poison | 2.0 | 100.0 | 2 | Overgrow | None | Chlorophyll | ... | 82 | 83 | 100 | 100 | 80 | Medium Slow | 2 | Grass | Monster | 20.0 |
3 | 1 | Normal | Grass | Poison | 2.4 | 155.5 | 1 | Thick Fat | None | None | ... | 100 | 123 | 122 | 120 | 80 | Medium Slow | 2 | Grass | Monster | 20.0 |
4 | 1 | Normal | Fire | None | 0.6 | 8.5 | 2 | Blaze | None | Solar Power | ... | 52 | 43 | 60 | 50 | 65 | Medium Slow | 2 | Dragon | Monster | 20.0 |
5 rows × 22 columns
Because scikit-learn does not accept non-numerical features, we need to one-hot encode the categorical columns by creating dummy variables.
df_one_hot = pd.get_dummies(df.drop(['type_1', 'type_2'], axis=1))
df_one_hot.head() # to check if it worked
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Field | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | False | True | False | False | False | False |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | False | True | False | False | False | False |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | False | True | False | False | False | False |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | False | True | False | False | False | False |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | False | True | False | False | False | False |
5 rows × 544 columns
df = pd.concat([df_one_hot, df[['type_1', 'type_2']]], axis=1)
df.head()
preprocessed_df = df.copy()
Multi-class classification¶
Accounting for Order of Types¶
We make a new column 'Types' where the combination of type 1 and type 2 are stored in tuples, which is ordered by default
df = preprocessed_df.copy()
# Separate features and labels
X = df.drop(columns=['type_1', 'type_2'])
# Combine Type 1 and Type 2 into a single column
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)
df.Types = df.Types.astype(str)
print(df['Types'][0])
print(len(df['Types'].unique()))
# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)
# print head
df.head()
('Grass', 'Poison') 192
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | Types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | True | False | False | False | False | ('Fire', 'None') |
5 rows × 545 columns
# show the distribution of pokemon types
sns.countplot(df, y='Types');
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]
print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 41 number of unique type combinations 192 1044
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | Types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | True | False | False | False | False | ('Fire', 'None') |
5 rows × 545 columns
Decision tree¶
The single classes are added to both the training and test sets after being stratified on the rest of the data. This makes the actual test size slightly larger.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
from sklearn.metrics import classification_report
X = df.drop(columns=['Types'])
y = df['Types']
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=['Types']), other_data['Types'], test_size=0.2, stratify=other_data['Types'], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=['Types'])])
y_train = pd.concat([y_train, singleton_data['Types']])
X_test = pd.concat([X_test, singleton_data.drop(columns=['Types'])])
y_test = pd.concat([y_test, singleton_data['Types']])
print("actual test size:",len(X_test)/(len(X_train)+len(X_test)))
actual test size: 0.22304147465437787
# Initialize and train the decision tree classifier
from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.4834710743801653 Accuracy: 0.4834710743801653 precision recall f1-score support ('Bug', 'Electric') 0.00 0.00 0.00 1 ('Bug', 'Fighting') 0.00 0.00 0.00 1 ('Bug', 'Flying') 0.00 0.00 0.00 3 ('Bug', 'Ghost') 0.50 1.00 0.67 1 ('Bug', 'Grass') 0.00 0.00 0.00 1 ('Bug', 'None') 0.40 0.50 0.44 4 ('Bug', 'Poison') 0.00 0.00 0.00 3 ('Bug', 'Rock') 0.00 0.00 0.00 1 ('Bug', 'Steel') 0.00 0.00 0.00 1 ('Bug', 'Water') 0.00 0.00 0.00 1 ('Dark', 'Dragon') 0.00 0.00 0.00 1 ('Dark', 'Fairy') 1.00 1.00 1.00 1 ('Dark', 'Fire') 0.00 0.00 0.00 1 ('Dark', 'Flying') 0.00 0.00 0.00 1 ('Dark', 'Ghost') 0.00 0.00 0.00 0 ('Dark', 'Grass') 1.00 1.00 1.00 1 ('Dark', 'None') 0.00 0.00 0.00 3 ('Dark', 'Normal') 0.00 0.00 0.00 1 ('Dark', 'Steel') 0.00 0.00 0.00 0 ('Dragon', 'Electric') 1.00 1.00 1.00 1 ('Dragon', 'Fairy') 1.00 1.00 1.00 1 ('Dragon', 'Fire') 1.00 1.00 1.00 1 ('Dragon', 'Flying') 0.00 0.00 0.00 1 ('Dragon', 'Ghost') 1.00 1.00 1.00 1 ('Dragon', 'Ground') 0.00 0.00 0.00 2 ('Dragon', 'Ice') 0.50 1.00 0.67 1 ('Dragon', 'None') 0.00 0.00 0.00 3 ('Dragon', 'Psychic') 1.00 1.00 1.00 1 ('Electric', 'Dark') 0.00 0.00 0.00 0 ('Electric', 'Fairy') 0.00 0.00 0.00 0 ('Electric', 'Fire') 0.33 1.00 0.50 1 ('Electric', 'Flying') 0.00 0.00 0.00 1 ('Electric', 'Ghost') 1.00 1.00 1.00 1 ('Electric', 'Grass') 0.00 0.00 0.00 1 ('Electric', 'None') 0.50 0.43 0.46 7 ('Electric', 'Poison') 0.00 0.00 0.00 1 ('Electric', 'Psychic') 1.00 1.00 1.00 1 ('Electric', 'Steel') 0.00 0.00 0.00 1 ('Electric', 'Water') 0.00 0.00 0.00 1 ('Fairy', 'Flying') 0.00 0.00 0.00 0 ('Fairy', 'None') 0.33 0.25 0.29 4 ('Fairy', 'Steel') 0.50 1.00 0.67 1 ('Fighting', 'Dark') 0.00 0.00 0.00 0 ('Fighting', 'Flying') 0.00 0.00 0.00 0 ('Fighting', 'Ghost') 1.00 1.00 1.00 1 ('Fighting', 'Ice') 1.00 1.00 1.00 1 ('Fighting', 'None') 0.67 0.33 0.44 6 ('Fighting', 'Psychic') 0.00 0.00 0.00 1 ('Fighting', 'Steel') 0.00 0.00 0.00 1 ('Fighting', 'Water') 0.00 0.00 0.00 1 ('Fire', 'Bug') 0.00 0.00 0.00 0 ('Fire', 'Dark') 1.00 1.00 1.00 1 ('Fire', 'Fighting') 1.00 1.00 1.00 1 ('Fire', 'Flying') 0.00 0.00 0.00 2 ('Fire', 'Ground') 0.00 0.00 0.00 1 ('Fire', 'None') 0.33 0.57 0.42 7 ('Fire', 'Rock') 1.00 1.00 1.00 1 ('Fire', 'Steel') 1.00 1.00 1.00 1 ('Fire', 'Water') 1.00 1.00 1.00 1 ('Flying', 'None') 1.00 1.00 1.00 1 ('Flying', 'Steel') 1.00 1.00 1.00 1 ('Flying', 'Water') 1.00 1.00 1.00 1 ('Ghost', 'Dark') 1.00 1.00 1.00 1 ('Ghost', 'Fairy') 1.00 1.00 1.00 1 ('Ghost', 'Fire') 1.00 1.00 1.00 1 ('Ghost', 'Flying') 0.00 0.00 0.00 1 ('Ghost', 'Grass') 0.67 1.00 0.80 2 ('Ghost', 'None') 0.25 0.33 0.29 3 ('Ghost', 'Poison') 1.00 1.00 1.00 1 ('Grass', 'Dark') 0.00 0.00 0.00 1 ('Grass', 'Dragon') 0.00 0.00 0.00 1 ('Grass', 'Fairy') 0.00 0.00 0.00 1 ('Grass', 'Fighting') 0.00 0.00 0.00 1 ('Grass', 'Flying') 1.00 0.50 0.67 2 ('Grass', 'Ghost') 1.00 1.00 1.00 1 ('Grass', 'Ground') 1.00 1.00 1.00 1 ('Grass', 'Ice') 1.00 1.00 1.00 1 ('Grass', 'None') 0.58 0.78 0.67 9 ('Grass', 'Poison') 0.75 1.00 0.86 3 ('Grass', 'Steel') 0.00 0.00 0.00 1 ('Ground', 'Dark') 0.00 0.00 0.00 1 ('Ground', 'Electric') 1.00 1.00 1.00 1 ('Ground', 'Fire') 1.00 1.00 1.00 1 ('Ground', 'Flying') 0.00 0.00 0.00 1 ('Ground', 'Ghost') 0.00 0.00 0.00 1 ('Ground', 'None') 0.00 0.00 0.00 4 ('Ground', 'Rock') 0.00 0.00 0.00 1 ('Ground', 'Steel') 1.00 1.00 1.00 1 ('Ice', 'Bug') 0.00 0.00 0.00 0 ('Ice', 'Fairy') 1.00 1.00 1.00 1 ('Ice', 'Fire') 1.00 1.00 1.00 1 ('Ice', 'Flying') 0.00 0.00 0.00 0 ('Ice', 'Ghost') 1.00 1.00 1.00 1 ('Ice', 'Ground') 0.00 0.00 0.00 1 ('Ice', 'None') 0.50 0.50 0.50 4 ('Ice', 'Water') 1.00 1.00 1.00 1 ('Normal', 'Dragon') 1.00 1.00 1.00 1 ('Normal', 'Fairy') 0.00 0.00 0.00 1 ('Normal', 'Fighting') 0.00 0.00 0.00 1 ('Normal', 'Flying') 0.83 0.83 0.83 6 ('Normal', 'Grass') 0.00 0.00 0.00 0 ('Normal', 'Ground') 1.00 1.00 1.00 1 ('Normal', 'None') 0.38 0.43 0.40 14 ('Normal', 'Psychic') 0.00 0.00 0.00 1 ('Normal', 'Water') 1.00 1.00 1.00 1 ('Poison', 'Bug') 1.00 1.00 1.00 1 ('Poison', 'Dark') 0.00 0.00 0.00 1 ('Poison', 'Dragon') 0.00 0.00 0.00 1 ('Poison', 'Fairy') 0.50 1.00 0.67 1 ('Poison', 'Flying') 1.00 1.00 1.00 1 ('Poison', 'Ground') 0.00 0.00 0.00 0 ('Poison', 'None') 1.00 0.33 0.50 3 ('Poison', 'Water') 0.00 0.00 0.00 1 ('Psychic', 'Dark') 1.00 1.00 1.00 1 ('Psychic', 'Dragon') 1.00 1.00 1.00 1 ('Psychic', 'Fairy') 0.00 0.00 0.00 2 ('Psychic', 'Fighting') 0.00 0.00 0.00 1 ('Psychic', 'Fire') 1.00 1.00 1.00 1 ('Psychic', 'Flying') 0.33 0.50 0.40 2 ('Psychic', 'Ghost') 1.00 1.00 1.00 1 ('Psychic', 'Grass') 0.00 0.00 0.00 0 ('Psychic', 'Ice') 0.00 0.00 0.00 1 ('Psychic', 'None') 0.57 0.44 0.50 9 ('Rock', 'Electric') 1.00 1.00 1.00 1 ('Rock', 'Fairy') 0.00 0.00 0.00 1 ('Rock', 'Fighting') 1.00 1.00 1.00 1 ('Rock', 'Flying') 1.00 1.00 1.00 1 ('Rock', 'Ground') 0.00 0.00 0.00 1 ('Rock', 'None') 0.50 0.67 0.57 3 ('Rock', 'Poison') 0.50 1.00 0.67 1 ('Rock', 'Steel') 0.00 0.00 0.00 1 ('Rock', 'Water') 1.00 1.00 1.00 1 ('Steel', 'Fairy') 0.00 0.00 0.00 1 ('Steel', 'Fighting') 1.00 1.00 1.00 1 ('Steel', 'Ghost') 0.00 0.00 0.00 1 ('Steel', 'None') 0.00 0.00 0.00 2 ('Steel', 'Psychic') 1.00 0.50 0.67 2 ('Steel', 'Rock') 0.00 0.00 0.00 1 ('Water', 'Dark') 0.00 0.00 0.00 2 ('Water', 'Dragon') 0.00 0.00 0.00 1 ('Water', 'Fairy') 1.00 1.00 1.00 1 ('Water', 'Fighting') 1.00 1.00 1.00 1 ('Water', 'Flying') 0.67 1.00 0.80 2 ('Water', 'Grass') 1.00 1.00 1.00 1 ('Water', 'Ground') 1.00 0.50 0.67 2 ('Water', 'Ice') 0.00 0.00 0.00 1 ('Water', 'None') 0.60 0.60 0.60 15 ('Water', 'Poison') 0.00 0.00 0.00 1 ('Water', 'Psychic') 0.00 0.00 0.00 1 ('Water', 'Rock') 0.00 0.00 0.00 1 ('Water', 'Steel') 1.00 1.00 1.00 1 accuracy 0.48 242 macro avg 0.44 0.46 0.44 242 weighted avg 0.48 0.48 0.47 242
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter Tuning¶
For hyperparameter tuning we use GridSearchCV and a pipeline with a StandardScaler to make the data more uniform.
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Setup the parameters and distributions to sample from: param_dist
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())
param_dist = {
"decisiontreeclassifier__max_depth": [15, 30, None],
"decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}
# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)
# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train)
y_pred = grid_search_cv.predict(X_test)
# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1} Accuracy: 0.49173553719008267
Random forest¶
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
# Calculate accuracy
print("Score :", score )
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score : 0.6487603305785123 Accuracy: 0.6487603305785123
Hyperparameter Tuning¶
# Import GridSearchCV
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
"randomforestclassifier__max_depth": np.arange(5, 25),
"randomforestclassifier__min_samples_leaf": np.arange(1, 10),
"randomforestclassifier__n_estimators": np.arange(60, 140, 5)
}
# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)
# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)
# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=3. warnings.warn(
Best score is 0.6900826446280992 Best parameters are {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 136}
Support Vector Machine¶
Radial base function¶
from sklearn.svm import SVC
model = SVC(kernel='rbf', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.06198347107438017
Hyperparameter Tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1,5, 10], # Regularization parameter
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best score is {}".format(grid_search.best_estimator_.score(X_test, y_test)))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 0.0} Best score is 0.6074380165289256
Linear¶
svm_classifier = SVC(kernel='linear', random_state=42)
# Train the SVM classifier
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.3677685950413223
Hyperparameter tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1 , 5, 10], # Regularization parameter # Degree of the polynomial kernel
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 0.5, 'svc__coef0': 0.0} Best Score: 0.6528925619834711
Polynomial¶
model = SVC(kernel='poly', random_state=42)
# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.0743801652892562
Hyperparameter Tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='poly'))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1, 5, 10], # Regularization parameter
'svc__degree': [2, 3, 4, 5, 6], # Degree of the polynomial kernel
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 2.0, 'svc__degree': 2} Best Score: 0.6198347107438017
Sigmoid¶
model = SVC(kernel='sigmoid', random_state=42)
# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.05785123966942149 Accuracy: 0.05785123966942149
Hyperparameter Tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1,5, 10], # Regularization parameter
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 0.0} Best Score: 0.6033057851239669
k Nearest neighbors¶
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ",model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.06611570247933884 Accuracy: 0.06611570247933884
Hyperparameter Tuning¶
from sklearn.neighbors import KNeighborsClassifier
param_grid = {
'kneighborsclassifier__n_neighbors': [3, 5, 7, 9] # List of k values to try
}
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'kneighborsclassifier__n_neighbors': 3} Best Score: 0.2809917355371901
Logistic regression¶
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.1446280991735537 Accuracy: 0.1446280991735537
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
model = LogisticRegression( random_state=42, multi_class='auto', solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.5950413223140496 Accuracy: 0.5950413223140496
Hyperparameter tuning
param_grid = {
'logisticregression__C': np.logspace(-5, 5, 5),
'logisticregression__penalty': ['l1', 'l2']
}
pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l2'} Best Score: 0.6776859504132231
from sklearn.linear_model import LogisticRegression
model = LogisticRegression( penalty='elasticnet',l1_ratio=0.5, random_state=42, multi_class='auto', solver='saga', max_iter=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.1115702479338843 Accuracy: 0.1115702479338843
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
Hyperparameter tuning
param_grid = {
'logisticregression__C': np.logspace(-3, 3, 3),
'logisticregression__l1_ratio': np.linspace(0, 1, 5)
}
pipeline = make_pipeline(StandardScaler(), LogisticRegression(penalty='elasticnet', solver='saga'))
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__l1_ratio': 0.25} Best Score: 0.6611570247933884
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
Ignoring Order of Types¶
Preprocessing¶
We do the same as before, but we sort types in the tuples alphabetically to ignore order.
df = preprocessed_df.copy()
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: sorted(tuple(filter(lambda y: pd.notna(y), x))), axis=1)
df.Types = df.Types.astype(str)
# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)
# print head
df.head()
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | Types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | True | False | False | False | False | ['Fire', 'None'] |
5 rows × 545 columns
sns.countplot(df, y='Types');
['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# Check if there are any pokemon with only one type
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]
print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 23 number of unique type combinations 154 1044
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | Types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | True | False | False | False | False | ['Fire', 'None'] |
5 rows × 545 columns
Decision tree¶
# Split the data into training and testing sets
X = df.drop(columns=['Types'])
y = df['Types']
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=['Types']), other_data['Types'], test_size=0.2, stratify=other_data['Types'], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=['Types'])])
y_train = pd.concat([y_train, singleton_data['Types']])
X_test = pd.concat([X_test, singleton_data.drop(columns=['Types'])])
y_test = pd.concat([y_test, singleton_data['Types']])
# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.4298245614035088 Accuracy: 0.4298245614035088
Hyperparameter tuning¶
# Setup the parameters and distributions to sample from: param_dist
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())
param_dist = {
"decisiontreeclassifier__max_depth": [15, 30, None],
"decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}
# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)
# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train)
y_pred = grid_search_cv.predict(X_test)
# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1} Accuracy: 0.4649122807017544
Random forest¶
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.5964912280701754 Accuracy: 0.5964912280701754
Hyperparameter Tuning¶
There's an issue with cross-validation because some type combinations occur only once in the training data.
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
"randomforestclassifier__max_depth": np.arange(30, 36),
"randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
"randomforestclassifier__n_estimators": np.arange(100, 140, 4)
}
# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)
# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)
# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=3. warnings.warn(
Best score is 0.5964912280701754 Best parameters are {'randomforestclassifier__max_depth': 33, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 136}
Support vector machine¶
Radial base function¶
from sklearn.svm import SVC
model = SVC(kernel='rbf', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.05263157894736842
Hyperparameter Tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1,5, 10], # Regularization parameter
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 0.0} Best Score: 0.5482456140350878
Linear¶
svm_classifier = SVC(kernel='linear', random_state=42)
# Train the SVM classifier
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.34210526315789475
Hyperparameter tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear'))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1 , 5, 10], # Regularization parameter # Degree of the polynomial kernel
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 0.5, 'svc__coef0': 0.0} Best Score: 0.5921052631578947
Polynomial¶
model = SVC(kernel='poly', random_state=42)
# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.07017543859649122
Hyperparameter Tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='poly'))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1, 5, 10], # Regularization parameter
'svc__degree': [2, 3, 4, 5, 6], # Degree of the polynomial kernel
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 10, 'svc__coef0': 2.0, 'svc__degree': 3} Best Score: 0.5789473684210527
Sigmoid¶
model = SVC(kernel='sigmoid', random_state=42)
# Train the SVM classifier
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.07456140350877193 Accuracy: 0.07456140350877193
Hyperparameter Tuning¶
from sklearn.svm import SVC
# Define the pipeline with StandardScaler and SVC
pipeline = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))
# Define the parameter grid
param_grid = {
'svc__C': [0.1,0.5, 1,5, 10, 50], # Regularization parameter
'svc__coef0': [0.0, 1.0, 2.0], # Independent term in the polynomial kernel function
}
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)
# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'svc__C': 50, 'svc__coef0': 0.0} Best Score: 0.5614035087719298
k Nearest Neighbors¶
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.10087719298245613 Accuracy: 0.10087719298245613
Hyperparameter Tuning¶
param_grid = {
'kneighborsclassifier__n_neighbors': [3, 5, 7, 9] # List of k values to try
}
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
Best Parameters: {'kneighborsclassifier__n_neighbors': 1} Best Score: 0.5921052631578947
Logistic regression¶
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.14473684210526316 Accuracy: 0.14473684210526316
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
model = LogisticRegression( random_state=42, multi_class='auto', solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.5657894736842105 Accuracy: 0.5657894736842105
Hyperparameter tuning
param_grid = {
'logisticregression__C': np.logspace(-4, 4, 3),
'logisticregression__penalty': ['l1', 'l2']
}
pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l2'} Best Score: 0.6776859504132231
from sklearn.linear_model import LogisticRegression
model = LogisticRegression( penalty='elasticnet',l1_ratio=0.5, random_state=42, multi_class='auto', solver='saga', max_iter=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Score: 0.1115702479338843 Accuracy: 0.1115702479338843
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
Hyperparameter tuning
param_grid = {
'logisticregression__C': [1.0],
'logisticregression__l1_ratio': np.linspace(0, 1, 5)
}
pipeline = make_pipeline(StandardScaler(), LogisticRegression(penalty='elasticnet', solver='saga'))
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\model_selection\_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=2. warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__l1_ratio': 0.25} Best Score: 0.6611570247933884
c:\Users\Chau\miniconda3\Lib\site-packages\sklearn\linear_model\_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
Multi-label Classification¶
Accounting for Order of Types¶
Preprocessing¶
df = preprocessed_df.copy()
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)
df.Types = df.Types.astype(str)
# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)
# print head
df.head()
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | Types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | True | False | False | False | False | ('Grass', 'Poison') |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | True | False | False | False | False | ('Fire', 'None') |
5 rows × 545 columns
# Find classes with only one type
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
To account for order of types, we create binary labels for each type combination.
# Create binary labels for each Pokémon type combination
type_combinations = df['Types'].unique()
for type in type_combinations:
df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]
print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 41 number of unique type combinations 192 1044
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\3350647679.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | ('Electric', 'Poison') | ('Fire', 'Bug') | ('Dark', 'Fairy') | ('Ice', 'Bug') | ('Psychic', 'Normal') | ('Electric', 'Dark') | ('Dragon', 'Ghost') | ('Fairy', 'Steel') | ('Fighting', 'Water') | ('Dark', 'Grass') | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 737 columns
# Drop the 'Types' column
df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)
Decision Tree¶
# Split the data into training and testing sets
y = df[type_combinations]
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=type_combinations), other_data[type_combinations], test_size=0.2, stratify=other_data[type_combinations], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=type_combinations)])
y_train = pd.concat([y_train, singleton_data[type_combinations]])
X_test = pd.concat([X_test, singleton_data.drop(columns=type_combinations)])
y_test = pd.concat([y_test, singleton_data[type_combinations]])
# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
# Calculate accuracy
print("Score: ", score)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.47107438016528924 Accuracy: 0.47107438016528924 precision recall f1-score support 0 1.00 0.67 0.80 3 1 0.60 0.86 0.71 7 2 0.00 0.00 0.00 2 3 0.00 0.00 0.00 0 4 0.45 0.33 0.38 15 5 0.43 0.75 0.55 4 6 0.33 0.33 0.33 3 7 0.25 0.33 0.29 3 8 0.83 0.83 0.83 6 9 0.33 0.50 0.40 14 10 0.00 0.00 0.00 1 11 0.25 0.67 0.36 3 12 1.00 0.14 0.25 7 13 1.00 1.00 1.00 1 14 0.00 0.00 0.00 4 15 0.00 0.00 0.00 0 16 0.00 0.00 0.00 0 17 0.33 0.25 0.29 4 18 1.00 0.25 0.40 4 19 1.00 1.00 1.00 1 20 1.00 1.00 1.00 1 21 1.00 1.00 1.00 1 22 1.00 1.00 1.00 1 23 0.00 0.00 0.00 1 24 0.00 0.00 0.00 3 25 0.00 0.00 0.00 2 26 0.57 0.67 0.62 6 27 1.00 1.00 1.00 1 28 0.38 0.33 0.35 9 29 0.00 0.00 0.00 1 30 1.00 1.00 1.00 1 31 1.00 1.00 1.00 1 32 0.50 0.50 0.50 2 33 0.00 0.00 0.00 1 34 0.00 0.00 0.00 0 35 0.00 0.00 0.00 1 36 0.00 0.00 0.00 1 37 0.00 0.00 0.00 1 38 0.00 0.00 0.00 1 39 0.00 0.00 0.00 0 40 0.00 0.00 0.00 1 41 0.00 0.00 0.00 0 42 1.00 1.00 1.00 1 43 1.00 1.00 1.00 1 44 0.62 0.89 0.73 9 45 0.00 0.00 0.00 1 46 0.00 0.00 0.00 0 47 1.00 0.50 0.67 2 48 0.00 0.00 0.00 2 49 0.00 0.00 0.00 1 50 1.00 1.00 1.00 1 51 0.00 0.00 0.00 0 52 0.00 0.00 0.00 2 53 0.00 0.00 0.00 1 54 0.00 0.00 0.00 0 55 0.00 0.00 0.00 1 56 0.50 0.67 0.57 3 57 0.00 0.00 0.00 1 58 0.00 0.00 0.00 1 59 0.00 0.00 0.00 0 60 0.00 0.00 0.00 0 61 0.00 0.00 0.00 0 62 1.00 1.00 1.00 1 63 0.33 0.33 0.33 3 64 1.00 0.50 0.67 2 65 0.00 0.00 0.00 2 66 0.20 0.33 0.25 3 67 0.00 0.00 0.00 1 68 0.00 0.00 0.00 2 69 0.00 0.00 0.00 1 70 0.00 0.00 0.00 0 71 0.00 0.00 0.00 1 72 0.00 0.00 0.00 1 73 0.00 0.00 0.00 0 74 1.00 1.00 1.00 1 75 0.00 0.00 0.00 1 76 0.00 0.00 0.00 1 77 0.00 0.00 0.00 0 78 0.00 0.00 0.00 1 79 0.00 0.00 0.00 1 80 0.00 0.00 0.00 0 81 0.00 0.00 0.00 0 82 0.00 0.00 0.00 1 83 1.00 1.00 1.00 1 84 0.50 1.00 0.67 1 85 0.00 0.00 0.00 1 86 0.00 0.00 0.00 1 87 0.00 0.00 0.00 0 88 1.00 1.00 1.00 1 89 0.00 0.00 0.00 0 90 0.00 0.00 0.00 1 91 1.00 1.00 1.00 1 92 0.00 0.00 0.00 1 93 0.00 0.00 0.00 1 94 0.00 0.00 0.00 0 95 0.50 1.00 0.67 1 96 0.00 0.00 0.00 0 97 0.00 0.00 0.00 0 98 0.00 0.00 0.00 0 99 0.00 0.00 0.00 0 100 0.00 0.00 0.00 1 101 0.00 0.00 0.00 2 102 0.50 1.00 0.67 1 103 1.00 1.00 1.00 1 104 1.00 1.00 1.00 1 105 1.00 1.00 1.00 1 106 0.33 1.00 0.50 1 107 0.00 0.00 0.00 1 108 0.00 0.00 0.00 1 109 0.00 0.00 0.00 1 110 0.50 1.00 0.67 1 111 0.00 0.00 0.00 1 112 0.00 0.00 0.00 1 113 1.00 1.00 1.00 1 114 0.00 0.00 0.00 0 115 0.00 0.00 0.00 1 116 1.00 1.00 1.00 1 117 0.50 1.00 0.67 1 118 0.00 0.00 0.00 1 119 0.00 0.00 0.00 1 120 0.00 0.00 0.00 0 121 0.00 0.00 0.00 1 122 0.00 0.00 0.00 0 123 1.00 1.00 1.00 1 124 0.00 0.00 0.00 0 125 0.33 1.00 0.50 1 126 1.00 1.00 1.00 1 127 1.00 1.00 1.00 1 128 0.00 0.00 0.00 0 129 1.00 1.00 1.00 1 130 0.00 0.00 0.00 0 131 0.00 0.00 0.00 1 132 0.00 0.00 0.00 0 133 0.00 0.00 0.00 0 134 0.00 0.00 0.00 1 135 0.00 0.00 0.00 1 136 1.00 1.00 1.00 1 137 1.00 1.00 1.00 1 138 0.00 0.00 0.00 0 139 0.00 0.00 0.00 1 140 0.00 0.00 0.00 0 141 1.00 1.00 1.00 1 142 1.00 1.00 1.00 1 143 0.50 1.00 0.67 1 144 0.50 1.00 0.67 1 145 1.00 1.00 1.00 1 146 0.00 0.00 0.00 1 147 1.00 1.00 1.00 1 148 0.00 0.00 0.00 0 149 0.00 0.00 0.00 0 150 0.00 0.00 0.00 1 151 0.00 0.00 0.00 0 152 0.00 0.00 0.00 1 153 0.00 0.00 0.00 1 154 0.00 0.00 0.00 0 155 0.00 0.00 0.00 0 156 0.00 0.00 0.00 0 157 0.00 0.00 0.00 0 158 0.00 0.00 0.00 1 159 0.50 0.50 0.50 2 160 0.00 0.00 0.00 0 161 0.00 0.00 0.00 1 162 1.00 1.00 1.00 1 163 1.00 1.00 1.00 1 164 1.00 1.00 1.00 1 165 1.00 1.00 1.00 1 166 1.00 1.00 1.00 1 167 0.00 0.00 0.00 0 168 0.00 0.00 0.00 0 169 0.00 0.00 0.00 0 170 0.00 0.00 0.00 0 171 1.00 1.00 1.00 1 172 0.50 1.00 0.67 1 173 0.00 0.00 0.00 0 174 0.00 0.00 0.00 0 175 1.00 1.00 1.00 1 176 1.00 1.00 1.00 1 177 1.00 1.00 1.00 1 178 1.00 1.00 1.00 1 179 0.00 0.00 0.00 0 180 0.00 0.00 0.00 0 181 1.00 1.00 1.00 1 182 0.00 0.00 0.00 1 183 0.00 0.00 0.00 0 184 1.00 1.00 1.00 1 185 0.00 0.00 0.00 0 186 0.00 0.00 0.00 0 187 0.00 0.00 0.00 0 188 1.00 1.00 1.00 1 189 1.00 1.00 1.00 1 190 0.00 0.00 0.00 1 191 1.00 1.00 1.00 1 micro avg 0.48 0.47 0.48 242 macro avg 0.33 0.35 0.33 242 weighted avg 0.46 0.47 0.44 242 samples avg 0.47 0.47 0.47 242
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter tuning¶
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())
# Setup the parameters
param_dist = {
"decisiontreeclassifier__max_depth": [15, 30, None],
"decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}
# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)
# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train)
y_pred = grid_search_cv.predict(X_test)
# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1} Accuracy: 0.47520661157024796 precision recall f1-score support 0 1.00 0.67 0.80 3 1 0.57 0.57 0.57 7 2 0.33 0.50 0.40 2 3 0.00 0.00 0.00 0 4 0.45 0.33 0.38 15 5 0.43 0.75 0.55 4 6 0.50 0.67 0.57 3 7 0.50 0.33 0.40 3 8 0.83 0.83 0.83 6 9 0.35 0.43 0.39 14 10 0.00 0.00 0.00 1 11 0.25 0.67 0.36 3 12 0.50 0.14 0.22 7 13 1.00 1.00 1.00 1 14 1.00 0.25 0.40 4 15 0.00 0.00 0.00 0 16 0.00 0.00 0.00 0 17 0.50 0.25 0.33 4 18 0.50 0.25 0.33 4 19 1.00 1.00 1.00 1 20 1.00 1.00 1.00 1 21 1.00 1.00 1.00 1 22 1.00 1.00 1.00 1 23 0.00 0.00 0.00 1 24 0.00 0.00 0.00 3 25 0.00 0.00 0.00 2 26 0.80 0.67 0.73 6 27 0.25 1.00 0.40 1 28 0.33 0.22 0.27 9 29 0.00 0.00 0.00 1 30 1.00 1.00 1.00 1 31 1.00 1.00 1.00 1 32 0.50 0.50 0.50 2 33 0.00 0.00 0.00 1 34 0.00 0.00 0.00 0 35 0.00 0.00 0.00 1 36 0.00 0.00 0.00 1 37 0.00 0.00 0.00 1 38 0.00 0.00 0.00 1 39 0.00 0.00 0.00 0 40 0.00 0.00 0.00 1 41 0.00 0.00 0.00 0 42 1.00 1.00 1.00 1 43 1.00 1.00 1.00 1 44 0.73 0.89 0.80 9 45 0.00 0.00 0.00 1 46 0.00 0.00 0.00 0 47 1.00 0.50 0.67 2 48 0.00 0.00 0.00 2 49 0.00 0.00 0.00 1 50 1.00 1.00 1.00 1 51 0.00 0.00 0.00 0 52 0.00 0.00 0.00 2 53 0.00 0.00 0.00 1 54 0.00 0.00 0.00 0 55 0.00 0.00 0.00 1 56 1.00 0.67 0.80 3 57 0.00 0.00 0.00 1 58 0.00 0.00 0.00 1 59 0.00 0.00 0.00 0 60 0.00 0.00 0.00 0 61 0.00 0.00 0.00 0 62 0.00 0.00 0.00 1 63 0.50 0.33 0.40 3 64 1.00 0.50 0.67 2 65 0.00 0.00 0.00 2 66 0.67 0.67 0.67 3 67 0.00 0.00 0.00 1 68 0.50 0.50 0.50 2 69 0.00 0.00 0.00 1 70 0.00 0.00 0.00 0 71 0.00 0.00 0.00 1 72 0.00 0.00 0.00 1 73 0.00 0.00 0.00 0 74 1.00 1.00 1.00 1 75 0.00 0.00 0.00 1 76 0.00 0.00 0.00 1 77 0.00 0.00 0.00 0 78 0.00 0.00 0.00 1 79 0.00 0.00 0.00 1 80 0.00 0.00 0.00 0 81 0.00 0.00 0.00 0 82 0.00 0.00 0.00 1 83 1.00 1.00 1.00 1 84 0.00 0.00 0.00 1 85 0.00 0.00 0.00 1 86 0.00 0.00 0.00 1 87 0.00 0.00 0.00 0 88 1.00 1.00 1.00 1 89 0.00 0.00 0.00 0 90 0.00 0.00 0.00 1 91 1.00 1.00 1.00 1 92 0.00 0.00 0.00 1 93 0.00 0.00 0.00 1 94 0.00 0.00 0.00 0 95 1.00 1.00 1.00 1 96 0.00 0.00 0.00 0 97 0.00 0.00 0.00 0 98 0.00 0.00 0.00 0 99 0.00 0.00 0.00 0 100 0.00 0.00 0.00 1 101 1.00 0.50 0.67 2 102 0.50 1.00 0.67 1 103 1.00 1.00 1.00 1 104 1.00 1.00 1.00 1 105 1.00 1.00 1.00 1 106 0.25 1.00 0.40 1 107 0.00 0.00 0.00 1 108 0.00 0.00 0.00 1 109 0.00 0.00 0.00 1 110 0.50 1.00 0.67 1 111 0.00 0.00 0.00 1 112 0.00 0.00 0.00 1 113 1.00 1.00 1.00 1 114 0.00 0.00 0.00 0 115 0.50 1.00 0.67 1 116 1.00 1.00 1.00 1 117 0.50 1.00 0.67 1 118 0.00 0.00 0.00 1 119 0.00 0.00 0.00 1 120 0.00 0.00 0.00 0 121 0.00 0.00 0.00 1 122 0.00 0.00 0.00 0 123 1.00 1.00 1.00 1 124 0.00 0.00 0.00 0 125 1.00 1.00 1.00 1 126 1.00 1.00 1.00 1 127 1.00 1.00 1.00 1 128 0.00 0.00 0.00 0 129 1.00 1.00 1.00 1 130 0.00 0.00 0.00 0 131 0.00 0.00 0.00 1 132 0.00 0.00 0.00 0 133 0.00 0.00 0.00 0 134 0.00 0.00 0.00 1 135 0.00 0.00 0.00 1 136 1.00 1.00 1.00 1 137 1.00 1.00 1.00 1 138 0.00 0.00 0.00 0 139 0.00 0.00 0.00 1 140 0.00 0.00 0.00 0 141 1.00 1.00 1.00 1 142 1.00 1.00 1.00 1 143 0.50 1.00 0.67 1 144 1.00 1.00 1.00 1 145 1.00 1.00 1.00 1 146 0.00 0.00 0.00 1 147 0.50 1.00 0.67 1 148 0.00 0.00 0.00 0 149 0.00 0.00 0.00 0 150 0.00 0.00 0.00 1 151 0.00 0.00 0.00 0 152 0.00 0.00 0.00 1 153 0.00 0.00 0.00 1 154 0.00 0.00 0.00 0 155 0.00 0.00 0.00 0 156 0.00 0.00 0.00 0 157 0.00 0.00 0.00 0 158 0.00 0.00 0.00 1 159 0.50 0.50 0.50 2 160 0.00 0.00 0.00 0 161 0.00 0.00 0.00 1 162 1.00 1.00 1.00 1 163 0.50 1.00 0.67 1 164 1.00 1.00 1.00 1 165 0.50 1.00 0.67 1 166 0.50 1.00 0.67 1 167 0.00 0.00 0.00 0 168 0.00 0.00 0.00 0 169 0.00 0.00 0.00 0 170 0.00 0.00 0.00 0 171 0.50 1.00 0.67 1 172 0.25 1.00 0.40 1 173 0.00 0.00 0.00 0 174 0.00 0.00 0.00 0 175 1.00 1.00 1.00 1 176 1.00 1.00 1.00 1 177 1.00 1.00 1.00 1 178 1.00 1.00 1.00 1 179 0.00 0.00 0.00 0 180 0.00 0.00 0.00 0 181 1.00 1.00 1.00 1 182 0.00 0.00 0.00 1 183 0.00 0.00 0.00 0 184 1.00 1.00 1.00 1 185 0.00 0.00 0.00 0 186 0.00 0.00 0.00 0 187 0.00 0.00 0.00 0 188 1.00 1.00 1.00 1 189 1.00 1.00 1.00 1 190 0.00 0.00 0.00 1 191 1.00 1.00 1.00 1 micro avg 0.49 0.48 0.48 242 macro avg 0.33 0.35 0.33 242 weighted avg 0.49 0.48 0.46 242 samples avg 0.48 0.48 0.48 242
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Random Forest¶
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
# Calculate accuracy
print("Score: ", score)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.25206611570247933 Accuracy: 0.25206611570247933 precision recall f1-score support 0 1.00 0.33 0.50 3 1 1.00 0.43 0.60 7 2 0.00 0.00 0.00 2 3 0.00 0.00 0.00 0 4 0.80 0.27 0.40 15 5 1.00 0.25 0.40 4 6 0.00 0.00 0.00 3 7 0.00 0.00 0.00 3 8 0.33 0.17 0.22 6 9 0.50 0.14 0.22 14 10 0.00 0.00 0.00 1 11 1.00 0.67 0.80 3 12 0.00 0.00 0.00 7 13 1.00 1.00 1.00 1 14 0.00 0.00 0.00 4 15 0.00 0.00 0.00 0 16 0.00 0.00 0.00 0 17 1.00 0.25 0.40 4 18 0.00 0.00 0.00 4 19 1.00 1.00 1.00 1 20 0.00 0.00 0.00 1 21 0.00 0.00 0.00 1 22 0.00 0.00 0.00 1 23 0.00 0.00 0.00 1 24 0.00 0.00 0.00 3 25 0.00 0.00 0.00 2 26 1.00 0.33 0.50 6 27 1.00 1.00 1.00 1 28 0.00 0.00 0.00 9 29 0.00 0.00 0.00 1 30 0.00 0.00 0.00 1 31 0.00 0.00 0.00 1 32 0.00 0.00 0.00 2 33 0.00 0.00 0.00 1 34 0.00 0.00 0.00 0 35 0.00 0.00 0.00 1 36 0.00 0.00 0.00 1 37 0.00 0.00 0.00 1 38 0.00 0.00 0.00 1 39 0.00 0.00 0.00 0 40 0.00 0.00 0.00 1 41 0.00 0.00 0.00 0 42 1.00 1.00 1.00 1 43 1.00 1.00 1.00 1 44 0.67 0.22 0.33 9 45 0.00 0.00 0.00 1 46 0.00 0.00 0.00 0 47 0.00 0.00 0.00 2 48 0.00 0.00 0.00 2 49 0.00 0.00 0.00 1 50 0.00 0.00 0.00 1 51 0.00 0.00 0.00 0 52 0.00 0.00 0.00 2 53 0.00 0.00 0.00 1 54 0.00 0.00 0.00 0 55 0.00 0.00 0.00 1 56 1.00 0.33 0.50 3 57 0.00 0.00 0.00 1 58 0.00 0.00 0.00 1 59 0.00 0.00 0.00 0 60 0.00 0.00 0.00 0 61 0.00 0.00 0.00 0 62 0.00 0.00 0.00 1 63 0.00 0.00 0.00 3 64 0.00 0.00 0.00 2 65 0.00 0.00 0.00 2 66 0.00 0.00 0.00 3 67 0.00 0.00 0.00 1 68 0.00 0.00 0.00 2 69 0.00 0.00 0.00 1 70 0.00 0.00 0.00 0 71 0.00 0.00 0.00 1 72 0.00 0.00 0.00 1 73 0.00 0.00 0.00 0 74 1.00 1.00 1.00 1 75 0.00 0.00 0.00 1 76 0.00 0.00 0.00 1 77 0.00 0.00 0.00 0 78 0.00 0.00 0.00 1 79 0.00 0.00 0.00 1 80 0.00 0.00 0.00 0 81 0.00 0.00 0.00 0 82 0.00 0.00 0.00 1 83 0.00 0.00 0.00 1 84 0.00 0.00 0.00 1 85 0.00 0.00 0.00 1 86 0.00 0.00 0.00 1 87 0.00 0.00 0.00 0 88 1.00 1.00 1.00 1 89 0.00 0.00 0.00 0 90 0.00 0.00 0.00 1 91 1.00 1.00 1.00 1 92 0.00 0.00 0.00 1 93 0.00 0.00 0.00 1 94 0.00 0.00 0.00 0 95 1.00 1.00 1.00 1 96 0.00 0.00 0.00 0 97 0.00 0.00 0.00 0 98 0.00 0.00 0.00 0 99 0.00 0.00 0.00 0 100 0.00 0.00 0.00 1 101 0.00 0.00 0.00 2 102 0.00 0.00 0.00 1 103 1.00 1.00 1.00 1 104 1.00 1.00 1.00 1 105 1.00 1.00 1.00 1 106 1.00 1.00 1.00 1 107 0.00 0.00 0.00 1 108 0.00 0.00 0.00 1 109 0.00 0.00 0.00 1 110 1.00 1.00 1.00 1 111 0.00 0.00 0.00 1 112 0.00 0.00 0.00 1 113 1.00 1.00 1.00 1 114 0.00 0.00 0.00 0 115 0.00 0.00 0.00 1 116 1.00 1.00 1.00 1 117 1.00 1.00 1.00 1 118 0.00 0.00 0.00 1 119 0.00 0.00 0.00 1 120 0.00 0.00 0.00 0 121 0.00 0.00 0.00 1 122 0.00 0.00 0.00 0 123 1.00 1.00 1.00 1 124 0.00 0.00 0.00 0 125 1.00 1.00 1.00 1 126 0.00 0.00 0.00 1 127 0.00 0.00 0.00 1 128 0.00 0.00 0.00 0 129 1.00 1.00 1.00 1 130 0.00 0.00 0.00 0 131 0.00 0.00 0.00 1 132 0.00 0.00 0.00 0 133 0.00 0.00 0.00 0 134 0.00 0.00 0.00 1 135 0.00 0.00 0.00 1 136 0.00 0.00 0.00 1 137 1.00 1.00 1.00 1 138 0.00 0.00 0.00 0 139 0.00 0.00 0.00 1 140 0.00 0.00 0.00 0 141 1.00 1.00 1.00 1 142 1.00 1.00 1.00 1 143 0.00 0.00 0.00 1 144 1.00 1.00 1.00 1 145 1.00 1.00 1.00 1 146 0.00 0.00 0.00 1 147 1.00 1.00 1.00 1 148 0.00 0.00 0.00 0 149 0.00 0.00 0.00 0 150 0.00 0.00 0.00 1 151 0.00 0.00 0.00 0 152 0.00 0.00 0.00 1 153 0.00 0.00 0.00 1 154 0.00 0.00 0.00 0 155 0.00 0.00 0.00 0 156 0.00 0.00 0.00 0 157 0.00 0.00 0.00 0 158 0.00 0.00 0.00 1 159 1.00 0.50 0.67 2 160 0.00 0.00 0.00 0 161 0.00 0.00 0.00 1 162 1.00 1.00 1.00 1 163 1.00 1.00 1.00 1 164 1.00 1.00 1.00 1 165 1.00 1.00 1.00 1 166 1.00 1.00 1.00 1 167 0.00 0.00 0.00 0 168 0.00 0.00 0.00 0 169 0.00 0.00 0.00 0 170 0.00 0.00 0.00 0 171 1.00 1.00 1.00 1 172 1.00 1.00 1.00 1 173 0.00 0.00 0.00 0 174 0.00 0.00 0.00 0 175 1.00 1.00 1.00 1 176 1.00 1.00 1.00 1 177 1.00 1.00 1.00 1 178 1.00 1.00 1.00 1 179 0.00 0.00 0.00 0 180 0.00 0.00 0.00 0 181 1.00 1.00 1.00 1 182 0.00 0.00 0.00 1 183 0.00 0.00 0.00 0 184 0.00 0.00 0.00 1 185 0.00 0.00 0.00 0 186 0.00 0.00 0.00 0 187 0.00 0.00 0.00 0 188 0.00 0.00 0.00 1 189 1.00 1.00 1.00 1 190 0.00 0.00 0.00 1 191 1.00 1.00 1.00 1 micro avg 0.86 0.25 0.39 242 macro avg 0.26 0.23 0.24 242 weighted avg 0.41 0.25 0.29 242 samples avg 0.25 0.25 0.25 242
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter Tuning¶
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
"randomforestclassifier__max_depth": [15, 30, None],
"randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
"randomforestclassifier__n_estimators": np.arange(80, 120, 5)
}
# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)
# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)
# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
print(classification_report(y_test, y_pred))
Best score is 0.256198347107438 Best parameters are {'randomforestclassifier__n_estimators': 85, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__max_depth': None} precision recall f1-score support 0 1.00 0.33 0.50 3 1 1.00 0.43 0.60 7 2 0.00 0.00 0.00 2 3 0.00 0.00 0.00 0 4 0.80 0.27 0.40 15 5 1.00 0.25 0.40 4 6 0.00 0.00 0.00 3 7 0.00 0.00 0.00 3 8 0.33 0.17 0.22 6 9 0.50 0.14 0.22 14 10 0.00 0.00 0.00 1 11 1.00 0.67 0.80 3 12 0.00 0.00 0.00 7 13 1.00 1.00 1.00 1 14 0.00 0.00 0.00 4 15 0.00 0.00 0.00 0 16 0.00 0.00 0.00 0 17 1.00 0.25 0.40 4 18 0.00 0.00 0.00 4 19 1.00 1.00 1.00 1 20 0.00 0.00 0.00 1 21 0.00 0.00 0.00 1 22 0.00 0.00 0.00 1 23 0.00 0.00 0.00 1 24 0.00 0.00 0.00 3 25 0.00 0.00 0.00 2 26 1.00 0.33 0.50 6 27 1.00 1.00 1.00 1 28 0.00 0.00 0.00 9 29 0.00 0.00 0.00 1 30 0.00 0.00 0.00 1 31 0.00 0.00 0.00 1 32 0.00 0.00 0.00 2 33 0.00 0.00 0.00 1 34 0.00 0.00 0.00 0 35 0.00 0.00 0.00 1 36 0.00 0.00 0.00 1 37 0.00 0.00 0.00 1 38 0.00 0.00 0.00 1 39 0.00 0.00 0.00 0 40 0.00 0.00 0.00 1 41 0.00 0.00 0.00 0 42 1.00 1.00 1.00 1 43 1.00 1.00 1.00 1 44 0.67 0.22 0.33 9 45 0.00 0.00 0.00 1 46 0.00 0.00 0.00 0 47 0.00 0.00 0.00 2 48 0.00 0.00 0.00 2 49 0.00 0.00 0.00 1 50 0.00 0.00 0.00 1 51 0.00 0.00 0.00 0 52 0.00 0.00 0.00 2 53 0.00 0.00 0.00 1 54 0.00 0.00 0.00 0 55 0.00 0.00 0.00 1 56 1.00 0.33 0.50 3 57 0.00 0.00 0.00 1 58 0.00 0.00 0.00 1 59 0.00 0.00 0.00 0 60 0.00 0.00 0.00 0 61 0.00 0.00 0.00 0 62 0.00 0.00 0.00 1 63 0.00 0.00 0.00 3 64 0.00 0.00 0.00 2 65 0.00 0.00 0.00 2 66 0.00 0.00 0.00 3 67 0.00 0.00 0.00 1 68 0.00 0.00 0.00 2 69 0.00 0.00 0.00 1 70 0.00 0.00 0.00 0 71 0.00 0.00 0.00 1 72 0.00 0.00 0.00 1 73 0.00 0.00 0.00 0 74 1.00 1.00 1.00 1 75 0.00 0.00 0.00 1 76 0.00 0.00 0.00 1 77 0.00 0.00 0.00 0 78 0.00 0.00 0.00 1 79 0.00 0.00 0.00 1 80 0.00 0.00 0.00 0 81 0.00 0.00 0.00 0 82 0.00 0.00 0.00 1 83 0.00 0.00 0.00 1 84 0.00 0.00 0.00 1 85 0.00 0.00 0.00 1 86 0.00 0.00 0.00 1 87 0.00 0.00 0.00 0 88 1.00 1.00 1.00 1 89 0.00 0.00 0.00 0 90 0.00 0.00 0.00 1 91 1.00 1.00 1.00 1 92 0.00 0.00 0.00 1 93 0.00 0.00 0.00 1 94 0.00 0.00 0.00 0 95 1.00 1.00 1.00 1 96 0.00 0.00 0.00 0 97 0.00 0.00 0.00 0 98 0.00 0.00 0.00 0 99 0.00 0.00 0.00 0 100 0.00 0.00 0.00 1 101 0.00 0.00 0.00 2 102 0.00 0.00 0.00 1 103 1.00 1.00 1.00 1 104 1.00 1.00 1.00 1 105 1.00 1.00 1.00 1 106 1.00 1.00 1.00 1 107 0.00 0.00 0.00 1 108 0.00 0.00 0.00 1 109 0.00 0.00 0.00 1 110 1.00 1.00 1.00 1 111 0.00 0.00 0.00 1 112 0.00 0.00 0.00 1 113 1.00 1.00 1.00 1 114 0.00 0.00 0.00 0 115 0.00 0.00 0.00 1 116 1.00 1.00 1.00 1 117 1.00 1.00 1.00 1 118 0.00 0.00 0.00 1 119 0.00 0.00 0.00 1 120 0.00 0.00 0.00 0 121 0.00 0.00 0.00 1 122 0.00 0.00 0.00 0 123 1.00 1.00 1.00 1 124 0.00 0.00 0.00 0 125 1.00 1.00 1.00 1 126 0.00 0.00 0.00 1 127 0.00 0.00 0.00 1 128 0.00 0.00 0.00 0 129 1.00 1.00 1.00 1 130 0.00 0.00 0.00 0 131 0.00 0.00 0.00 1 132 0.00 0.00 0.00 0 133 0.00 0.00 0.00 0 134 0.00 0.00 0.00 1 135 0.00 0.00 0.00 1 136 0.00 0.00 0.00 1 137 1.00 1.00 1.00 1 138 0.00 0.00 0.00 0 139 0.00 0.00 0.00 1 140 0.00 0.00 0.00 0 141 1.00 1.00 1.00 1 142 1.00 1.00 1.00 1 143 0.00 0.00 0.00 1 144 1.00 1.00 1.00 1 145 1.00 1.00 1.00 1 146 0.00 0.00 0.00 1 147 1.00 1.00 1.00 1 148 0.00 0.00 0.00 0 149 0.00 0.00 0.00 0 150 0.00 0.00 0.00 1 151 0.00 0.00 0.00 0 152 0.00 0.00 0.00 1 153 0.00 0.00 0.00 1 154 0.00 0.00 0.00 0 155 0.00 0.00 0.00 0 156 0.00 0.00 0.00 0 157 0.00 0.00 0.00 0 158 0.00 0.00 0.00 1 159 1.00 0.50 0.67 2 160 0.00 0.00 0.00 0 161 0.00 0.00 0.00 1 162 1.00 1.00 1.00 1 163 1.00 1.00 1.00 1 164 1.00 1.00 1.00 1 165 1.00 1.00 1.00 1 166 1.00 1.00 1.00 1 167 0.00 0.00 0.00 0 168 0.00 0.00 0.00 0 169 0.00 0.00 0.00 0 170 0.00 0.00 0.00 0 171 1.00 1.00 1.00 1 172 1.00 1.00 1.00 1 173 0.00 0.00 0.00 0 174 0.00 0.00 0.00 0 175 1.00 1.00 1.00 1 176 1.00 1.00 1.00 1 177 1.00 1.00 1.00 1 178 1.00 1.00 1.00 1 179 0.00 0.00 0.00 0 180 0.00 0.00 0.00 0 181 1.00 1.00 1.00 1 182 0.00 0.00 0.00 1 183 0.00 0.00 0.00 0 184 0.00 0.00 0.00 1 185 0.00 0.00 0.00 0 186 0.00 0.00 0.00 0 187 0.00 0.00 0.00 0 188 0.00 0.00 0.00 1 189 1.00 1.00 1.00 1 190 0.00 0.00 0.00 1 191 1.00 1.00 1.00 1 micro avg 0.86 0.25 0.39 242 macro avg 0.26 0.23 0.24 242 weighted avg 0.41 0.25 0.29 242 samples avg 0.25 0.25 0.25 242
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
K Nearest neighbors¶
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.03305785123966942 Accuracy: 0.03305785123966942 precision recall f1-score support 0 0.00 0.00 0.00 3 1 1.00 0.29 0.44 7 2 0.00 0.00 0.00 2 3 0.00 0.00 0.00 0 4 0.00 0.00 0.00 15 5 1.00 0.50 0.67 4 6 0.00 0.00 0.00 3 7 0.00 0.00 0.00 3 8 0.00 0.00 0.00 6 9 0.75 0.21 0.33 14 10 0.00 0.00 0.00 1 11 0.00 0.00 0.00 3 12 0.00 0.00 0.00 7 13 0.00 0.00 0.00 1 14 0.00 0.00 0.00 4 15 0.00 0.00 0.00 0 16 0.00 0.00 0.00 0 17 0.00 0.00 0.00 4 18 0.00 0.00 0.00 4 19 0.00 0.00 0.00 1 20 0.00 0.00 0.00 1 21 0.00 0.00 0.00 1 22 0.00 0.00 0.00 1 23 0.00 0.00 0.00 1 24 0.00 0.00 0.00 3 25 0.00 0.00 0.00 2 26 0.00 0.00 0.00 6 27 0.00 0.00 0.00 1 28 0.00 0.00 0.00 9 29 0.00 0.00 0.00 1 30 0.00 0.00 0.00 1 31 0.00 0.00 0.00 1 32 0.00 0.00 0.00 2 33 0.00 0.00 0.00 1 34 0.00 0.00 0.00 0 35 0.00 0.00 0.00 1 36 0.00 0.00 0.00 1 37 0.00 0.00 0.00 1 38 0.00 0.00 0.00 1 39 0.00 0.00 0.00 0 40 0.00 0.00 0.00 1 41 0.00 0.00 0.00 0 42 0.00 0.00 0.00 1 43 0.00 0.00 0.00 1 44 0.00 0.00 0.00 9 45 0.00 0.00 0.00 1 46 0.00 0.00 0.00 0 47 0.00 0.00 0.00 2 48 0.00 0.00 0.00 2 49 0.00 0.00 0.00 1 50 0.00 0.00 0.00 1 51 0.00 0.00 0.00 0 52 0.00 0.00 0.00 2 53 0.00 0.00 0.00 1 54 0.00 0.00 0.00 0 55 0.00 0.00 0.00 1 56 0.00 0.00 0.00 3 57 0.00 0.00 0.00 1 58 0.00 0.00 0.00 1 59 0.00 0.00 0.00 0 60 0.00 0.00 0.00 0 61 0.00 0.00 0.00 0 62 0.00 0.00 0.00 1 63 0.00 0.00 0.00 3 64 0.00 0.00 0.00 2 65 0.00 0.00 0.00 2 66 0.00 0.00 0.00 3 67 0.00 0.00 0.00 1 68 0.00 0.00 0.00 2 69 0.00 0.00 0.00 1 70 0.00 0.00 0.00 0 71 0.00 0.00 0.00 1 72 0.00 0.00 0.00 1 73 0.00 0.00 0.00 0 74 0.00 0.00 0.00 1 75 0.00 0.00 0.00 1 76 0.00 0.00 0.00 1 77 0.00 0.00 0.00 0 78 0.00 0.00 0.00 1 79 0.00 0.00 0.00 1 80 0.00 0.00 0.00 0 81 0.00 0.00 0.00 0 82 0.00 0.00 0.00 1 83 0.00 0.00 0.00 1 84 0.00 0.00 0.00 1 85 0.00 0.00 0.00 1 86 0.00 0.00 0.00 1 87 0.00 0.00 0.00 0 88 0.00 0.00 0.00 1 89 0.00 0.00 0.00 0 90 0.00 0.00 0.00 1 91 0.00 0.00 0.00 1 92 0.00 0.00 0.00 1 93 0.00 0.00 0.00 1 94 0.00 0.00 0.00 0 95 0.00 0.00 0.00 1 96 0.00 0.00 0.00 0 97 0.00 0.00 0.00 0 98 0.00 0.00 0.00 0 99 0.00 0.00 0.00 0 100 0.00 0.00 0.00 1 101 0.00 0.00 0.00 2 102 0.00 0.00 0.00 1 103 0.00 0.00 0.00 1 104 0.00 0.00 0.00 1 105 0.00 0.00 0.00 1 106 0.00 0.00 0.00 1 107 0.00 0.00 0.00 1 108 0.00 0.00 0.00 1 109 0.00 0.00 0.00 1 110 0.00 0.00 0.00 1 111 0.00 0.00 0.00 1 112 0.00 0.00 0.00 1 113 0.00 0.00 0.00 1 114 0.00 0.00 0.00 0 115 0.00 0.00 0.00 1 116 0.00 0.00 0.00 1 117 0.00 0.00 0.00 1 118 0.00 0.00 0.00 1 119 0.00 0.00 0.00 1 120 0.00 0.00 0.00 0 121 0.00 0.00 0.00 1 122 0.00 0.00 0.00 0 123 0.00 0.00 0.00 1 124 0.00 0.00 0.00 0 125 0.00 0.00 0.00 1 126 0.00 0.00 0.00 1 127 0.00 0.00 0.00 1 128 0.00 0.00 0.00 0 129 0.00 0.00 0.00 1 130 0.00 0.00 0.00 0 131 0.00 0.00 0.00 1 132 0.00 0.00 0.00 0 133 0.00 0.00 0.00 0 134 0.00 0.00 0.00 1 135 0.00 0.00 0.00 1 136 0.00 0.00 0.00 1 137 0.00 0.00 0.00 1 138 0.00 0.00 0.00 0 139 0.00 0.00 0.00 1 140 0.00 0.00 0.00 0 141 0.00 0.00 0.00 1 142 0.00 0.00 0.00 1 143 0.00 0.00 0.00 1 144 0.00 0.00 0.00 1 145 0.00 0.00 0.00 1 146 0.00 0.00 0.00 1 147 0.00 0.00 0.00 1 148 0.00 0.00 0.00 0 149 0.00 0.00 0.00 0 150 0.00 0.00 0.00 1 151 0.00 0.00 0.00 0 152 0.00 0.00 0.00 1 153 0.00 0.00 0.00 1 154 0.00 0.00 0.00 0 155 0.00 0.00 0.00 0 156 0.00 0.00 0.00 0 157 0.00 0.00 0.00 0 158 0.00 0.00 0.00 1 159 0.50 0.50 0.50 2 160 0.00 0.00 0.00 0 161 0.00 0.00 0.00 1 162 0.00 0.00 0.00 1 163 0.00 0.00 0.00 1 164 0.00 0.00 0.00 1 165 0.00 0.00 0.00 1 166 0.00 0.00 0.00 1 167 0.00 0.00 0.00 0 168 0.00 0.00 0.00 0 169 0.00 0.00 0.00 0 170 0.00 0.00 0.00 0 171 0.00 0.00 0.00 1 172 0.00 0.00 0.00 1 173 0.00 0.00 0.00 0 174 0.00 0.00 0.00 0 175 0.00 0.00 0.00 1 176 0.00 0.00 0.00 1 177 0.00 0.00 0.00 1 178 0.00 0.00 0.00 1 179 0.00 0.00 0.00 0 180 0.00 0.00 0.00 0 181 0.00 0.00 0.00 1 182 0.00 0.00 0.00 1 183 0.00 0.00 0.00 0 184 0.00 0.00 0.00 1 185 0.00 0.00 0.00 0 186 0.00 0.00 0.00 0 187 0.00 0.00 0.00 0 188 0.00 0.00 0.00 1 189 0.00 0.00 0.00 1 190 0.00 0.00 0.00 1 191 0.00 0.00 0.00 1 micro avg 0.44 0.03 0.06 242 macro avg 0.02 0.01 0.01 242 weighted avg 0.09 0.03 0.05 242 samples avg 0.03 0.03 0.03 242
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter Tuning¶
param_grid = {
'kneighborsclassifier__n_neighbors': [3, 5, 7, 9] # List of k values to try
}
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
print(classification_report(y_test, y_pred))
Best Parameters: {'kneighborsclassifier__n_neighbors': 3} Best Score: 0.2231404958677686
Ignoring Order of Types 1¶
The first way to ignore order of types is by making a label for each type and matching the data with two labels .
Preprocessing¶
df = preprocessed_df.copy()
# Combine Type 1 and Type 2 into a single column
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)
print(df['Types'][0])
# Get unique Pokémon types
unique_types = np.unique(df['Types'].explode())
df.drop(['type_1', 'type_2'], axis=1, inplace=True)
df.head()
('Grass', 'Poison')
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | Types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | True | False | False | False | False | (Grass, Poison) |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | True | False | False | False | False | (Grass, Poison) |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | True | False | False | False | False | (Grass, Poison) |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | True | False | False | False | False | (Grass, Poison) |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | True | False | False | False | False | (Fire, None) |
5 rows × 545 columns
For each unique type, we create a binary label. The label is 1 if the Pokemon has that type, (so was present in the type combination tuple) and 0 if it doesn't.
# Create binary labels for each Pokémon type
for type in unique_types:
df[type] = df['Types'].apply(lambda x: 1 if type in x else 0)
df.head()
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | Grass | Ground | Ice | None | Normal | Poison | Psychic | Rock | Steel | Water | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 564 columns
# Some type combinations only occur once so we double them to stratify the data better
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]
df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)
Decision tree¶
# Split the data into training and testing sets
y = df[unique_types]
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=unique_types), other_data[unique_types], test_size=0.2, stratify=other_data[unique_types], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=unique_types)])
y_train = pd.concat([y_train, singleton_data[unique_types]])
X_test = pd.concat([X_test, singleton_data.drop(columns=unique_types)])
y_test = pd.concat([y_test, singleton_data[unique_types]])
# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Score: ", model.score(X_test, y_test))
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.49173553719008267 Accuracy: 0.49173553719008267 precision recall f1-score support 0 0.94 0.89 0.92 19 1 0.60 0.50 0.55 18 2 0.71 0.79 0.75 19 3 0.63 0.63 0.63 19 4 0.54 0.82 0.65 17 5 0.79 0.58 0.67 19 6 0.76 0.83 0.79 23 7 0.66 0.68 0.67 28 8 0.68 0.68 0.68 19 9 1.00 0.72 0.84 29 10 0.52 0.55 0.54 20 11 0.29 0.38 0.33 13 12 0.69 0.69 0.69 101 13 0.74 0.74 0.74 27 14 0.62 0.56 0.59 18 15 0.50 0.50 0.50 26 16 0.80 0.53 0.64 15 17 0.43 0.53 0.47 17 18 0.84 0.73 0.78 37 micro avg 0.68 0.67 0.67 484 macro avg 0.67 0.65 0.65 484 weighted avg 0.69 0.67 0.68 484 samples avg 0.68 0.67 0.67 484
Hyperparameter tuning¶
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())
param_dist = {
"decisiontreeclassifier__max_depth": [15, 30, None],
"decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}
# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)
# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train)
y_pred = grid_search_cv.predict(X_test)
# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 1} Accuracy: 0.5165289256198347 precision recall f1-score support 0 0.94 0.89 0.92 19 1 0.52 0.61 0.56 18 2 0.74 0.74 0.74 19 3 0.60 0.63 0.62 19 4 0.60 0.88 0.71 17 5 0.79 0.58 0.67 19 6 0.76 0.83 0.79 23 7 0.66 0.68 0.67 28 8 0.75 0.79 0.77 19 9 0.92 0.76 0.83 29 10 0.73 0.55 0.63 20 11 0.50 0.46 0.48 13 12 0.70 0.68 0.69 101 13 0.70 0.78 0.74 27 14 0.63 0.67 0.65 18 15 0.48 0.38 0.43 26 16 0.64 0.60 0.62 15 17 0.45 0.53 0.49 17 18 0.79 0.73 0.76 37 micro avg 0.69 0.68 0.68 484 macro avg 0.68 0.67 0.67 484 weighted avg 0.69 0.68 0.68 484 samples avg 0.69 0.68 0.68 484
Hyperparameter tuning can sometimes lead to worse results than using default settings. This can occur when the tuning process, typically done via cross-validation on the training data, inadvertently overfits the model. This issue is exacerbated when the dataset has classes with very few members, leading to unreliable splits during cross-validation. However, this is much less the case in this dataset because the dataset is much larger.
Random Forest¶
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.29338842975206614 Accuracy: 0.29338842975206614 precision recall f1-score support 0 1.00 0.89 0.94 19 1 0.80 0.22 0.35 18 2 1.00 0.58 0.73 19 3 1.00 0.37 0.54 19 4 1.00 0.41 0.58 17 5 1.00 0.42 0.59 19 6 1.00 0.61 0.76 23 7 1.00 0.46 0.63 28 8 1.00 0.58 0.73 19 9 1.00 0.69 0.82 29 10 1.00 0.30 0.46 20 11 1.00 0.31 0.47 13 12 0.77 0.73 0.75 101 13 0.82 0.33 0.47 27 14 1.00 0.22 0.36 18 15 1.00 0.35 0.51 26 16 1.00 0.33 0.50 15 17 1.00 0.29 0.45 17 18 1.00 0.51 0.68 37 micro avg 0.91 0.51 0.65 484 macro avg 0.97 0.45 0.60 484 weighted avg 0.93 0.51 0.63 484 samples avg 0.69 0.51 0.57 484
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter tuning¶
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
# Define the parameter grid
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
"randomforestclassifier__max_depth": [15, 30, None],
"randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
"randomforestclassifier__n_estimators": np.arange(120, 140, 4)
}
# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)
# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)
# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
print(classification_report(y_test, y_pred))
Best score is 0.3305785123966942 Best parameters are {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 120}
K Nearest neigbors¶
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.02066115702479339 Accuracy: 0.02066115702479339 precision recall f1-score support 0 0.50 0.16 0.24 19 1 0.00 0.00 0.00 18 2 0.67 0.32 0.43 19 3 1.00 0.21 0.35 19 4 1.00 0.18 0.30 17 5 0.67 0.11 0.18 19 6 0.40 0.09 0.14 23 7 0.38 0.18 0.24 28 8 0.00 0.00 0.00 19 9 0.40 0.07 0.12 29 10 0.50 0.10 0.17 20 11 0.33 0.08 0.12 13 12 0.48 0.46 0.47 101 13 0.67 0.37 0.48 27 14 0.00 0.00 0.00 18 15 1.00 0.08 0.14 26 16 0.00 0.00 0.00 15 17 0.00 0.00 0.00 17 18 0.24 0.11 0.15 37 micro avg 0.49 0.19 0.27 484 macro avg 0.43 0.13 0.19 484 weighted avg 0.45 0.19 0.24 484 samples avg 0.32 0.19 0.23 484
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter tuning¶
param_grid = {
'kneighborsclassifier__n_neighbors': [3, 5, 7, 9] # List of k values to try
}
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
print(classification_report(y_test, y_pred))
Best Parameters: {'kneighborsclassifier__n_neighbors': 3} Best Score: 0.2727272727272727 precision recall f1-score support 0 0.50 0.16 0.24 19 1 0.00 0.00 0.00 18 2 0.67 0.32 0.43 19 3 1.00 0.21 0.35 19 4 1.00 0.18 0.30 17 5 0.67 0.11 0.18 19 6 0.40 0.09 0.14 23 7 0.38 0.18 0.24 28 8 0.00 0.00 0.00 19 9 0.40 0.07 0.12 29 10 0.50 0.10 0.17 20 11 0.33 0.08 0.12 13 12 0.48 0.46 0.47 101 13 0.67 0.37 0.48 27 14 0.00 0.00 0.00 18 15 1.00 0.08 0.14 26 16 0.00 0.00 0.00 15 17 0.00 0.00 0.00 17 18 0.24 0.11 0.15 37 micro avg 0.49 0.19 0.27 484 macro avg 0.43 0.13 0.19 484 weighted avg 0.45 0.19 0.24 484 samples avg 0.32 0.19 0.23 484
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Ignoring Order of Types 2¶
The second way of ignoring order of types is to use binary labels for each sorted type combination.
Preprocessing¶
We again combine the Type 1 and Type 2 columns into a single column that contains a list of type combination tuples that are sorted.
df = preprocessed_df.copy()
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: sorted(tuple(filter(lambda y: pd.notna(y), x))), axis=1)
df['Types'] = df['Types'].astype(str)
# drop the Type 1 and Type 2 columns
df.drop(['type_1', 'type_2'], axis=1, inplace=True)
# print head
df.head()
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Flying | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | Types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | False | True | False | False | False | False | ['Grass', 'Poison'] |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | False | True | False | False | False | False | ['Fire', 'None'] |
5 rows × 545 columns
# Find classes with only one type
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
Make binary labels for each sorted type combination
# Create binary labels for each Pokémon type combination
unique_type_combinations = df['Types'].unique()
for type_combination in unique_type_combinations:
df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]
print("Number of singleton classes",len(singleton_classes))
print("number of unique type combinations",len(df['Types'].unique()))
print(len(df['Types']))
df.head()
Number of singleton classes 23 number of unique type combinations 154 1044
C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0) C:\Users\thors\AppData\Local\Temp\ipykernel_20284\1134825012.py:4: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` df[type_combination] = df['Types'].apply(lambda x: 1 if type in x else 0)
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | ['Fairy', 'Ghost'] | ['Dragon', 'Normal'] | ['Dragon', 'Fighting'] | ['Poison', 'Rock'] | ['Fighting', 'Ghost'] | ['Bug', 'Psychic'] | ['Electric', 'Poison'] | ['Dark', 'Fairy'] | ['Bug', 'Ice'] | ['Dark', 'Electric'] | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 699 columns
# Drop the 'Types' column
df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)
Decision Tree¶
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=unique_type_combinations), other_data[unique_type_combinations], test_size=0.2, stratify=other_data[unique_type_combinations], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=unique_type_combinations)])
y_train = pd.concat([y_train, singleton_data[unique_type_combinations]])
X_test = pd.concat([X_test, singleton_data.drop(columns=unique_type_combinations)])
y_test = pd.concat([y_test, singleton_data[unique_type_combinations]])
# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.9692982456140351 Accuracy: 0.9692982456140351 precision recall f1-score support 0 0.88 0.91 0.90 33 1 0.88 0.91 0.90 33 2 0.88 0.91 0.90 33 3 0.88 0.91 0.90 33 4 0.88 0.91 0.90 33 5 0.88 0.91 0.90 33 6 0.88 0.91 0.90 33 7 0.88 0.91 0.90 33 8 0.88 0.91 0.90 33 9 0.88 0.91 0.90 33 10 0.88 0.91 0.90 33 11 0.88 0.91 0.90 33 12 0.88 0.91 0.90 33 13 0.88 0.91 0.90 33 14 0.88 0.91 0.90 33 15 0.88 0.91 0.90 33 16 0.88 0.91 0.90 33 17 0.88 0.91 0.90 33 18 0.88 0.91 0.90 33 19 0.88 0.91 0.90 33 20 0.88 0.91 0.90 33 21 0.88 0.91 0.90 33 22 0.88 0.91 0.90 33 23 0.88 0.91 0.90 33 24 0.88 0.91 0.90 33 25 0.88 0.91 0.90 33 26 0.88 0.91 0.90 33 27 0.88 0.91 0.90 33 28 0.88 0.91 0.90 33 29 0.88 0.91 0.90 33 30 0.88 0.91 0.90 33 31 0.88 0.91 0.90 33 32 0.88 0.91 0.90 33 33 0.88 0.91 0.90 33 34 0.88 0.91 0.90 33 35 0.88 0.91 0.90 33 36 0.88 0.91 0.90 33 37 0.88 0.91 0.90 33 38 0.88 0.91 0.90 33 39 0.88 0.91 0.90 33 40 0.88 0.91 0.90 33 41 0.88 0.91 0.90 33 42 0.88 0.91 0.90 33 43 0.88 0.91 0.90 33 44 0.88 0.91 0.90 33 45 0.88 0.91 0.90 33 46 0.88 0.91 0.90 33 47 0.88 0.91 0.90 33 48 0.88 0.91 0.90 33 49 0.88 0.91 0.90 33 50 0.88 0.91 0.90 33 51 0.88 0.91 0.90 33 52 0.88 0.91 0.90 33 53 0.88 0.91 0.90 33 54 0.88 0.91 0.90 33 55 0.88 0.91 0.90 33 56 0.88 0.91 0.90 33 57 0.88 0.91 0.90 33 58 0.88 0.91 0.90 33 59 0.88 0.91 0.90 33 60 0.88 0.91 0.90 33 61 0.88 0.91 0.90 33 62 0.88 0.91 0.90 33 63 0.88 0.91 0.90 33 64 0.88 0.91 0.90 33 65 0.88 0.91 0.90 33 66 0.88 0.91 0.90 33 67 0.88 0.91 0.90 33 68 0.88 0.91 0.90 33 69 0.88 0.91 0.90 33 70 0.88 0.91 0.90 33 71 0.88 0.91 0.90 33 72 0.88 0.91 0.90 33 73 0.88 0.91 0.90 33 74 0.88 0.91 0.90 33 75 0.88 0.91 0.90 33 76 0.88 0.91 0.90 33 77 0.88 0.91 0.90 33 78 0.88 0.91 0.90 33 79 0.88 0.91 0.90 33 80 0.88 0.91 0.90 33 81 0.88 0.91 0.90 33 82 0.88 0.91 0.90 33 83 0.88 0.91 0.90 33 84 0.88 0.91 0.90 33 85 0.88 0.91 0.90 33 86 0.88 0.91 0.90 33 87 0.88 0.91 0.90 33 88 0.88 0.91 0.90 33 89 0.88 0.91 0.90 33 90 0.88 0.91 0.90 33 91 0.88 0.91 0.90 33 92 0.88 0.91 0.90 33 93 0.88 0.91 0.90 33 94 0.88 0.91 0.90 33 95 0.88 0.91 0.90 33 96 0.88 0.91 0.90 33 97 0.88 0.91 0.90 33 98 0.88 0.91 0.90 33 99 0.88 0.91 0.90 33 100 0.88 0.91 0.90 33 101 0.88 0.91 0.90 33 102 0.88 0.91 0.90 33 103 0.88 0.91 0.90 33 104 0.88 0.91 0.90 33 105 0.88 0.91 0.90 33 106 0.88 0.91 0.90 33 107 0.88 0.91 0.90 33 108 0.88 0.91 0.90 33 109 0.88 0.91 0.90 33 110 0.88 0.91 0.90 33 111 0.88 0.91 0.90 33 112 0.88 0.91 0.90 33 113 0.88 0.91 0.90 33 114 0.88 0.91 0.90 33 115 0.88 0.91 0.90 33 116 0.88 0.91 0.90 33 117 0.88 0.91 0.90 33 118 0.88 0.91 0.90 33 119 0.88 0.91 0.90 33 120 0.88 0.91 0.90 33 121 0.88 0.91 0.90 33 122 0.88 0.91 0.90 33 123 0.88 0.91 0.90 33 124 0.88 0.91 0.90 33 125 0.88 0.91 0.90 33 126 0.88 0.91 0.90 33 127 0.88 0.91 0.90 33 128 0.88 0.91 0.90 33 129 0.88 0.91 0.90 33 130 0.88 0.91 0.90 33 131 0.88 0.91 0.90 33 132 0.88 0.91 0.90 33 133 0.88 0.91 0.90 33 134 0.88 0.91 0.90 33 135 0.88 0.91 0.90 33 136 0.88 0.91 0.90 33 137 0.88 0.91 0.90 33 138 0.88 0.91 0.90 33 139 0.88 0.91 0.90 33 140 0.88 0.91 0.90 33 141 0.88 0.91 0.90 33 142 0.88 0.91 0.90 33 143 0.88 0.91 0.90 33 144 0.88 0.91 0.90 33 145 0.88 0.91 0.90 33 146 0.88 0.91 0.90 33 147 0.88 0.91 0.90 33 148 0.88 0.91 0.90 33 149 0.88 0.91 0.90 33 150 0.88 0.91 0.90 33 151 0.88 0.91 0.90 33 152 0.88 0.91 0.90 33 153 0.88 0.91 0.90 33 micro avg 0.88 0.91 0.90 5082 macro avg 0.88 0.91 0.90 5082 weighted avg 0.88 0.91 0.90 5082 samples avg 0.13 0.13 0.13 5082
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Here we have the highest accuracy score of all the models.
Hyperparameter Tuning¶
pipeline = make_pipeline(StandardScaler(), DecisionTreeClassifier())
param_dist = {
"decisiontreeclassifier__max_depth": [15, 30, None],
"decisiontreeclassifier__min_samples_leaf": np.arange(1, 10)
}
# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)
# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train)
y_pred = grid_search_cv.predict(X_test)
# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Accuracy: {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
Tuned Model Parameters: {'decisiontreeclassifier__max_depth': 30, 'decisiontreeclassifier__min_samples_leaf': 1} Accuracy: 0.9692982456140351 precision recall f1-score support 0 0.86 0.94 0.90 33 1 0.86 0.94 0.90 33 2 0.86 0.94 0.90 33 3 0.86 0.94 0.90 33 4 0.86 0.94 0.90 33 5 0.86 0.94 0.90 33 6 0.86 0.94 0.90 33 7 0.86 0.94 0.90 33 8 0.86 0.94 0.90 33 9 0.86 0.94 0.90 33 10 0.86 0.94 0.90 33 11 0.86 0.94 0.90 33 12 0.86 0.94 0.90 33 13 0.86 0.94 0.90 33 14 0.86 0.94 0.90 33 15 0.86 0.94 0.90 33 16 0.86 0.94 0.90 33 17 0.86 0.94 0.90 33 18 0.86 0.94 0.90 33 19 0.86 0.94 0.90 33 20 0.86 0.94 0.90 33 21 0.86 0.94 0.90 33 22 0.86 0.94 0.90 33 23 0.86 0.94 0.90 33 24 0.86 0.94 0.90 33 25 0.86 0.94 0.90 33 26 0.86 0.94 0.90 33 27 0.86 0.94 0.90 33 28 0.86 0.94 0.90 33 29 0.86 0.94 0.90 33 30 0.86 0.94 0.90 33 31 0.86 0.94 0.90 33 32 0.86 0.94 0.90 33 33 0.86 0.94 0.90 33 34 0.86 0.94 0.90 33 35 0.86 0.94 0.90 33 36 0.86 0.94 0.90 33 37 0.86 0.94 0.90 33 38 0.86 0.94 0.90 33 39 0.86 0.94 0.90 33 40 0.86 0.94 0.90 33 41 0.86 0.94 0.90 33 42 0.86 0.94 0.90 33 43 0.86 0.94 0.90 33 44 0.86 0.94 0.90 33 45 0.86 0.94 0.90 33 46 0.86 0.94 0.90 33 47 0.86 0.94 0.90 33 48 0.86 0.94 0.90 33 49 0.86 0.94 0.90 33 50 0.86 0.94 0.90 33 51 0.86 0.94 0.90 33 52 0.86 0.94 0.90 33 53 0.86 0.94 0.90 33 54 0.86 0.94 0.90 33 55 0.86 0.94 0.90 33 56 0.86 0.94 0.90 33 57 0.86 0.94 0.90 33 58 0.86 0.94 0.90 33 59 0.86 0.94 0.90 33 60 0.86 0.94 0.90 33 61 0.86 0.94 0.90 33 62 0.86 0.94 0.90 33 63 0.86 0.94 0.90 33 64 0.86 0.94 0.90 33 65 0.86 0.94 0.90 33 66 0.86 0.94 0.90 33 67 0.86 0.94 0.90 33 68 0.86 0.94 0.90 33 69 0.86 0.94 0.90 33 70 0.86 0.94 0.90 33 71 0.86 0.94 0.90 33 72 0.86 0.94 0.90 33 73 0.86 0.94 0.90 33 74 0.86 0.94 0.90 33 75 0.86 0.94 0.90 33 76 0.86 0.94 0.90 33 77 0.86 0.94 0.90 33 78 0.86 0.94 0.90 33 79 0.86 0.94 0.90 33 80 0.86 0.94 0.90 33 81 0.86 0.94 0.90 33 82 0.86 0.94 0.90 33 83 0.86 0.94 0.90 33 84 0.86 0.94 0.90 33 85 0.86 0.94 0.90 33 86 0.86 0.94 0.90 33 87 0.86 0.94 0.90 33 88 0.86 0.94 0.90 33 89 0.86 0.94 0.90 33 90 0.86 0.94 0.90 33 91 0.86 0.94 0.90 33 92 0.86 0.94 0.90 33 93 0.86 0.94 0.90 33 94 0.86 0.94 0.90 33 95 0.86 0.94 0.90 33 96 0.86 0.94 0.90 33 97 0.86 0.94 0.90 33 98 0.86 0.94 0.90 33 99 0.86 0.94 0.90 33 100 0.86 0.94 0.90 33 101 0.86 0.94 0.90 33 102 0.86 0.94 0.90 33 103 0.86 0.94 0.90 33 104 0.86 0.94 0.90 33 105 0.86 0.94 0.90 33 106 0.86 0.94 0.90 33 107 0.86 0.94 0.90 33 108 0.86 0.94 0.90 33 109 0.86 0.94 0.90 33 110 0.86 0.94 0.90 33 111 0.86 0.94 0.90 33 112 0.86 0.94 0.90 33 113 0.86 0.94 0.90 33 114 0.86 0.94 0.90 33 115 0.86 0.94 0.90 33 116 0.86 0.94 0.90 33 117 0.86 0.94 0.90 33 118 0.86 0.94 0.90 33 119 0.86 0.94 0.90 33 120 0.86 0.94 0.90 33 121 0.86 0.94 0.90 33 122 0.86 0.94 0.90 33 123 0.86 0.94 0.90 33 124 0.86 0.94 0.90 33 125 0.86 0.94 0.90 33 126 0.86 0.94 0.90 33 127 0.86 0.94 0.90 33 128 0.86 0.94 0.90 33 129 0.86 0.94 0.90 33 130 0.86 0.94 0.90 33 131 0.86 0.94 0.90 33 132 0.86 0.94 0.90 33 133 0.86 0.94 0.90 33 134 0.86 0.94 0.90 33 135 0.86 0.94 0.90 33 136 0.86 0.94 0.90 33 137 0.86 0.94 0.90 33 138 0.86 0.94 0.90 33 139 0.86 0.94 0.90 33 140 0.86 0.94 0.90 33 141 0.86 0.94 0.90 33 142 0.86 0.94 0.90 33 143 0.86 0.94 0.90 33 144 0.86 0.94 0.90 33 145 0.86 0.94 0.90 33 146 0.86 0.94 0.90 33 147 0.86 0.94 0.90 33 148 0.86 0.94 0.90 33 149 0.86 0.94 0.90 33 150 0.86 0.94 0.90 33 151 0.86 0.94 0.90 33 152 0.86 0.94 0.90 33 153 0.86 0.94 0.90 33 micro avg 0.86 0.94 0.90 5082 macro avg 0.86 0.94 0.90 5082 weighted avg 0.86 0.94 0.90 5082 samples avg 0.14 0.14 0.14 5082
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Random forests¶
# Initialize and train the decision tree classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test,y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.9385964912280702 Accuracy: 0.9385964912280702 precision recall f1-score support 0 0.95 0.61 0.74 33 1 0.95 0.61 0.74 33 2 0.95 0.61 0.74 33 3 0.95 0.61 0.74 33 4 0.95 0.61 0.74 33 5 0.95 0.61 0.74 33 6 0.95 0.61 0.74 33 7 0.95 0.61 0.74 33 8 0.95 0.61 0.74 33 9 0.95 0.61 0.74 33 10 0.95 0.61 0.74 33 11 0.95 0.61 0.74 33 12 0.95 0.61 0.74 33 13 0.95 0.61 0.74 33 14 0.95 0.61 0.74 33 15 0.95 0.61 0.74 33 16 0.95 0.61 0.74 33 17 0.95 0.61 0.74 33 18 0.95 0.61 0.74 33 19 0.95 0.61 0.74 33 20 0.95 0.61 0.74 33 21 0.95 0.61 0.74 33 22 0.95 0.61 0.74 33 23 0.95 0.61 0.74 33 24 0.95 0.61 0.74 33 25 0.95 0.61 0.74 33 26 0.95 0.61 0.74 33 27 0.95 0.61 0.74 33 28 0.95 0.61 0.74 33 29 0.95 0.61 0.74 33 30 0.95 0.61 0.74 33 31 0.95 0.61 0.74 33 32 0.95 0.61 0.74 33 33 0.95 0.61 0.74 33 34 0.95 0.61 0.74 33 35 0.95 0.61 0.74 33 36 0.95 0.61 0.74 33 37 0.95 0.61 0.74 33 38 0.95 0.61 0.74 33 39 0.95 0.61 0.74 33 40 0.95 0.61 0.74 33 41 0.95 0.61 0.74 33 42 0.95 0.61 0.74 33 43 0.95 0.61 0.74 33 44 0.95 0.61 0.74 33 45 0.95 0.61 0.74 33 46 0.95 0.61 0.74 33 47 0.95 0.61 0.74 33 48 0.95 0.61 0.74 33 49 0.95 0.61 0.74 33 50 0.95 0.61 0.74 33 51 0.95 0.61 0.74 33 52 0.95 0.61 0.74 33 53 0.95 0.61 0.74 33 54 0.95 0.61 0.74 33 55 0.95 0.61 0.74 33 56 0.95 0.61 0.74 33 57 0.95 0.61 0.74 33 58 0.95 0.61 0.74 33 59 0.95 0.61 0.74 33 60 0.95 0.61 0.74 33 61 0.95 0.61 0.74 33 62 0.95 0.61 0.74 33 63 0.95 0.61 0.74 33 64 0.95 0.61 0.74 33 65 0.95 0.61 0.74 33 66 0.95 0.61 0.74 33 67 0.95 0.61 0.74 33 68 0.95 0.61 0.74 33 69 0.95 0.61 0.74 33 70 0.95 0.61 0.74 33 71 0.95 0.61 0.74 33 72 0.95 0.61 0.74 33 73 0.95 0.61 0.74 33 74 0.95 0.61 0.74 33 75 0.95 0.61 0.74 33 76 0.95 0.61 0.74 33 77 0.95 0.61 0.74 33 78 0.95 0.61 0.74 33 79 0.95 0.61 0.74 33 80 0.95 0.61 0.74 33 81 0.95 0.61 0.74 33 82 0.95 0.61 0.74 33 83 0.95 0.61 0.74 33 84 0.95 0.61 0.74 33 85 0.95 0.61 0.74 33 86 0.95 0.61 0.74 33 87 0.95 0.61 0.74 33 88 0.95 0.61 0.74 33 89 0.95 0.61 0.74 33 90 0.95 0.61 0.74 33 91 0.95 0.61 0.74 33 92 0.95 0.61 0.74 33 93 0.95 0.61 0.74 33 94 0.95 0.61 0.74 33 95 0.95 0.61 0.74 33 96 0.95 0.61 0.74 33 97 0.95 0.61 0.74 33 98 0.95 0.61 0.74 33 99 0.95 0.61 0.74 33 100 0.95 0.61 0.74 33 101 0.95 0.61 0.74 33 102 0.95 0.61 0.74 33 103 0.95 0.61 0.74 33 104 0.95 0.61 0.74 33 105 0.95 0.61 0.74 33 106 0.95 0.61 0.74 33 107 0.95 0.61 0.74 33 108 0.95 0.61 0.74 33 109 0.95 0.61 0.74 33 110 0.95 0.61 0.74 33 111 0.95 0.61 0.74 33 112 0.95 0.61 0.74 33 113 0.95 0.61 0.74 33 114 0.95 0.61 0.74 33 115 0.95 0.61 0.74 33 116 0.95 0.61 0.74 33 117 0.95 0.61 0.74 33 118 0.95 0.61 0.74 33 119 0.95 0.61 0.74 33 120 0.95 0.61 0.74 33 121 0.95 0.61 0.74 33 122 0.95 0.61 0.74 33 123 0.95 0.61 0.74 33 124 0.95 0.61 0.74 33 125 0.95 0.61 0.74 33 126 0.95 0.61 0.74 33 127 0.95 0.61 0.74 33 128 0.95 0.61 0.74 33 129 0.95 0.61 0.74 33 130 0.95 0.61 0.74 33 131 0.95 0.61 0.74 33 132 0.95 0.61 0.74 33 133 0.95 0.61 0.74 33 134 0.95 0.61 0.74 33 135 0.95 0.61 0.74 33 136 0.95 0.61 0.74 33 137 0.95 0.61 0.74 33 138 0.95 0.61 0.74 33 139 0.95 0.61 0.74 33 140 0.95 0.61 0.74 33 141 0.95 0.61 0.74 33 142 0.95 0.61 0.74 33 143 0.95 0.61 0.74 33 144 0.95 0.61 0.74 33 145 0.95 0.61 0.74 33 146 0.95 0.61 0.74 33 147 0.95 0.61 0.74 33 148 0.95 0.61 0.74 33 149 0.95 0.61 0.74 33 150 0.95 0.61 0.74 33 151 0.95 0.61 0.74 33 152 0.95 0.61 0.74 33 153 0.95 0.61 0.74 33 micro avg 0.95 0.61 0.74 5082 macro avg 0.95 0.61 0.74 5082 weighted avg 0.95 0.61 0.74 5082 samples avg 0.09 0.09 0.09 5082
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter Tuning¶
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
"randomforestclassifier__max_depth": [28, 30, 32, None],
"randomforestclassifier__min_samples_leaf": np.arange(1, 10, 4),
"randomforestclassifier__n_estimators": np.arange(60, 100, 4)
}
# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)
# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)
# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
print(classification_report(y_test, y_pred))
Best score is 0.956140350877193 Best parameters are {'randomforestclassifier__max_depth': 28, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__n_estimators': 68}
K Nearest Neighbors¶
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy
print("Score: ", model.score(X_test, y_test))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
Score: 0.8333333333333334 Accuracy: 0.8333333333333334 precision recall f1-score support 0 0.14 0.03 0.05 33 1 0.14 0.03 0.05 33 2 0.14 0.03 0.05 33 3 0.14 0.03 0.05 33 4 0.14 0.03 0.05 33 5 0.14 0.03 0.05 33 6 0.14 0.03 0.05 33 7 0.14 0.03 0.05 33 8 0.14 0.03 0.05 33 9 0.14 0.03 0.05 33 10 0.14 0.03 0.05 33 11 0.14 0.03 0.05 33 12 0.14 0.03 0.05 33 13 0.14 0.03 0.05 33 14 0.14 0.03 0.05 33 15 0.14 0.03 0.05 33 16 0.14 0.03 0.05 33 17 0.14 0.03 0.05 33 18 0.14 0.03 0.05 33 19 0.14 0.03 0.05 33 20 0.14 0.03 0.05 33 21 0.14 0.03 0.05 33 22 0.14 0.03 0.05 33 23 0.14 0.03 0.05 33 24 0.14 0.03 0.05 33 25 0.14 0.03 0.05 33 26 0.14 0.03 0.05 33 27 0.14 0.03 0.05 33 28 0.14 0.03 0.05 33 29 0.14 0.03 0.05 33 30 0.14 0.03 0.05 33 31 0.14 0.03 0.05 33 32 0.14 0.03 0.05 33 33 0.14 0.03 0.05 33 34 0.14 0.03 0.05 33 35 0.14 0.03 0.05 33 36 0.14 0.03 0.05 33 37 0.14 0.03 0.05 33 38 0.14 0.03 0.05 33 39 0.14 0.03 0.05 33 40 0.14 0.03 0.05 33 41 0.14 0.03 0.05 33 42 0.14 0.03 0.05 33 43 0.14 0.03 0.05 33 44 0.14 0.03 0.05 33 45 0.14 0.03 0.05 33 46 0.14 0.03 0.05 33 47 0.14 0.03 0.05 33 48 0.14 0.03 0.05 33 49 0.14 0.03 0.05 33 50 0.14 0.03 0.05 33 51 0.14 0.03 0.05 33 52 0.14 0.03 0.05 33 53 0.14 0.03 0.05 33 54 0.14 0.03 0.05 33 55 0.14 0.03 0.05 33 56 0.14 0.03 0.05 33 57 0.14 0.03 0.05 33 58 0.14 0.03 0.05 33 59 0.14 0.03 0.05 33 60 0.14 0.03 0.05 33 61 0.14 0.03 0.05 33 62 0.14 0.03 0.05 33 63 0.14 0.03 0.05 33 64 0.14 0.03 0.05 33 65 0.14 0.03 0.05 33 66 0.14 0.03 0.05 33 67 0.14 0.03 0.05 33 68 0.14 0.03 0.05 33 69 0.14 0.03 0.05 33 70 0.14 0.03 0.05 33 71 0.14 0.03 0.05 33 72 0.14 0.03 0.05 33 73 0.14 0.03 0.05 33 74 0.14 0.03 0.05 33 75 0.14 0.03 0.05 33 76 0.14 0.03 0.05 33 77 0.14 0.03 0.05 33 78 0.14 0.03 0.05 33 79 0.14 0.03 0.05 33 80 0.14 0.03 0.05 33 81 0.14 0.03 0.05 33 82 0.14 0.03 0.05 33 83 0.14 0.03 0.05 33 84 0.14 0.03 0.05 33 85 0.14 0.03 0.05 33 86 0.14 0.03 0.05 33 87 0.14 0.03 0.05 33 88 0.14 0.03 0.05 33 89 0.14 0.03 0.05 33 90 0.14 0.03 0.05 33 91 0.14 0.03 0.05 33 92 0.14 0.03 0.05 33 93 0.14 0.03 0.05 33 94 0.14 0.03 0.05 33 95 0.14 0.03 0.05 33 96 0.14 0.03 0.05 33 97 0.14 0.03 0.05 33 98 0.14 0.03 0.05 33 99 0.14 0.03 0.05 33 100 0.14 0.03 0.05 33 101 0.14 0.03 0.05 33 102 0.14 0.03 0.05 33 103 0.14 0.03 0.05 33 104 0.14 0.03 0.05 33 105 0.14 0.03 0.05 33 106 0.14 0.03 0.05 33 107 0.14 0.03 0.05 33 108 0.14 0.03 0.05 33 109 0.14 0.03 0.05 33 110 0.14 0.03 0.05 33 111 0.14 0.03 0.05 33 112 0.14 0.03 0.05 33 113 0.14 0.03 0.05 33 114 0.14 0.03 0.05 33 115 0.14 0.03 0.05 33 116 0.14 0.03 0.05 33 117 0.14 0.03 0.05 33 118 0.14 0.03 0.05 33 119 0.14 0.03 0.05 33 120 0.14 0.03 0.05 33 121 0.14 0.03 0.05 33 122 0.14 0.03 0.05 33 123 0.14 0.03 0.05 33 124 0.14 0.03 0.05 33 125 0.14 0.03 0.05 33 126 0.14 0.03 0.05 33 127 0.14 0.03 0.05 33 128 0.14 0.03 0.05 33 129 0.14 0.03 0.05 33 130 0.14 0.03 0.05 33 131 0.14 0.03 0.05 33 132 0.14 0.03 0.05 33 133 0.14 0.03 0.05 33 134 0.14 0.03 0.05 33 135 0.14 0.03 0.05 33 136 0.14 0.03 0.05 33 137 0.14 0.03 0.05 33 138 0.14 0.03 0.05 33 139 0.14 0.03 0.05 33 140 0.14 0.03 0.05 33 141 0.14 0.03 0.05 33 142 0.14 0.03 0.05 33 143 0.14 0.03 0.05 33 144 0.14 0.03 0.05 33 145 0.14 0.03 0.05 33 146 0.14 0.03 0.05 33 147 0.14 0.03 0.05 33 148 0.14 0.03 0.05 33 149 0.14 0.03 0.05 33 150 0.14 0.03 0.05 33 151 0.14 0.03 0.05 33 152 0.14 0.03 0.05 33 153 0.14 0.03 0.05 33 micro avg 0.14 0.03 0.05 5082 macro avg 0.14 0.03 0.05 5082 weighted avg 0.14 0.03 0.05 5082 samples avg 0.00 0.00 0.00 5082
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Hyperparameter Tuning¶
from sklearn.neighbors import KNeighborsClassifier
param_grid = {
'kneighborsclassifier__n_neighbors': [3, 5, 7, 9] # List of k values to try
}
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
print(classification_report(y_test, y_pred))
Best Parameters: {'kneighborsclassifier__n_neighbors': 3} Best Score: 0.9035087719298246 precision recall f1-score support 0 0.14 0.03 0.05 33 1 0.14 0.03 0.05 33 2 0.14 0.03 0.05 33 3 0.14 0.03 0.05 33 4 0.14 0.03 0.05 33 5 0.14 0.03 0.05 33 6 0.14 0.03 0.05 33 7 0.14 0.03 0.05 33 8 0.14 0.03 0.05 33 9 0.14 0.03 0.05 33 10 0.14 0.03 0.05 33 11 0.14 0.03 0.05 33 12 0.14 0.03 0.05 33 13 0.14 0.03 0.05 33 14 0.14 0.03 0.05 33 15 0.14 0.03 0.05 33 16 0.14 0.03 0.05 33 17 0.14 0.03 0.05 33 18 0.14 0.03 0.05 33 19 0.14 0.03 0.05 33 20 0.14 0.03 0.05 33 21 0.14 0.03 0.05 33 22 0.14 0.03 0.05 33 23 0.14 0.03 0.05 33 24 0.14 0.03 0.05 33 25 0.14 0.03 0.05 33 26 0.14 0.03 0.05 33 27 0.14 0.03 0.05 33 28 0.14 0.03 0.05 33 29 0.14 0.03 0.05 33 30 0.14 0.03 0.05 33 31 0.14 0.03 0.05 33 32 0.14 0.03 0.05 33 33 0.14 0.03 0.05 33 34 0.14 0.03 0.05 33 35 0.14 0.03 0.05 33 36 0.14 0.03 0.05 33 37 0.14 0.03 0.05 33 38 0.14 0.03 0.05 33 39 0.14 0.03 0.05 33 40 0.14 0.03 0.05 33 41 0.14 0.03 0.05 33 42 0.14 0.03 0.05 33 43 0.14 0.03 0.05 33 44 0.14 0.03 0.05 33 45 0.14 0.03 0.05 33 46 0.14 0.03 0.05 33 47 0.14 0.03 0.05 33 48 0.14 0.03 0.05 33 49 0.14 0.03 0.05 33 50 0.14 0.03 0.05 33 51 0.14 0.03 0.05 33 52 0.14 0.03 0.05 33 53 0.14 0.03 0.05 33 54 0.14 0.03 0.05 33 55 0.14 0.03 0.05 33 56 0.14 0.03 0.05 33 57 0.14 0.03 0.05 33 58 0.14 0.03 0.05 33 59 0.14 0.03 0.05 33 60 0.14 0.03 0.05 33 61 0.14 0.03 0.05 33 62 0.14 0.03 0.05 33 63 0.14 0.03 0.05 33 64 0.14 0.03 0.05 33 65 0.14 0.03 0.05 33 66 0.14 0.03 0.05 33 67 0.14 0.03 0.05 33 68 0.14 0.03 0.05 33 69 0.14 0.03 0.05 33 70 0.14 0.03 0.05 33 71 0.14 0.03 0.05 33 72 0.14 0.03 0.05 33 73 0.14 0.03 0.05 33 74 0.14 0.03 0.05 33 75 0.14 0.03 0.05 33 76 0.14 0.03 0.05 33 77 0.14 0.03 0.05 33 78 0.14 0.03 0.05 33 79 0.14 0.03 0.05 33 80 0.14 0.03 0.05 33 81 0.14 0.03 0.05 33 82 0.14 0.03 0.05 33 83 0.14 0.03 0.05 33 84 0.14 0.03 0.05 33 85 0.14 0.03 0.05 33 86 0.14 0.03 0.05 33 87 0.14 0.03 0.05 33 88 0.14 0.03 0.05 33 89 0.14 0.03 0.05 33 90 0.14 0.03 0.05 33 91 0.14 0.03 0.05 33 92 0.14 0.03 0.05 33 93 0.14 0.03 0.05 33 94 0.14 0.03 0.05 33 95 0.14 0.03 0.05 33 96 0.14 0.03 0.05 33 97 0.14 0.03 0.05 33 98 0.14 0.03 0.05 33 99 0.14 0.03 0.05 33 100 0.14 0.03 0.05 33 101 0.14 0.03 0.05 33 102 0.14 0.03 0.05 33 103 0.14 0.03 0.05 33 104 0.14 0.03 0.05 33 105 0.14 0.03 0.05 33 106 0.14 0.03 0.05 33 107 0.14 0.03 0.05 33 108 0.14 0.03 0.05 33 109 0.14 0.03 0.05 33 110 0.14 0.03 0.05 33 111 0.14 0.03 0.05 33 112 0.14 0.03 0.05 33 113 0.14 0.03 0.05 33 114 0.14 0.03 0.05 33 115 0.14 0.03 0.05 33 116 0.14 0.03 0.05 33 117 0.14 0.03 0.05 33 118 0.14 0.03 0.05 33 119 0.14 0.03 0.05 33 120 0.14 0.03 0.05 33 121 0.14 0.03 0.05 33 122 0.14 0.03 0.05 33 123 0.14 0.03 0.05 33 124 0.14 0.03 0.05 33 125 0.14 0.03 0.05 33 126 0.14 0.03 0.05 33 127 0.14 0.03 0.05 33 128 0.14 0.03 0.05 33 129 0.14 0.03 0.05 33 130 0.14 0.03 0.05 33 131 0.14 0.03 0.05 33 132 0.14 0.03 0.05 33 133 0.14 0.03 0.05 33 134 0.14 0.03 0.05 33 135 0.14 0.03 0.05 33 136 0.14 0.03 0.05 33 137 0.14 0.03 0.05 33 138 0.14 0.03 0.05 33 139 0.14 0.03 0.05 33 140 0.14 0.03 0.05 33 141 0.14 0.03 0.05 33 142 0.14 0.03 0.05 33 143 0.14 0.03 0.05 33 144 0.14 0.03 0.05 33 145 0.14 0.03 0.05 33 146 0.14 0.03 0.05 33 147 0.14 0.03 0.05 33 148 0.14 0.03 0.05 33 149 0.14 0.03 0.05 33 150 0.14 0.03 0.05 33 151 0.14 0.03 0.05 33 152 0.14 0.03 0.05 33 153 0.14 0.03 0.05 33 micro avg 0.14 0.03 0.05 5082 macro avg 0.14 0.03 0.05 5082 weighted avg 0.14 0.03 0.05 5082 samples avg 0.00 0.00 0.00 5082
C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\thors\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_classification.py:1497: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in samples with no true nor predicted labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
The accuracy scores generally appear to be higher for multilabel classifiers than for the multiclass classifiers. However, we should take these accuracy scores for the multilabel classifiers with a grain of salt. In a multilabel setting, accuracy can be deceptively high when the majority of labels are negative. Since only one type combination (or two types) can be true for each Pokémon (row), all others are false, meaning we are definitely dealing with an imbalanced dataset. Recall is a mathematical measure of how many of the actual positive cases were correctly identified by a model. As we can see, recall scores are generally pretty low for the multilabel classifiers.
Multiclass multioutput Classification¶
Finally, there is Multiclass multioutput Classification, which is supported natively by some scikit-learn models.
Preprocessing¶
Here, we don't want to drop the Type 1 and Type 2 columns because we want to use them as the y vector.
df = preprocessed_df.copy()
# Some type combinations only occur once so we extract them and add them to the test data
df['Types'] = df[['type_1', 'type_2']].apply(lambda x: tuple(filter(lambda y: pd.notna(y), x)), axis=1)
singleton_classes = df['Types'].value_counts()[df['Types'].value_counts() == 1].index.tolist()
singleton_data = df[df['Types'].isin(singleton_classes)]
other_data = df[~df['Types'].isin(singleton_classes)]
df = df.drop(columns=['Types'])
other_data.drop(columns=['Types'], inplace=True)
singleton_data.drop(columns=['Types'], inplace=True)
y = df[['type_1', 'type_2']]
df.head()
generation | height_m | weight_kg | abilities_number | total_points | hp | attack | defense | sp_attack | sp_defense | ... | egg_type_2_Grass | egg_type_2_Human-Like | egg_type_2_Mineral | egg_type_2_Monster | egg_type_2_None | egg_type_2_Water 1 | egg_type_2_Water 2 | egg_type_2_Water 3 | type_1 | type_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.7 | 6.9 | 2 | 318 | 45 | 49 | 49 | 65 | 65 | ... | False | False | False | True | False | False | False | False | Grass | Poison |
1 | 1 | 1.0 | 13.0 | 2 | 405 | 60 | 62 | 63 | 80 | 80 | ... | False | False | False | True | False | False | False | False | Grass | Poison |
2 | 1 | 2.0 | 100.0 | 2 | 525 | 80 | 82 | 83 | 100 | 100 | ... | False | False | False | True | False | False | False | False | Grass | Poison |
3 | 1 | 2.4 | 155.5 | 1 | 625 | 80 | 100 | 123 | 122 | 120 | ... | False | False | False | True | False | False | False | False | Grass | Poison |
4 | 1 | 0.6 | 8.5 | 2 | 309 | 39 | 52 | 43 | 60 | 50 | ... | False | False | False | True | False | False | False | False | Fire | None |
5 rows × 546 columns
y.head()
type_1 | type_2 | |
---|---|---|
0 | Grass | Poison |
1 | Grass | Poison |
2 | Grass | Poison |
3 | Grass | Poison |
4 | Fire | None |
Decision Tree¶
We used a multioutput classifier to measure the accuracy score.
We calculated the accuracy for each type individually and together (where both types are correctly predicted).
We made our own method to calculate the accuracy for both types together and concluded it was similar to .score.
from sklearn.multioutput import MultiOutputClassifier
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(other_data.drop(columns=['type_1', 'type_2']), other_data[['type_1', 'type_2']], test_size=0.2, stratify=other_data[['type_1', 'type_2']], random_state=42)
X_train = pd.concat([X_train, singleton_data.drop(columns=['type_1', 'type_2'])])
y_train = pd.concat([y_train, singleton_data[['type_1', 'type_2']]])
X_test = pd.concat([X_test, singleton_data.drop(columns=['type_1', 'type_2'])])
y_test = pd.concat([y_test, singleton_data[['type_1', 'type_2']]])
base_classifier = DecisionTreeClassifier()
multi_output_classifier = MultiOutputClassifier(base_classifier)
multi_output_classifier.fit(X_train, y_train)
base_classifier.fit(X_train, y_train)
# Step 4: Model Evaluation
y_pred = multi_output_classifier.predict(X_test)
# Evaluate
print("Score: ", multi_output_classifier.score(X_test, y_test))
y_pred = base_classifier.predict(X_test)
# our own score function
a = (y_test == y_pred)
b = []
for i, j in enumerate(a.iterrows()):
b.append(j[1]['type_1'] and j[1]['type_2'])
nb_correct = 0
for i in b:
if i:
nb_correct += 1
score_ratio = nb_correct/len(b)
print("score ratio: ",score_ratio)
# accuracy score for each type
accuracy_list=[]
y_test = np.asarray(y_test)
y_pred = np.asarray(y_pred)
for i in range(2):
accuracy = accuracy_score(y_test[:, i], y_pred[:, i])
accuracy_list.append(accuracy)
print("Accuracy type ", i+1, ": ", accuracy )
print("Averaged Accuracy for types: ",np.mean(accuracy_list))
Score: 0.4049586776859504 score ratio: 0.45867768595041325 Accuracy type 1 : 0.6487603305785123 Accuracy type 2 : 0.6074380165289256 Averaged Accuracy for types: 0.6280991735537189
Hyperparameter Tuning¶
pipeline = make_pipeline(StandardScaler(), MultiOutputClassifier(DecisionTreeClassifier()))
param_dist = {
"multioutputclassifier__estimator__max_depth": [5, 6, 7, 8, 9, 10, 15, 30, None],
"multioutputclassifier__estimator__min_samples_leaf": np.arange(1, 10)
}
# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=5)
# Fit grid_search_cv using the data X and labels y.
grid_search_cv.fit(X_train, y_train)
y_pred = grid_search_cv.predict(X_test)
# Print the best score
print("Tuned Model Parameters: {}".format(grid_search_cv.best_params_))
print("Best score is {}".format(grid_search_cv.best_estimator_.score(X_test, y_test)))
Tuned Model Parameters: {'multioutputclassifier__estimator__max_depth': 30, 'multioutputclassifier__estimator__min_samples_leaf': 1} Best score is 0.4049586776859504
Random Forest¶
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
base_classifier = RandomForestClassifier()
#base_classifier.fit(X_train, y_train)
multi_output_classifier = MultiOutputClassifier(base_classifier)
multi_output_classifier.fit(X_train, y_train)
# Step 4: Model Evaluation
accuracy_list=[]
y_pred = multi_output_classifier.predict(X_test)
print("score: ", multi_output_classifier.score(X_test, y_test))
y_test = np.asarray(y_test)
y_pred = np.asarray(y_pred)
for i in range(2):
accuracy = accuracy_score(y_test[:, i], y_pred[:, i])
print("Accuracy type ", i+1, ": ", accuracy )
accuracy_list.append(accuracy)
print("Averaged Accuracy for types: ",np.mean(accuracy_list))
score: 0.5619834710743802 Accuracy type 1 : 0.7933884297520661 Accuracy type 2 : 0.6818181818181818 Averaged Accuracy for types: 0.7376033057851239
Hyperparameter Tuning¶
pipeline = make_pipeline(StandardScaler(), MultiOutputClassifier(RandomForestClassifier()))
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
"multioutputclassifier__estimator__max_depth": [5, 10, 15, 30, None],
"multioutputclassifier__estimator__min_samples_leaf": np.arange(1, 10, 2),
"multioutputclassifier__estimator__n_estimators": np.arange(60, 140, 8)
}
# Instantiate the RandomizedSearchCV object: random_grid_search_cv
random_search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=3, random_state=42)
#grid_search_cv = GridSearchCV(pipeline, param_grid=param_dist, cv=3)
# Fit random_search_cv using the data X and labels y
random_search_cv.fit(X_train, y_train)
#grid_search_cv.fit(X_train, y_train)
# Print the best score
print("Best score is {}".format(random_search_cv.best_estimator_.score(X_test, y_test)))
print("Best parameters are {}".format(random_search_cv.best_params_))
Best score is 0.5578512396694215 Best parameters are {'multioutputclassifier__estimator__max_depth': None, 'multioutputclassifier__estimator__min_samples_leaf': 1, 'multioutputclassifier__estimator__n_estimators': 116}
KNeighborsClassifier¶
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
base_classifier = KNeighborsClassifier()
multi_output_classifier = MultiOutputClassifier(base_classifier)
multi_output_classifier.fit(X_train, y_train)
# Step 4: Model Evaluation
y_pred = multi_output_classifier.predict(X_test)
print("score: ", multi_output_classifier.score(X_test, y_test))
y_test = np.asarray(y_test)
y_pred = np.asarray(y_pred)
accuracy_list=[]
for i in range(2):
accuracy = accuracy_score(y_test[:, i], y_pred[:, i])
print("Accuracy type ", i+1, ": ", accuracy )
accuracy_list.append(accuracy)
print("Averaged Accuracy for types: ",np.mean(accuracy_list))
score: 0.08264462809917356 Accuracy type 1 : 0.24793388429752067 Accuracy type 2 : 0.35537190082644626 Averaged Accuracy for types: 0.30165289256198347
Hyperparameter Tuning¶
param_grid = {
'multioutputclassifier__estimator__n_neighbors': [3, 5, 7, 9] # List of k values to try
}
pipeline = make_pipeline(StandardScaler(), MultiOutputClassifier(KNeighborsClassifier()))
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score=grid_search.best_estimator_.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Best Score:", best_score)
Best Parameters: {'multioutputclassifier__estimator__n_neighbors': 3} Best Score: 0.2892561983471074
We can clearly state that this extended dataset performs much better on most of the models, especially the Decision Tree for multilabel classification in Ignoring Order of Types 2.