Copy of TP3.ipynb - Colab

1/11/25, 6:15 PM Copy of TP3.
ipynb - Colab
Classification Metrics # Classification Metrics and

keyboard_arrow_down Pipelines and GridSearchCV
In this notebook we will first evaluate 2 different classification models on the data using
scikit-learn (AKA sklearn).
Secondly, we will build the best model possible using scikit-learn.
This is a binary classification problem because there are 2 possible classes!
Commencez à coder ou à générer avec l'IA.
from google.colab import drive

drive.mount('/content/drive')
Mounted at /content/drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn import set_config
set_config(display='diagram')
path='/content/drive/MyDrive/datasets/Belt2_A_drugtype_v2_final.csv'
df = pd.read_csv(path)
df.head()
Age Gender BP Cholesterol Na_to_K Drug
0 59.0 M High HIGH 13.935 drugQ
1 40.0 F Normal HIGH 10.103 drugZ
2 NaN M Normal HIGH 9.084 drugZ
3 62.0 M Normal HIGH 16.594 drugZ
4 55.0 F High NORMAL 10.977 drugQ
https://colab.research.google.com/drive/1dyIJhK-MOTsYG7A9Iva35jutRGlQmRUZ?usp=classroom_web#printMode=true 1/17
1/11/25, 6:15 PM Copy of TP3.ipynb - Colab
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296 entries, 0 to 295
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 237 non-null float64
1 Gender 296 non-null object
2 BP 221 non-null object
3 Cholesterol 296 non-null object
4 Na_to_K 296 non-null object
5 Drug 296 non-null object
dtypes: float64(1), object(5)
memory usage: 14.0+ KB
# Check for duplicates and missing values.
# df.duplicated() flags rows that exactly repeat an earlier row; summing the
# boolean mask gives the number of such rows.
print('Duplicated rows:', df.duplicated().sum())

# Per-column count of missing (NaN) entries.
print('Missing data:\n', df.isna().sum())
Missing data:
Age 59
Gender 0
BP 75
Cholesterol 0
Na_to_K 0
Drug 0
dtype: int64
import matplotlib.pyplot as plt

# creating a histogram
plt.hist(df['Age'],bins = 70)
plt.show()
# Age is numeric, so its missing values are imputed with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: the chained in-place form operates on
# an intermediate object, raises a FutureWarning and stops working in pandas 3.0.
# NOTE(review): this median is computed on the full dataset before the
# train/test split, so test rows influence the imputed value; the pipeline's
# SimpleImputer(strategy='median') would avoid that -- confirm which is intended.
df['Age'] = df['Age'].fillna(df['Age'].median())
<ipython-input-9-677a6229886c>:2: FutureWarning: A value is trying to be set on a copy o

The behavior will change in pandas 3.0. This inplace method will never work because the
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col
df['Age'].fillna(df['Age'].median(), inplace = True)
# Per the author's inspection of the full dataset, the rows appear to be
# ordered by BP, so a missing BP value is filled from the preceding row.
# Assign the result back rather than using ffill(inplace=True) on df['BP'],
# which acts on an intermediate object and will not work in pandas 3.0.
df['BP'] = df['BP'].ffill()
<ipython-input-10-a9453bca72fe>:2: FutureWarning: A value is trying to be set on a copy

df['BP'].ffill(inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296 entries, 0 to 295
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 296 non-null float64
1 Gender 296 non-null object
2 BP 296 non-null object
3 Cholesterol 296 non-null object
4 Na_to_K 296 non-null object
5 Drug 296 non-null object
dtypes: float64(1), object(5)
memory usage: 14.0+ KB
df['Gender'].value_counts()
count
Gender
M 149
F 137
male 4
female 2
Female 1
Male 1
femal 1
Femal 1
dtype: int64
# Collapse every spelling variant seen in value_counts() onto the two
# canonical codes 'M' and 'F' with a single mapping-based replace.
df['Gender'] = df['Gender'].replace({
    'male': 'M',
    'Male': 'M',
    'Female': 'F',
    'female': 'F',
    'Femal': 'F',
    'femal': 'F',
})
df['Gender'].value_counts()
count
Gender
M 154
F 142
dtype: int64
df['BP'].value_counts()
count
BP
High 194
Normal 60
Low 42
dtype: int64
df['Cholesterol'].value_counts()
count
Cholesterol
HIGH 156
NORMAL 117
norm 9
high 8
NORM 6
dtype: int64
# Map the inconsistent cholesterol labels onto the canonical 'HIGH' and
# 'NORMAL' values in one mapping-based replace.
df['Cholesterol'] = df['Cholesterol'].replace({
    'high': 'HIGH',
    'NORM': 'NORMAL',
    'norm': 'NORMAL',
})
df['Cholesterol'].value_counts()
count
Cholesterol
HIGH 164
NORMAL 132
dtype: int64
df['Drug'].value_counts()
count
Drug
drugQ 148
drugZ 148
dtype: int64
# Na_to_K is read as text: strip leading/trailing underscores from each
# entry, then cast the cleaned strings to float.
df['Na_to_K'] = df['Na_to_K'].str.strip('_').astype(float)
df.describe()
Age Na_to_K
count 296.000000 296.000000
mean 47.020270 14.709355
std 33.735455 6.364668
min 15.000000 6.269000
25% 36.000000 10.445250
50% 45.000000 12.856000
75% 56.000000 16.732000
max 570.000000 38.247000
# describe() shows a maximum Age of 570; that entry is replaced with 57.
# Assign the result back instead of replace(..., inplace=True) on df['Age'],
# which acts on an intermediate object and will not work in pandas 3.0.
df['Age'] = df['Age'].replace({570: 57})
df.describe()
<ipython-input-20-cd34380d2017>:1: FutureWarning: A value is trying to be set on a copy

df['Age'].replace({570:57},inplace=True)
Age Na_to_K
count 296.000000 296.000000
mean 45.287162 14.709355
std 14.431203 6.364668
min 15.000000 6.269000
25% 36.000000 10.445250
50% 45.000000 12.856000
75% 56.000000 16.732000
max 74.000000 38.247000
df['Na_to_K'].hist()
<Axes: >
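The distribution is skewed to the right, which means that the tail of the distribution is longer on the
right side. In a right-skewed distribution the mean is typically greater than the median and the mode;
here the mean (14.71) is indeed greater than the median (12.86) reported by df.describe().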
import seaborn as sns

dims = (8, 8)
fig, ax = plt.subplots(figsize=dims)
sns.boxplot(data=df, x='Gender', y='Age',ax=ax)
<Axes: xlabel='Gender', ylabel='Age'>
The boxplot suggests that females tend to be younger than males in this data.
# Separate the label column from the features. Both pieces are explicit
# copies, so later edits to df do not propagate into X or y.
target = 'Drug'
y = df[target].copy()
X = df.drop(columns=target).copy()
X.head()
Age Gender BP Cholesterol Na_to_K
0 59.0 M High HIGH 13.935
1 40.0 F Normal HIGH 10.103
2 45.0 M Normal HIGH 9.084
3 62.0 M Normal HIGH 16.594
4 55.0 F High NORMAL 10.977
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
# Integer-encode BP and Cholesterol on df. The results are assigned back to
# the columns instead of calling replace(..., inplace=True) on df[col]: the
# chained in-place form raises a FutureWarning and will not work in pandas 3.0.
# NOTE(review): X and y were copied from df before this point, so these
# encodings do not reach X_train / X_test; the modelling pipeline one-hot
# encodes the original string columns instead. Confirm whether this cell is
# still needed.
df['BP'] = df['BP'].replace({'Low': 0, 'Normal': 1, 'High': 2})

df['Cholesterol'] = df['Cholesterol'].replace({'NORMAL': 0, 'HIGH': 1})
<ipython-input-25-f6b2f86520ea>:1: FutureWarning: A value is trying to be set on a copy

df['BP'].replace({'Low':0, 'Normal':1, 'High':2}, inplace = True) # It is a numinal co

<ipython-input-25-f6b2f86520ea>:1: FutureWarning: Downcasting behavior in `replace` is d
df['BP'].replace({'Low':0, 'Normal':1, 'High':2}, inplace = True) # It is a numinal co
<ipython-input-25-f6b2f86520ea>:2: FutureWarning: A value is trying to be set on a copy
df['Cholesterol'].replace({'NORMAL':0, 'HIGH':1}, inplace = True) # It is a numinal co

<ipython-input-25-f6b2f86520ea>:2: FutureWarning: Downcasting behavior in `replace` is d
df['Cholesterol'].replace({'NORMAL':0, 'HIGH':1}, inplace = True) # It is a numinal co
# --- Numeric pathway -------------------------------------------------------
# Select numeric-dtype columns, impute missing values with the median, then
# standardise with StandardScaler.
num_feature = make_column_selector(dtype_include='number')
impute_num = SimpleImputer(strategy='median')
scaler = StandardScaler()
pip_num = make_pipeline(impute_num, scaler)
numerical_tup = ('Numerical', pip_num, num_feature)

# --- Categorical pathway ---------------------------------------------------
# Select object-dtype columns, impute missing values with the most frequent
# category (the exploratory cells forward-filled BP instead), then one-hot
# encode. Categories unseen during fit are ignored at transform time.
cat_feature = make_column_selector(dtype_include='object')
impute_cat = SimpleImputer(strategy='most_frequent')
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
pip_cat = make_pipeline(impute_cat, ohe_encoder)
categorical_tup = ('Categorical', pip_cat, cat_feature)
# Instantiating the ColumnTransformer
col_transformer = ColumnTransformer([numerical_tup, categorical_tup], verbose_feature_names_

col_transformer
▸ ColumnTransformer i ?
▸ Numerical ▸ Categorical
▸ SimpleImputer ? ▸ SimpleImputer ?
▸ StandardScaler ? ▸ OneHotEncoder ?
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, Confusi
# Decision-tree baseline: preprocessing (col_transformer) followed by the
# classifier. The tree is seeded so that repeated runs give the same model
# and metrics; the seed matches the one used for train_test_split.
DTC = DecisionTreeClassifier(random_state=42)
DTC_pipe = make_pipeline(col_transformer, DTC)
DTC_pipe.fit(X_train, y_train)
▸ Pipeline i ?
▸ columntransformer: ColumnTransformer ?
▸ DecisionTreeClassifier ?
knn = KNeighborsClassifier()
knn_pipe = make_pipeline(col_transformer, knn)
knn_pipe.fit(X_train, y_train)
▸ Pipeline i ?
▸ KNeighborsClassifier ?
y_pred_train_DTC = DTC_pipe.predict(X_train)
y_pred_test_DTC = DTC_pipe.predict(X_test)
y_pred_train_knn = knn_pipe.predict(X_train)
y_pred_test_knn = knn_pipe.predict(X_test)
print('DTC Train Metrics')

print(classification_report(y_train, y_pred_train_DTC))
print('\nDTC Test Metrics')
print(classification_report(y_test, y_pred_test_DTC))
DTC Train Metrics

precision recall f1-score support
drugQ 1.00 1.00 1.00 114

drugZ 1.00 1.00 1.00 108
accuracy 1.00 222

macro avg 1.00 1.00 1.00 222
weighted avg 1.00 1.00 1.00 222
DTC Test Metrics

drugQ 0.68 0.62 0.65 34

drugZ 0.70 0.75 0.72 40
accuracy 0.69 74
macro avg 0.69 0.68 0.68 74
weighted avg 0.69 0.69 0.69 74
print('knn Train Metrics')

print(classification_report(y_train, y_pred_train_knn))
print('\nknn Test Metrics')
print(classification_report(y_test, y_pred_test_knn))
knn Train Metrics

drugQ 0.94 1.00 0.97 114

drugZ 1.00 0.94 0.97 108
accuracy 0.97 222

macro avg 0.97 0.97 0.97 222
weighted avg 0.97 0.97 0.97 222
knn Test Metrics

drugQ 0.57 0.68 0.62 34

drugZ 0.68 0.57 0.62 40
accuracy 0.62 74
macro avg 0.63 0.63 0.62 74
weighted avg 0.63 0.62 0.62 74
DTC_pipe.get_params()
{'memory': None,
'steps': [('columntransformer',
ColumnTransformer(transformers=[('Numerical',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='median')),
('standardscaler',
StandardScaler())]),
<sklearn.compose._column_transformer.make_column_selector object at
0x7caf7a3b3fd0>),
('Categorical',
SimpleImputer(strategy='most_frequent')),
('onehotencoder',
OneHotEncoder(handle_unknown='ignore',
sparse_output=False))]),
0x7caf7cf09030>)],
verbose_feature_names_out=False)),
('decisiontreeclassifier', DecisionTreeClassifier())],
'verbose': False,
'columntransformer': ColumnTransformer(transformers=[('Numerical',
SimpleImputer(strategy='median')),
('standardscaler',
StandardScaler())]),
0x7caf7a3b3fd0>),
('Categorical',
('onehotencoder',
OneHotEncoder(handle_unknown='ignore',
sparse_output=False))]),
0x7caf7cf09030>)],
verbose_feature_names_out=False),
'decisiontreeclassifier': DecisionTreeClassifier(),
'columntransformer__force_int_remainder_cols': True,
'columntransformer__n_jobs': None,
'columntransformer__remainder': 'drop',
'columntransformer__sparse_threshold': 0.3,
'columntransformer__transformer_weights': None,
'columntransformer__transformers': [('Numerical',
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
('standardscaler', StandardScaler())]),
<sklearn.compose. column transformer.make column selector at 0x7caf7a3b3fd0>),
DTC_params = {'decisiontreeclassifier__max_depth' : range(1,50),'decisiontreeclassifier__cri
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

DTC_grid = GridSearchCV(DTC_pipe, DTC_params)
DTC_grid.fit(X_train, y_train)
▸ GridSearchCV i ?
▸ best_estimator_: Pipeline
▸ DecisionTreeClassifier ?
knn_pipe.get_params()
1/11/25, 6:15 PM
__ g __ yCopy of TP3.ipynb
, - Colab
'columntransformer__Categorical__steps': [('simpleimputer',
('onehotencoder',
OneHotEncoder(handle_unknown='ignore', sparse_output=False))],
'columntransformer__Categorical__verbose': False,
'columntransformer__Categorical__simpleimputer':
SimpleImputer(strategy='most_frequent'),
'columntransformer__Categorical__onehotencoder':
OneHotEncoder(handle_unknown='ignore', sparse_output=False),
'columntransformer__Categorical__simpleimputer__add_indicator': False,
'columntransformer__Categorical__simpleimputer__copy': True,
'columntransformer__Categorical__simpleimputer__fill_value': None,
'columntransformer__Categorical__simpleimputer__keep_empty_features': False,
'columntransformer__Categorical__simpleimputer__missing_values': nan,
'columntransformer__Categorical__simpleimputer__strategy': 'most_frequent',
'columntransformer__Categorical__onehotencoder__categories': 'auto',
'columntransformer__Categorical__onehotencoder__drop': None,
'columntransformer__Categorical__onehotencoder__dtype': numpy.float64,
'columntransformer__Categorical__onehotencoder__feature_name_combiner': 'concat',
'columntransformer__Categorical__onehotencoder__handle_unknown': 'ignore',
'columntransformer__Categorical__onehotencoder__max_categories': None,
'columntransformer Categorical onehotencoder min frequency': None,
# Hyper-parameter grid for the KNN step of knn_pipe: neighbour count 1-19,
# both weighting schemes, and Minkowski powers 2, 3 and 4.
knn_params = {
    'kneighborsclassifier__n_neighbors': range(1, 20),
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [2, 3, 4],
}

# Exhaustive cross-validated search over the grid, fitted on the training split.
knn_grid = GridSearchCV(estimator=knn_pipe, param_grid=knn_params)
knn_grid.fit(X_train, y_train)
▸ GridSearchCV i ?
▸ best_estimator_: Pipeline
▸ KNeighborsClassifier ?
DTC_grid.best_params_
{'decisiontreeclassifier__criterion': 'gini',
'decisiontreeclassifier__max_depth': 2}
knn_grid.best_params_
{'kneighborsclassifier__n_neighbors': 1,
'kneighborsclassifier__p': 2,
'kneighborsclassifier__weights': 'uniform'}
best_DTC = DTC_grid.best_estimator_
best_knn = knn_grid.best_estimator_
y_pred_train_DTC = best_DTC.predict(X_train)
y_pred_test_DTC = best_DTC.predict(X_test)
y_pred_train_knn = best_knn.predict(X_train)
y_pred_test_knn = best_knn.predict(X_test)
print('DTC Train Metrics')

print(classification_report(y_train, y_pred_train_DTC))
print('\nDTC Test Metrics')
print(classification_report(y_test, y_pred_test_DTC))
DTC Train Metrics

drugQ 1.00 1.00 1.00 114

drugZ 1.00 1.00 1.00 108
accuracy 1.00 222

macro avg 1.00 1.00 1.00 222
weighted avg 1.00 1.00 1.00 222
DTC Test Metrics

drugQ 0.68 0.62 0.65 34

drugZ 0.70 0.75 0.72 40
accuracy 0.69 74
macro avg 0.69 0.68 0.68 74
weighted avg 0.69 0.69 0.69 74
print('knn Train Metrics')

print(classification_report(y_train, y_pred_train_knn))
print('\nknn Test Metrics')
print(classification_report(y_test, y_pred_test_knn))
knn Train Metrics

drugQ 1.00 1.00 1.00 114

drugZ 1.00 1.00 1.00 108
accuracy 1.00 222

macro avg 1.00 1.00 1.00 222
weighted avg 1.00 1.00 1.00 222
knn Test Metrics

drugQ 0.64 0.62 0.63 34

drugZ 0.68 0.70 0.69 40
accuracy 0.66 74
macro avg 0.66 0.66 0.66 74
weighted avg 0.66 0.66 0.66 74
ConfusionMatrixDisplay.from_estimator(best_DTC, X_test, y_test, cmap='Blues')
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7caf7d255270>

Copy of TP3.ipynb - Colab

Uploaded by

Copy of TP3.ipynb - Colab

Uploaded by

1/11/25, 6:15 PM Copy of TP3.

Classification Metrics # Classification Metrics and

This is a bi-class problem because there are 2 possible classes!

Commencez à coder ou à générer avec l'IA.

from google.colab import drive

Age Gender BP Cholesterol Na_to_K Drug

0 59.0 M High HIGH 13.935 drugQ

1 40.0 F Normal HIGH 10.103 drugZ

2 NaN M Normal HIGH 9.084 drugZ

3 62.0 M Normal HIGH 16.594 drugZ

4 55.0 F High NORMAL 10.977 drugQ

# Check for duplicates and missing values

import matplotlib.pyplot as plt

<ipython-input-9-677a6229886c>:2: FutureWarning: A value is trying to be set on a copy o

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col

df['Age'].fillna(df['Age'].median(), inplace = True)

<ipython-input-10-a9453bca72fe>:2: FutureWarning: A value is trying to be set on a copy

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col

count 296.000000 296.000000

mean 47.020270 14.709355

std 33.735455 6.364668

min 15.000000 6.269000

25% 36.000000 10.445250

50% 45.000000 12.856000

75% 56.000000 16.732000

max 570.000000 38.247000

<ipython-input-20-cd34380d2017>:1: FutureWarning: A value is trying to be set on a copy

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col

count 296.000000 296.000000

mean 45.287162 14.709355

std 14.431203 6.364668

min 15.000000 6.269000

25% 36.000000 10.445250

50% 45.000000 12.856000

75% 56.000000 16.732000

max 74.000000 38.247000

import seaborn as sns

<Axes: xlabel='Gender', ylabel='Age'>

Females are younger than Males in this data.

Age Gender BP Cholesterol Na_to_K

0 59.0 M High HIGH 13.935

1 40.0 F Normal HIGH 10.103

2 45.0 M Normal HIGH 9.084

3 62.0 M Normal HIGH 16.594

4 55.0 F High NORMAL 10.977

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

df['BP'].replace({'Low':0, 'Normal':1, 'High':2}, inplace = True) # It is a numinal column w

<ipython-input-25-f6b2f86520ea>:1: FutureWarning: A value is trying to be set on a copy

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col

df['BP'].replace({'Low':0, 'Normal':1, 'High':2}, inplace = True) # It is a numinal co

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col

df['Cholesterol'].replace({'NORMAL':0, 'HIGH':1}, inplace = True) # It is a numinal co

cat_feature = make_column_selector(dtype_include='object') # Creating a categorical data s

# Instantiating the Transformers

impute_cat = SimpleImputer(strategy='most_frequent') #better to keep the ffil but for simpli

# Making pipelines for each category

pip_cat = make_pipeline(impute_cat, ohe_encoder)

# Defining a tuple for each pathway

categorical_tup = ('Categorical', pip_cat, cat_feature)

# Instantiating the ColumnTransformer

col_transformer = ColumnTransformer([numerical_tup, categorical_tup], verbose_feature_names_

from sklearn.tree import DecisionTreeClassifier

DTC_pipe = make_pipeline(col_transformer, DTC)

knn_pipe = make_pipeline(col_transformer, knn)

print('DTC Train Metrics')

DTC Train Metrics

drugQ 1.00 1.00 1.00 114

accuracy 1.00 222

DTC Test Metrics

drugQ 0.68 0.62 0.65 34

print('knn Train Metrics')

knn Train Metrics

drugQ 0.94 1.00 0.97 114

accuracy 0.97 222

knn Test Metrics

drugQ 0.57 0.68 0.62 34