Supervised Learning with scikit-learn
Original Source: https://www.coursera.org/learn/machine-learning
Before We Start
We will use and apply the following scikit-learn tools.
Datasets
- breast cancer: binary classification dataset bundled with sklearn; the task is to predict whether a patient's tumor is benign or malignant
- boston: regression dataset bundled with sklearn; the task is to predict house prices in Boston (note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the code here assumes an older release)
Model Selection
- train test split
- cross validate (k-fold)
Feature Preprocessing
- standard scaler
- polynomial features
Algorithms
- k nearest neighbors
- linear, logistic regression
- ridge regression
- lasso regression
- support vector machine
- decision tree
- random forest
- gradient boosting
- neural network
Evaluation Metrics (see the sketch after this list)
- accuracy
- r-squared score
- precision
- recall
- ROC AUC
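As a quick reference, here is a minimal sketch of how the classification metrics above are computed with sklearn.metrics, using a made-up toy prediction rather than data from this notebook:

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# toy ground truth, hard predictions, and class-1 probabilities (illustrative only)
y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
y_prob = [0.1, 0.6, 0.8, 0.9, 0.4]

print(accuracy_score(y_true, y_pred))   # correct predictions / all predictions = 3/5
print(precision_score(y_true, y_pred))  # TP / (TP + FP) = 2/3
print(recall_score(y_true, y_pred))     # TP / (TP + FN) = 2/3
print(roc_auc_score(y_true, y_prob))    # how well the probabilities rank positives above negatives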
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer, load_boston
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
np.set_printoptions(precision=2)
np.random.seed(0)
Function to evaluate different models
def evaluate_model(model, scaling, poly_degree=None):
    # optionally expand the features with polynomial terms,
    # fitting the transformer on the training set only
    if poly_degree:
        poly = PolynomialFeatures(degree=poly_degree)
        X_train_prep = poly.fit_transform(X_train)
        X_test_prep = poly.transform(X_test)
    else:
        X_train_prep, X_test_prep = X_train, X_test
    if scaling:
        # scale both sets with a scaler fitted on the training set only,
        # so no test-set statistics leak into training
        scaler = StandardScaler()
        X_train_prep = scaler.fit_transform(X_train_prep)
        X_test_prep = scaler.transform(X_test_prep)
    # fit the model and report train/test scores
    # (accuracy for classifiers, R-squared for regressors)
    model.fit(X_train_prep, y_train)
    print('Train Score: ' + str(model.score(X_train_prep, y_train)))
    print('Test Score:' + str(model.score(X_test_prep, y_test)))
    print('')
    return model
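Note that evaluate_model reads X_train, X_test, y_train, and y_test from the surrounding scope, so it must be called after a train/test split has been made. Fitting the scaler (and the polynomial transformer) on the training set only is deliberate: statistics from the test set must not leak into training.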
Dataset for Classification: Breast Cancer
data = load_breast_cancer()
X_cancer = data['data']
y_cancer = data['target']
target_names = data['target_names']
feature_names = data['feature_names']
print(target_names, feature_names)
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=1)
['malignant' 'benign'] ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
'mean smoothness' 'mean compactness' 'mean concavity'
'mean concave points' 'mean symmetry' 'mean fractal dimension'
'radius error' 'texture error' 'perimeter error' 'area error'
'smoothness error' 'compactness error' 'concavity error'
'concave points error' 'symmetry error' 'fractal dimension error'
'worst radius' 'worst texture' 'worst perimeter' 'worst area'
'worst smoothness' 'worst compactness' 'worst concavity'
'worst concave points' 'worst symmetry' 'worst fractal dimension']
k-NN Classifier
# n_neighbors = 5 (default)
model = KNeighborsClassifier()
trained_model = evaluate_model(model, scaling=True)
# n_neighbors = 10
model = KNeighborsClassifier(n_neighbors=10)
trained_model = evaluate_model(model, scaling=True)
# n_neighbors = 1
model = KNeighborsClassifier(n_neighbors=1)
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9812206572769953
Test Score:0.951048951048951
Train Score: 0.9788732394366197
Test Score:0.958041958041958
Train Score: 1.0
Test Score:0.958041958041958
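The three runs above try n_neighbors by hand on a single split. A small optional sketch (not in the original notebook) that compares values of k with 5-fold cross-validation on the training set; for simplicity the scaler is fit once on all of X_train, which leaks slightly across the folds:

# compare k values by mean cross-validated accuracy
scaled = StandardScaler().fit_transform(X_train)
for k in [1, 3, 5, 10, 15]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), scaled, y_train, cv=5)
    print(k, scores.mean())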
Logistic Regression
# C=1 (default)
model = LogisticRegression()
trained_model = evaluate_model(model, scaling=False)
# C=10
model = LogisticRegression(C=10)
trained_model = evaluate_model(model, scaling=False)
print('Test precision: ' + str(precision_score(y_test, trained_model.predict(X_test))))
print('Test recall: ' + str(recall_score(y_test, trained_model.predict(X_test))))
print('Test f1 score: ' + str(f1_score(y_test, trained_model.predict(X_test))))
print('Test ROC AUC score: ' + str(roc_auc_score(y_test, trained_model.predict_proba(X_test)[:, 1])) + '\n')
# C=0.1
model = LogisticRegression(C=0.1)
trained_model = evaluate_model(model, scaling=False)
Train Score: 0.9577464788732394
Test Score:0.951048951048951
Train Score: 0.9671361502347418
Test Score:0.965034965034965
Test precision: 0.9662921348314607
Test recall: 0.9772727272727273
Test f1 score: 0.9717514124293786
Test ROC AUC score: 0.9915289256198347
Train Score: 0.9460093896713615
Test Score:0.951048951048951
Linear Support Vector Classifier
# C=1 (default)
model = SVC(kernel='linear')
trained_model = evaluate_model(model, scaling=True)
# C=10
model = SVC(kernel='linear', C=10)
trained_model = evaluate_model(model, scaling=True)
# C=0.1
model = SVC(kernel='linear', C=0.1)
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9882629107981221
Test Score:0.965034965034965
Train Score: 0.9929577464788732
Test Score:0.958041958041958
Train Score: 0.9882629107981221
Test Score:0.958041958041958
Kernelized Support Vector Classifier
# C=1 (default)
model = SVC()
trained_model = evaluate_model(model, scaling=True)
# C=10
model = SVC(C=10)
trained_model = evaluate_model(model, scaling=True)
# C=0.1
model = SVC(C=0.1)
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9882629107981221
Test Score:0.965034965034965
Train Score: 0.9953051643192489
Test Score:0.958041958041958
Train Score: 0.9530516431924883
Test Score:0.951048951048951
Decision Tree
# max_depth=None (default)
model = DecisionTreeClassifier()
trained_model = evaluate_model(model, scaling=False)
print('Feature Importances')
print(list(zip(feature_names, trained_model.feature_importances_)))
print('')
# max_depth=3
model = DecisionTreeClassifier(max_depth=3)
trained_model = evaluate_model(model, scaling=False)
print(classification_report(y_test, trained_model.predict(X_test)))
Train Score: 1.0
Test Score:0.9440559440559441
Feature Importances
[('mean radius', 0.0), ('mean texture', 0.022007779353930466), ('mean perimeter', 0.0), ('mean area', 0.0), ('mean smoothness', 0.010012730496503279), ('mean compactness', 0.0), ('mean concavity', 0.006724599247034309), ('mean concave points', 0.021009493579550387), ('mean symmetry', 0.0), ('mean fractal dimension', 0.0), ('radius error', 0.0033206182232256025), ('texture error', 0.0), ('perimeter error', 0.0), ('area error', 0.0), ('smoothness error', 0.0), ('compactness error', 0.036312835933985266), ('concavity error', 0.0), ('concave points error', 0.0), ('symmetry error', 0.0), ('fractal dimension error', 0.0), ('worst radius', 0.0), ('worst texture', 0.08065645725141522), ('worst perimeter', 0.7505955544414274), ('worst area', 0.0), ('worst smoothness', 0.0), ('worst compactness', 0.0), ('worst concavity', 0.022191177515213226), ('worst concave points', 0.047168753957714735), ('worst symmetry', 0.0), ('worst fractal dimension', 0.0)]
Train Score: 0.971830985915493
Test Score:0.8951048951048951
             precision    recall  f1-score   support

          0       0.87      0.85      0.86        55
          1       0.91      0.92      0.92        88

avg / total       0.89      0.90      0.89       143
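The raw importance list printed above is hard to scan. An optional sketch (not in the original notebook) that keeps only the features the fitted tree actually used, sorted by importance:

# print non-zero feature importances in descending order
for name, imp in sorted(zip(feature_names, trained_model.feature_importances_), key=lambda e: -e[1]):
    if imp > 0:
        print('{}: {:.3f}'.format(name, imp))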
Random Forest Classifier
model = RandomForestClassifier()
trained_model = evaluate_model(model, scaling=False)
model = RandomForestClassifier(max_depth=3)
trained_model = evaluate_model(model, scaling=False)
model = RandomForestClassifier(max_depth=10)
trained_model = evaluate_model(model, scaling=False)
Train Score: 1.0
Test Score:0.9370629370629371
Train Score: 0.9788732394366197
Test Score:0.951048951048951
Train Score: 0.9953051643192489
Test Score:0.9300699300699301
Gradient Boosted Decision Tree Classifier
model = GradientBoostingClassifier()
trained_model = evaluate_model(model, scaling=False)
model = GradientBoostingClassifier(n_estimators=5)
trained_model = evaluate_model(model, scaling=False)
model = GradientBoostingClassifier(learning_rate=0.001)
trained_model = evaluate_model(model, scaling=False)
Train Score: 1.0
Test Score:0.965034965034965
Train Score: 0.9647887323943662
Test Score:0.916083916083916
Train Score: 0.6314553990610329
Test Score:0.6153846153846154
Neural Network
model = MLPClassifier(max_iter=10000)
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9953051643192489
Test Score:0.972027972027972
Dataset for Regression: Boston
boston = load_boston()
X_boston = boston['data']
y_boston = boston['target']
feature_names = boston['feature_names']
X_train, X_test, y_train, y_test = train_test_split(X_boston, y_boston, random_state=1)
k-NN Regression
model = KNeighborsRegressor()
trained_model = evaluate_model(model, scaling=False)
model = KNeighborsRegressor(n_neighbors=4)
trained_model = evaluate_model(model, scaling=False)
Train Score: 0.6667843569655112
Test Score:0.5281871748119744
Train Score: 0.7169330804027261
Test Score:0.5847379837420476
Linear Regression
model = LinearRegression()
trained_model = evaluate_model(model, scaling=False)
model = Ridge()
trained_model = evaluate_model(model, scaling=True)
model = Ridge(alpha=20)
trained_model = evaluate_model(model, scaling=True)
model = Lasso()
trained_model = evaluate_model(model, scaling=True)
print('Lasso coefficients (sorted by absolute magnitude):')
for name, coef in sorted(zip(feature_names, trained_model.coef_), key=lambda e: -abs(e[1])):
    print('\t{}, {:.3f}'.format(name, coef))
Train Score: 0.7167286808673382
Test Score:0.7790257749137307
Train Score: 0.7167018610210802
Test Score:0.7790559050301669
Train Score: 0.7120104783011092
Test Score:0.7743235092832248
Train Score: 0.6350155488350683
Test Score:0.669415418149101
Lasso coefficients (sorted by absolute magnitude):
LSTAT, -3.882
RM, 1.981
PTRATIO, -1.353
CRIM, -0.000
ZN, 0.000
INDUS, -0.000
CHAS, 0.000
NOX, -0.000
AGE, -0.000
DIS, -0.000
RAD, -0.000
TAX, -0.000
B, 0.000
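Lasso minimizes the squared error plus alpha times the sum of absolute coefficients, and this L1 penalty drives many coefficients exactly to zero, which is why only a few features retain non-negligible weights above at the default alpha=1. A quick sketch (an addition to the notebook) of how sparsity varies with alpha:

# count surviving coefficients for a range of regularization strengths
for alpha in [0.01, 0.1, 1, 10]:
    lasso = Lasso(alpha=alpha).fit(StandardScaler().fit_transform(X_train), y_train)
    print(alpha, (lasso.coef_ != 0).sum(), 'non-zero coefficients')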
Polynomial Regression
model = LinearRegression()
trained_model = evaluate_model(model, scaling=False)
model = LinearRegression()
trained_model = evaluate_model(model, scaling=False, poly_degree=2)
model = Ridge()
trained_model = evaluate_model(model, scaling=True, poly_degree=2)
model = Ridge()
trained_model = evaluate_model(model, scaling=True, poly_degree=3)
Train Score: 0.7167286808673382
Test Score:0.7790257749137307
Train Score: 0.7389256270176032
Test Score:0.5867916321234202
Train Score: 0.9014299197022099
Test Score:0.9191961799874806
Train Score: 0.9490398580910372
Test Score:0.9321929473192055
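The same preprocessing chain can be expressed with a Pipeline, which is not used in this notebook but bundles the fit/transform bookkeeping into a single estimator; a minimal sketch:

from sklearn.pipeline import make_pipeline

# polynomial expansion -> standardization -> ridge, each step fit on training data only
pipe = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), Ridge())
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))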
Support Vector Regression
model = SVR()
trained_model = evaluate_model(model, scaling=True)
model = SVR(C=40)
trained_model = evaluate_model(model, scaling=True)
model = SVR(kernel='linear')
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.6474681713166364
Test Score:0.6629378122683037
Train Score: 0.9298640839552765
Test Score:0.9269844037332291
Train Score: 0.6812467645514385
Test Score:0.7740854988602284
Neural Network
model = MLPRegressor(max_iter=10000)
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9360686347144158
Test Score:0.9165525614065948
Cross-validation
model = LinearRegression()
result = pd.DataFrame(cross_validate(model, X_boston, y_boston, scoring='r2', cv=10, return_train_score=True))
result
|   | fit_time | score_time | test_score | train_score |
|---|----------|------------|------------|-------------|
| 0 | 0.001995 | 0.000000   | 0.733349   | 0.738790    |
| 1 | 0.000997 | 0.000000   | 0.472298   | 0.747077    |
| 2 | 0.000000 | 0.000000   | -1.010977  | 0.743820    |
| 3 | 0.000999 | 0.000000   | 0.641263   | 0.717158    |
| 4 | 0.004989 | 0.000000   | 0.547098   | 0.741726    |
| 5 | 0.002018 | 0.000976   | 0.736102   | 0.704297    |
| 6 | 0.000997 | 0.000000   | 0.377618   | 0.746045    |
| 7 | 0.000000 | 0.000000   | -0.130269  | 0.838793    |
| 8 | 0.000000 | 0.000000   | -0.783723  | 0.736517    |
| 9 | 0.000000 | 0.000997   | 0.418618   | 0.742091    |
result.mean(axis=0)
fit_time 0.001199
score_time 0.000197
test_score 0.200138
train_score 0.745632
dtype: float64
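Several folds above score negative R-squared. With an integer cv and a regression target, cross_validate splits with an unshuffled KFold, and the Boston rows are not in random order, so some folds end up unrepresentative. A sketch (an addition to the notebook) using a shuffled KFold:

from sklearn.model_selection import KFold

# shuffle the rows before splitting into 10 folds
cv = KFold(n_splits=10, shuffle=True, random_state=0)
result = pd.DataFrame(cross_validate(LinearRegression(), X_boston, y_boston,
                                     scoring='r2', cv=cv, return_train_score=True))
print(result['test_score'].mean())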
model = LogisticRegression()
result = pd.DataFrame(cross_validate(model, X_cancer, y_cancer, scoring='accuracy', cv=10, return_train_score=True))
result.mean(axis=0)
fit_time 0.008273
score_time 0.000399
test_score 0.950900
train_score 0.958212
dtype: float64
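cross_val_score, imported at the top but otherwise unused, is a lighter-weight alternative when only the per-fold test scores are needed; a minimal sketch:

# an array of 10 test-fold accuracies
scores = cross_val_score(LogisticRegression(), X_cancer, y_cancer, scoring='accuracy', cv=10)
print(scores.mean())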