Robert Johansson
Source code listings for Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib (ISBN 979-8-8688-0412-0).
from sklearn import datasets
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib as mpl
mpl.rcParams['mathtext.fontset'] = 'stix'
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.sans-serif'] = 'stix'
sns.set(style="whitegrid")
sns.set(style="darkgrid")
datasets.load_wine #()
<function sklearn.datasets._base.load_wine(*, return_X_y=False, as_frame=False)>
datasets.fetch_california_housing
<function sklearn.datasets._california_housing.fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False)>
datasets.make_regression
<function sklearn.datasets._samples_generator.make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)>
np.random.seed(123)
X_all, y_all = datasets.make_regression(n_samples=50, n_features=50, n_informative=10) #, noise=2.5)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_all, y_all, train_size=0.5)
X_train.shape, y_train.shape
((25, 50), (25,))
X_test.shape, y_test.shape
((25, 50), (25,))
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
def sse(resid):
    return sum(resid**2)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train
5.8411047954908775e-25
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test
(a large value, on the order of 2e5 for this split: the OLS fit reproduces the 25 training samples essentially exactly, but generalizes poorly to the test set)
model.score(X_train, y_train)
1.0
model.score(X_test, y_test)
0.314074006752018
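An aside not in the original listing: the R-squared value reported by model.score can be reproduced directly from the residuals, which makes the connection between the SSE values above and the scores explicit.
# R^2 = 1 - SSE / SST, which is what model.score computes for regressors
sst_test = sum((y_test - y_test.mean())**2)          # total sum of squares of the test targets
1 - sse(y_test - model.predict(X_test)) / sst_test   # should match model.score(X_test, y_test)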
def plot_residuals_and_coeff(resid_train, resid_test, coeff):
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))
    axes[0].bar(np.arange(len(resid_train)), resid_train)
    axes[0].set_xlabel("sample number")
    axes[0].set_ylabel("residual")
    axes[0].set_title("training data")
    axes[1].bar(np.arange(len(resid_test)), resid_test)
    axes[1].set_xlabel("sample number")
    axes[1].set_ylabel("residual")
    axes[1].set_title("testing data")
    axes[2].bar(np.arange(len(coeff)), coeff)
    axes[2].set_xlabel("coefficient number")
    axes[2].set_ylabel("coefficient")
    fig.tight_layout()
    return fig, axes
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-ols.pdf")
model = linear_model.Ridge() #alpha=2.5)
model.fit(X_train, y_train)
Ridge()
resid_train = y_train - model.predict(X_train)
sse_train = sum(resid_train**2)
sse_train
178.50695164950955
resid_test = y_test - model.predict(X_test)
sse_test = sum(resid_test**2)
sse_test
212737.00160105838
model.score(X_train, y_train), model.score(X_test, y_test)
(0.9994595515017335, 0.3167033273607547)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-ridge.pdf")
model = linear_model.Lasso(alpha=1.0)
model.fit(X_train, y_train)
Lasso()
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train
309.7497138953243
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test
1489.1176065002646
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-lasso.pdf")
alphas = np.logspace(-4, 2, 100)
coeffs = np.zeros((len(alphas), X_train.shape[1]))
sse_train = np.zeros_like(alphas)
sse_test = np.zeros_like(alphas)
for n, alpha in enumerate(alphas):
    model = linear_model.Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    coeffs[n, :] = model.coef_
    resid = y_train - model.predict(X_train)
    sse_train[n] = sum(resid**2)
    resid = y_test - model.predict(X_test)
    sse_test[n] = sum(resid**2)
(Output condensed: for several of the alpha values in the scan, sklearn's coordinate-descent solver emits repeated ConvergenceWarning messages, "Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation.", with duality gaps slightly above the tolerance of about 3.3e+01.)
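A possible remedy for these warnings, not applied in the original run, is to give the coordinate-descent solver a larger iteration budget (or a looser tolerance) when scanning the smallest alpha values:
# hedged sketch of a variant of the loop body above:
# model = linear_model.Lasso(alpha=alpha, max_iter=100_000)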
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)
for n in range(coeffs.shape[1]):
    axes[0].plot(np.log10(alphas), coeffs[:, n], color='k', lw=0.5)
axes[1].semilogy(np.log10(alphas), sse_train, label="train")
axes[1].semilogy(np.log10(alphas), sse_test, label="test")
axes[1].legend(loc=0)
axes[0].set_xlabel(r"${\log_{10}}\alpha$", fontsize=18)
axes[0].set_ylabel(r"coefficients", fontsize=18)
axes[1].set_xlabel(r"${\log_{10}}\alpha$", fontsize=18)
axes[1].set_ylabel(r"sse", fontsize=18)
fig.tight_layout()
fig.savefig("ch15-regression-lasso-vs-alpha.pdf")
model = linear_model.LassoCV()
model.fit(X_all, y_all)
LassoCV()
model.alpha_
0.06559238747534717
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train
1.5450589323146602
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test
1.5321417406215792
model.score(X_train, y_train), model.score(X_test, y_test)
(0.9999953221722068, 0.9999950788657098)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-lasso-cv.pdf")
model = linear_model.ElasticNetCV()
model.fit(X_all, y_all)
ElasticNetCV()
model.alpha_
0.13118477495069433
model.l1_ratio
0.5
resid_train = y_train - model.predict(X_train)
sse_train = sum(resid_train**2)
sse_train
2183.83917293912
resid_test = y_test - model.predict(X_test)
sse_test = sum(resid_test**2)
sse_test
2650.0504463382536
model.score(X_train, y_train), model.score(X_test, y_test)
(0.9933881981034111, 0.9914882195448783)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-elastic-net-cv.pdf")
iris = datasets.load_iris()
type(iris)
sklearn.utils._bunch.Bunch
iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris.data.shape
(150, 4)
iris.target.shape
(150,)
# print(iris['DESCR'])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
iris.data, iris.target, train_size=0.7, random_state=0)
classifier = linear_model.LogisticRegression()
classifier.fit(X_train, y_train)
/Users/rob/miniconda3/envs/npbook_py310/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
LogisticRegression()
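The convergence warning above can usually be avoided by allowing more iterations or by standardizing the features before fitting; for example (an added suggestion, not part of the original run):
# classifier = linear_model.LogisticRegression(max_iter=1000)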
y_test_pred = classifier.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45
np.bincount(y_test)
array([16, 18, 11])
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
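The overall accuracy quoted in the classification report can be recovered from the confusion matrix as the fraction of samples on its diagonal (an added aside, not in the original listing):
cm = metrics.confusion_matrix(y_test, y_test_pred)
cm.diagonal().sum() / cm.sum()   # 44/45, approximately 0.98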
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
classifier = neighbors.KNeighborsClassifier()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
classifier = svm.SVC()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
classifier = ensemble.RandomForestClassifier()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
train_size_vec = np.linspace(0.1, 0.9, 30)
classifiers = [tree.DecisionTreeClassifier,
neighbors.KNeighborsClassifier,
svm.SVC,
ensemble.RandomForestClassifier
]
cm_diags = np.zeros((3, len(train_size_vec), len(classifiers)), dtype=float)
for n, train_size in enumerate(train_size_vec):
    X_train, X_test, y_train, y_test = \
        model_selection.train_test_split(iris.data, iris.target, train_size=train_size)
    for m, Classifier in enumerate(classifiers):
        classifier = Classifier()
        classifier.fit(X_train, y_train)
        y_test_pred = classifier.predict(X_test)
        cm_diags[:, n, m] = metrics.confusion_matrix(y_test, y_test_pred).diagonal()
        cm_diags[:, n, m] /= np.bincount(y_test)
fig, axes = plt.subplots(1, len(classifiers), figsize=(12, 3))
for m, Classifier in enumerate(classifiers):
    axes[m].plot(train_size_vec, cm_diags[2, :, m], label=iris.target_names[2])
    axes[m].plot(train_size_vec, cm_diags[1, :, m], label=iris.target_names[1])
    axes[m].plot(train_size_vec, cm_diags[0, :, m], label=iris.target_names[0])
    axes[m].set_title(type(Classifier()).__name__)
    axes[m].set_ylim(0, 1.1)
    axes[m].set_xlim(0.1, 0.9)
    axes[m].set_ylabel("classification accuracy")
    axes[m].set_xlabel("training size ratio")
    axes[m].legend(loc=4)
fig.tight_layout()
fig.savefig("ch15-classification-comparison.pdf")
X, y = iris.data, iris.target
np.random.seed(123)
n_clusters = 3
c = cluster.KMeans(n_clusters=n_clusters)
c.fit(X)
/Users/rob/miniconda3/envs/npbook_py310/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans(n_clusters=3)
y_pred = c.predict(X)
y_pred[::8]
array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0],
dtype=int32)
y[::8]
array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
idx_0, idx_1, idx_2 = (np.where(y_pred == n) for n in range(3))
y_pred[idx_0], y_pred[idx_1], y_pred[idx_2] = 2, 0, 1
y_pred[::8]
array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
dtype=int32)
metrics.confusion_matrix(y, y_pred)
array([[50, 0, 0],
[ 0, 48, 2],
[ 0, 14, 36]])
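Because K-means cluster labels are arbitrary, a permutation-invariant way to quantify agreement with the true classes, instead of relabeling by hand as above, is the adjusted Rand index (an added aside, not part of the original listing):
metrics.adjusted_rand_score(y, c.labels_)  # 1.0 would mean perfect agreement up to a relabeling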
N = X.shape[1]
fig, axes = plt.subplots(N, N, figsize=(12, 12), sharex=True, sharey=True)
colors = ["coral", "blue", "green"]
markers = ["^", "v", "o"]
for m in range(N):
    for n in range(N):
        for p in range(n_clusters):
            mask = y_pred == p
            axes[m, n].scatter(X[:, m][mask], X[:, n][mask],
                               marker=markers[p], s=30,
                               color=colors[p], alpha=0.25)
        # np.where returns a 1-tuple, so idx is the array of misclassified sample indices
        for idx in np.where(y != y_pred):
            axes[m, n].scatter(X[idx, m], X[idx, n],
                               marker="s", s=30,
                               edgecolor="red",
                               facecolor=(1, 1, 1, 0))
    axes[N-1, m].set_xlabel(iris.feature_names[m], fontsize=16)
    axes[m, 0].set_ylabel(iris.feature_names[m], fontsize=16)
fig.tight_layout()
fig.savefig("ch15-clustering.pdf")
fig.savefig("ch15-clustering.png", dpi=600)
%reload_ext version_information
%version_information sklearn, numpy, matplotlib, seaborn
| Software | Version |
|---|---|
| Python | 3.10.12 64bit [Clang 14.0.6 ] |
| IPython | 8.12.0 |
| OS | macOS 10.15.7 x86\_64 i386 64bit |
| sklearn | 1.3.0 |
| numpy | 1.22.3 |
| matplotlib | 3.7.1 |
| seaborn | 0.12.2 |
| Sun Nov 03 09:20:45 2024 JST | |