Robert Johansson
Source code listings for Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib (ISBN 979-8-8688-0412-0).
from sklearn import datasets
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib as mpl
mpl.rcParams['mathtext.fontset'] = 'stix'
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.sans-serif'] = 'stix'
sns.set(style="whitegrid")
sns.set(style="darkgrid")
datasets.load_wine #()
<function sklearn.datasets._base.load_wine(*, return_X_y=False, as_frame=False)>
datasets.fetch_california_housing
<function sklearn.datasets._california_housing.fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False)>
datasets.make_regression
<function sklearn.datasets._samples_generator.make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)>
np.random.seed(123)
X_all, y_all = datasets.make_regression(n_samples=50, n_features=50, n_informative=10) #, noise=2.5)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_all, y_all, train_size=0.5)
X_train.shape, y_train.shape
((25, 50), (25,))
X_test.shape, y_test.shape
((25, 50), (25,))
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
def sse(resid):
    return sum(resid**2)
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train
5.8411047954908775e-25
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test
(a large value, on the order of 2e5 for this split: the OLS fit reproduces the 25 training samples essentially exactly, but generalizes poorly to the test set)
model.score(X_train, y_train)
1.0
model.score(X_test, y_test)
0.314074006752018
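An aside not in the original listing: the R-squared value reported by model.score can be reproduced directly from the residuals, which makes the connection between the SSE values above and the scores explicit.
# R^2 = 1 - SSE / SST, which is what model.score computes for regressors
sst_test = sum((y_test - y_test.mean())**2)          # total sum of squares of the test targets
1 - sse(y_test - model.predict(X_test)) / sst_test   # should match model.score(X_test, y_test)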
def plot_residuals_and_coeff(resid_train, resid_test, coeff):
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))
    axes[0].bar(np.arange(len(resid_train)), resid_train)
    axes[0].set_xlabel("sample number")
    axes[0].set_ylabel("residual")
    axes[0].set_title("training data")
    axes[1].bar(np.arange(len(resid_test)), resid_test)
    axes[1].set_xlabel("sample number")
    axes[1].set_ylabel("residual")
    axes[1].set_title("testing data")
    axes[2].bar(np.arange(len(coeff)), coeff)
    axes[2].set_xlabel("coefficient number")
    axes[2].set_ylabel("coefficient")
    fig.tight_layout()
    return fig, axes
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-ols.pdf")
model = linear_model.Ridge() #alpha=2.5)
model.fit(X_train, y_train)
Ridge()
resid_train = y_train - model.predict(X_train)
sse_train = sum(resid_train**2)
sse_train
178.50695164950955
resid_test = y_test - model.predict(X_test)
sse_test = sum(resid_test**2)
sse_test
212737.00160105838
model.score(X_train, y_train), model.score(X_test, y_test)
(0.9994595515017335, 0.3167033273607547)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-ridge.pdf")
model = linear_model.Lasso(alpha=1.0)
model.fit(X_train, y_train)
Lasso()
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train
309.7497138953243
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test
1489.1176065002646
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-lasso.pdf")
alphas = np.logspace(-4, 2, 100)
coeffs = np.zeros((len(alphas), X_train.shape[1]))
sse_train = np.zeros_like(alphas)
sse_test = np.zeros_like(alphas)
for n, alpha in enumerate(alphas):
    model = linear_model.Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    coeffs[n, :] = model.coef_
    resid = y_train - model.predict(X_train)
    sse_train[n] = sum(resid**2)
    resid = y_test - model.predict(X_test)
    sse_test[n] = sum(resid**2)
(Output condensed: for several of the alpha values in the scan, sklearn's coordinate-descent solver emits repeated ConvergenceWarning messages, "Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation.", with duality gaps slightly above the tolerance of about 3.3e+01.)
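A possible remedy for these warnings, not applied in the original run, is to give the coordinate-descent solver a larger iteration budget (or a looser tolerance) when scanning the smallest alpha values:
# hedged sketch of a variant of the loop body above:
# model = linear_model.Lasso(alpha=alpha, max_iter=100_000)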
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)
for n in range(coeffs.shape[1]):
    axes[0].plot(np.log10(alphas), coeffs[:, n], color='k', lw=0.5)
axes[1].semilogy(np.log10(alphas), sse_train, label="train")
axes[1].semilogy(np.log10(alphas), sse_test, label="test")
axes[1].legend(loc=0)
axes[0].set_xlabel(r"${\log_{10}}\alpha$", fontsize=18)
axes[0].set_ylabel(r"coefficients", fontsize=18)
axes[1].set_xlabel(r"${\log_{10}}\alpha$", fontsize=18)
axes[1].set_ylabel(r"sse", fontsize=18)
fig.tight_layout()
fig.savefig("ch15-regression-lasso-vs-alpha.pdf")
model = linear_model.LassoCV()
model.fit(X_all, y_all)
LassoCV()
model.alpha_
0.06559238747534717
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train
1.5450589323146602
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test
1.5321417406215792
model.score(X_train, y_train), model.score(X_test, y_test)
(0.9999953221722068, 0.9999950788657098)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-lasso-cv.pdf")
model = linear_model.ElasticNetCV()
model.fit(X_all, y_all)
ElasticNetCV()
model.alpha_
0.13118477495069433
model.l1_ratio
0.5
resid_train = y_train - model.predict(X_train)
sse_train = sum(resid_train**2)
sse_train
2183.83917293912
resid_test = y_test - model.predict(X_test)
sse_test = sum(resid_test**2)
sse_test
2650.0504463382536
model.score(X_train, y_train), model.score(X_test, y_test)
(0.9933881981034111, 0.9914882195448783)
fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)
fig.savefig("ch15-regression-elastic-net-cv.pdf")
iris = datasets.load_iris()
type(iris)
sklearn.utils._bunch.Bunch
iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris.data.shape
(150, 4)
iris.target.shape
(150,)
# print(iris['DESCR'])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
iris.data, iris.target, train_size=0.7, random_state=0)
classifier = linear_model.LogisticRegression()
classifier.fit(X_train, y_train)
/Users/rob/miniconda3/envs/npbook_py310/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
LogisticRegression()
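The convergence warning above can usually be avoided by allowing more iterations or by standardizing the features before fitting; for example (an added suggestion, not part of the original run):
# classifier = linear_model.LogisticRegression(max_iter=1000)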
y_test_pred = classifier.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45
np.bincount(y_test)
array([16, 18, 11])
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
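The overall accuracy quoted in the classification report can be recovered from the confusion matrix as the fraction of samples on its diagonal (an added aside, not in the original listing):
cm = metrics.confusion_matrix(y_test, y_test_pred)
cm.diagonal().sum() / cm.sum()   # 44/45, approximately 0.98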
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
classifier = neighbors.KNeighborsClassifier()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
classifier = svm.SVC()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
classifier = ensemble.RandomForestClassifier()
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_test, y_test_pred)
array([[16, 0, 0],
[ 0, 17, 1],
[ 0, 0, 11]])
train_size_vec = np.linspace(0.1, 0.9, 30)
classifiers = [tree.DecisionTreeClassifier,
neighbors.KNeighborsClassifier,
svm.SVC,
ensemble.RandomForestClassifier
]
cm_diags = np.zeros((3, len(train_size_vec), len(classifiers)), dtype=float)
for n, train_size in enumerate(train_size_vec):
    X_train, X_test, y_train, y_test = \
        model_selection.train_test_split(iris.data, iris.target, train_size=train_size)
    for m, Classifier in enumerate(classifiers):
        classifier = Classifier()
        classifier.fit(X_train, y_train)
        y_test_pred = classifier.predict(X_test)
        cm_diags[:, n, m] = metrics.confusion_matrix(y_test, y_test_pred).diagonal()
        cm_diags[:, n, m] /= np.bincount(y_test)
fig, axes = plt.subplots(1, len(classifiers), figsize=(12, 3))
for m, Classifier in enumerate(classifiers):
    axes[m].plot(train_size_vec, cm_diags[2, :, m], label=iris.target_names[2])
    axes[m].plot(train_size_vec, cm_diags[1, :, m], label=iris.target_names[1])
    axes[m].plot(train_size_vec, cm_diags[0, :, m], label=iris.target_names[0])
    axes[m].set_title(type(Classifier()).__name__)
    axes[m].set_ylim(0, 1.1)
    axes[m].set_xlim(0.1, 0.9)
    axes[m].set_ylabel("classification accuracy")
    axes[m].set_xlabel("training size ratio")
    axes[m].legend(loc=4)
fig.tight_layout()
fig.savefig("ch15-classification-comparison.pdf")
X, y = iris.data, iris.target
np.random.seed(123)
n_clusters = 3
c = cluster.KMeans(n_clusters=n_clusters)
c.fit(X)
/Users/rob/miniconda3/envs/npbook_py310/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans(n_clusters=3)
y_pred = c.predict(X)
y_pred[::8]
array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0],
dtype=int32)
y[::8]
array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
idx_0, idx_1, idx_2 = (np.where(y_pred == n) for n in range(3))
y_pred[idx_0], y_pred[idx_1], y_pred[idx_2] = 2, 0, 1
y_pred[::8]
array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
dtype=int32)
metrics.confusion_matrix(y, y_pred)
array([[50, 0, 0],
[ 0, 48, 2],
[ 0, 14, 36]])
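Because K-means cluster labels are arbitrary, a permutation-invariant way to quantify agreement with the true classes, instead of relabeling by hand as above, is the adjusted Rand index (an added aside, not part of the original listing):
metrics.adjusted_rand_score(y, c.labels_)  # 1.0 would mean perfect agreement up to a relabeling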
N = X.shape[1]
fig, axes = plt.subplots(N, N, figsize=(12, 12), sharex=True, sharey=True)
colors = ["coral", "blue", "green"]
markers = ["^", "v", "o"]
for m in range(N):
    for n in range(N):
        for p in range(n_clusters):
            mask = y_pred == p
            axes[m, n].scatter(X[:, m][mask], X[:, n][mask],
                               marker=markers[p], s=30,
                               color=colors[p], alpha=0.25)
        # np.where returns a 1-tuple, so idx is the array of misclassified sample indices
        for idx in np.where(y != y_pred):
            axes[m, n].scatter(X[idx, m], X[idx, n],
                               marker="s", s=30,
                               edgecolor="red",
                               facecolor=(1, 1, 1, 0))
    axes[N-1, m].set_xlabel(iris.feature_names[m], fontsize=16)
    axes[m, 0].set_ylabel(iris.feature_names[m], fontsize=16)
fig.tight_layout()
fig.savefig("ch15-clustering.pdf")
fig.savefig("ch15-clustering.png", dpi=600)
%reload_ext version_information
%version_information sklearn, numpy, matplotlib, seaborn
| Software | Version |
|---|---|
| Python | 3.10.12 64bit [Clang 14.0.6 ] |
| IPython | 8.12.0 |
| OS | macOS 10.15.7 x86\_64 i386 64bit |
| sklearn | 1.3.0 |
| numpy | 1.22.3 |
| matplotlib | 3.7.1 |
| seaborn | 0.12.2 |
| Sun Nov 03 09:20:45 2024 JST | |