(Notebook prepared by Pr Fabien MOUTARDE, Center for Robotics, MINES ParisTech, PSL Université Paris)
###########################################################################################
# Author: Pr Fabien MOUTARDE, Center for Robotics, MINES ParisTech, PSL Research University
###########################################################################################
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
# Create artificial dataset (classification problem with 2 classes in a R^2 input space)
Xmoon, y_moon = make_moons(n_samples=900, noise=0.2, random_state=0)
# Preprocess dataset, and split into training and test part
Xmoon = StandardScaler().fit_transform(Xmoon)
Xmoon_train, Xmoon_test, y_moon_train, y_moon_test = train_test_split(Xmoon, y_moon, test_size=0.7)
# Encode class labels as binary vector (with exactly ONE bit set to 1, and all others to 0)
Ymoon_train_OneHot = np.eye(2)[y_moon_train]
Ymoon_test_OneHot = np.eye(2)[y_moon_test]
# Print beginning of training dataset (for verification)
print("Number of training examples = ", y_moon_train.size)
print()
print(" first ", round(y_moon_train.size/10), "training examples" )
print("[ Input_features ] [Target_output]")
for i in range( int(round(y_moon_train.size/10)) ):
    print( Xmoon_train[i], Ymoon_train_OneHot[i])
# Plot training+testing dataset
################################
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
# Plot the training points...
plt.scatter(Xmoon_train[:, 0], Xmoon_train[:, 1], c=y_moon_train, cmap=cm_bright)
# ...and testing points
plt.scatter(Xmoon_test[:, 0], Xmoon_test[:, 1], marker='x', c=y_moon_test, cmap=cm_bright, alpha=0.3)
# Define limits/scale of plot axis
x_min, x_max = Xmoon[:, 0].min() - .5, Xmoon[:, 0].max() + .5
y_min, y_max = Xmoon[:, 1].min() - .5, Xmoon[:, 1].max() + .5
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
# Actually render the plot
print()
print("PLOT OF TRAINING EXAMPLES AND TEST DATASET")
print("Datasets: circles=training, light-crosses=test [and red=class_1, blue=class_2]")
plt.ioff()
plt.show()
Number of training examples =  270

 first  27 training examples
[ Input_features ] [Target_output]
[-0.3679812  -1.86009484] [0. 1.]
[0.62800901 1.19257404] [1. 0.]
[0.33051721 0.40740697] [1. 0.]
[-0.25971931 -0.15739371] [0. 1.]
[-0.02145466 -1.02559593] [0. 1.]
[-0.68154896 -0.31988568] [0. 1.]
[0.18197496 1.34115123] [1. 0.]
[-0.2243102  -1.18873707] [0. 1.]
[-0.25900409  1.49419652] [1. 0.]
[ 1.40266943 -1.56227453] [0. 1.]
[-1.40154583  0.46345254] [1. 0.]
[ 0.51227218 -0.06210374] [1. 0.]
[-0.20894604 -0.11402918] [0. 1.]
[-1.06200566  1.53211713] [1. 0.]
[-0.27498973  1.699569  ] [1. 0.]
[0.08606107 0.23109958] [1. 0.]
[0.28524607 1.0570958 ] [1. 0.]
[ 1.23305107 -1.02088605] [0. 1.]
[ 1.26386763 -1.12944512] [0. 1.]
[ 0.79973754 -1.62096834] [0. 1.]
[ 0.64140736 -1.27886427] [0. 1.]
[-1.10724722 -0.09380121] [1. 0.]
[-1.28468144  1.16889587] [1. 0.]
[ 1.23761378 -1.23192948] [0. 1.]
[-0.11844324 -0.58011352] [0. 1.]
[0.58699126 0.7784088 ] [1. 0.]
[-0.4905587  -0.91391279] [0. 1.]

PLOT OF TRAINING EXAMPLES AND TEST DATASET
Datasets: circles=training, light-crosses=test [and red=class_1, blue=class_2]
Building, training and evaluating a simple Neural Network classifier (Multi Layer Perceptron, MLP)
The SciKit-learn class for MLPs is MLPClassifier. Please first read the MLPClassifier documentation to understand all the parameters of the constructor. You can then begin by running the code block below, in which an initial set of hyper-parameter values has been chosen. YOU MAY NEED TO CHANGE AT LEAST THE NUMBER OF HIDDEN NEURONS (and probably other hyper-parameters) IN ORDER TO BE ABLE TO LEARN A CORRECT CLASSIFIER.
#########################################################
# Create, fit and evaluate a MLP neural network classifier
#########################################################
from sklearn.neural_network import MLPClassifier
# Create the MLP (with specific values for hyper-parameters)
mlp = MLPClassifier(hidden_layer_sizes=(1, ), activation='tanh', solver='sgd',
alpha=0.0000001, batch_size=4, learning_rate='constant', learning_rate_init=0.005,
power_t=0.5, max_iter=9, shuffle=True, random_state=11, tol=0.00001,
verbose=True, warm_start=False, momentum=0.8, nesterovs_momentum=True,
early_stopping=False, validation_fraction=0.2,
beta_1=0.9, beta_2=0.999, epsilon=1e-08)
print(mlp)
# NB about syntax for hidden layers: hidden_layer_sizes=(H1, ) means ONE hidden layer containing H1 neurons,
# while hidden_layer_sizes=(H1,H2, ) would mean TWO hidden layers of respective sizes H1 and H2
# NB about iteration: max_iter specifies a number of EPOCHS (= going through all training examples)
# Train the MLP classifier on the training dataset
mlp.fit(Xmoon_train, Ymoon_train_OneHot)
print()
# Plot the LEARNING CURVE
plt.title("Evolution of TRAINING ERROR during training")
plt.xlabel("Iterations (epochs)")
plt.ylabel("TRAINING ERROR")
plt.plot(mlp.loss_curve_)
plt.show()
# Evaluate accuracy on TEST data
score = mlp.score(Xmoon_test, Ymoon_test_OneHot)
print("Accuracy (on test set) = ", score)
MLPClassifier(activation='tanh', alpha=1e-07, batch_size=4, hidden_layer_sizes=(1,),
              learning_rate_init=0.005, max_iter=9, momentum=0.8, random_state=11,
              solver='sgd', tol=1e-05, validation_fraction=0.2, verbose=True)
Iteration 1, loss = 1.68023624
Iteration 2, loss = 1.48509406
Iteration 3, loss = 1.38305513
Iteration 4, loss = 1.29878734
Iteration 5, loss = 1.20252111
Iteration 6, loss = 1.09824327
Iteration 7, loss = 1.00176541
Iteration 8, loss = 0.90904336
Iteration 9, loss = 0.81725922
C:\Users\fabien\anaconda3_2020-07\envs\envML2020\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (9) reached and the optimization hasn't converged yet. % self.max_iter, ConvergenceWarning)
Accuracy (on test set) =  0.8507936507936508
THIS SHOULD HELP YOU UNDERSTAND WHAT HAPPENS IF THERE ARE NOT ENOUGH HIDDEN NEURONS
Optional: add code that visualises, on the same plot, the straight line corresponding to each hidden neuron (you will need to dig into the MLPClassifier documentation to find the 2 input weights and the bias of each hidden neuron); a hedged sketch of this is given after the decision-boundary plot below. YOU SHOULD NOTICE THAT THE CLASSIFICATION BOUNDARY IS SOME INTERPOLATION BETWEEN THOSE STRAIGHT LINES.
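For comparison, here is a minimal sketch of a larger configuration trained for more epochs (the hidden-layer size, max_iter and other values below are illustrative choices, not prescriptive ones, and this cell is not part of the original notebook). The model is stored in a separate variable, mlp_big, so that the decision-boundary plot below still shows the under-sized mlp.
######################################################################
# OPTIONAL SKETCH: same kind of MLP, but with a larger hidden layer
# and more training epochs (illustrative hyper-parameter values only)
######################################################################
mlp_big = MLPClassifier(hidden_layer_sizes=(20, ), activation='tanh', solver='sgd',
                        alpha=0.0000001, batch_size=4, learning_rate='constant',
                        learning_rate_init=0.005, max_iter=200, shuffle=True,
                        random_state=11, tol=0.00001, momentum=0.8, verbose=False)
# Train on the same training set and evaluate on the same test set as above
mlp_big.fit(Xmoon_train, Ymoon_train_OneHot)
print("Accuracy of larger MLP (on test set) = ", mlp_big.score(Xmoon_test, Ymoon_test_OneHot))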
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
h = .02 # Step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# Compute class probabilities for each mesh point
Z = mlp.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8)
# Plot also the training points
plt.scatter(Xmoon_train[:, 0], Xmoon_train[:, 1], c=y_moon_train, cmap=cm_bright)
# and testing points
plt.scatter(Xmoon_test[:, 0], Xmoon_test[:, 1], marker='x', c=y_moon_test, cmap=cm_bright, alpha=0.3)
# Axis ranges
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
# Print accuracy on plot
plt.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
size=15, horizontalalignment='right')
# Actually plot
plt.ioff()
plt.show()
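Below is a minimal sketch of the optional visualization mentioned above (this cell is an addition, not part of the original notebook): each hidden neuron of the trained mlp defines a straight line where its pre-activation is zero. The input weights are in the scikit-learn attribute mlp.coefs_[0] and the biases in mlp.intercepts_[0]; other names (xs, w0, w1, b) are just illustrative.
#####################################################################
# OPTIONAL SKETCH: draw the straight line associated with each hidden neuron
# For hidden neuron j, the line is w0*x + w1*y + b = 0,
# with (w0, w1) = mlp.coefs_[0][:, j] and b = mlp.intercepts_[0][j]
#####################################################################
xs = np.linspace(x_min, x_max, 100)
# Show the training points for reference
plt.scatter(Xmoon_train[:, 0], Xmoon_train[:, 1], c=y_moon_train, cmap=cm_bright)
for j in range(mlp.coefs_[0].shape[1]):
    w0, w1 = mlp.coefs_[0][0, j], mlp.coefs_[0][1, j]
    b = mlp.intercepts_[0][j]
    if abs(w1) > 1e-12:
        # Non-vertical line: y = -(w0*x + b) / w1
        plt.plot(xs, -(w0 * xs + b) / w1, 'k--')
    else:
        # (Near-)vertical line: x = -b / w0
        plt.axvline(-b / w0, color='k', linestyle='--')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()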
Now check the impact of the main learning hyper-parameters by changing the MLPClassifier parameters above and then rerunning training + evaluation + plots:
Because the values of learning hyper-parameters can DRASTICALLY change the outcome of training, it is ESSENTIAL THAT YOU ALWAYS TRY TO FIND OPTIMIZED VALUES FOR THE ALGORITHM HYPER-PARAMETERS. And this ABSOLUTELY NEEDS TO BE DONE USING "VALIDATION", either with a validation set separate from the training set, or using cross-validation. CROSS-VALIDATION is the MOST ROBUST WAY OF FINDING OPTIMIZED HYPER-PARAMETER VALUES, and the GridSearchCV class of scikit-learn makes this rather straightforward.
WARNING: GridSearchCV launches many successive training sessions, so it can take rather long to execute if you compare too many hyper-parameter combinations (passing n_jobs=-1 to GridSearchCV lets it run the different trainings in parallel on all CPU cores).
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
param_grid = [
{'hidden_layer_sizes': [(5,), (10,), (20,), (40,)],
'learning_rate_init':[0.003, 0.01, 0.03, 0.1],
'alpha': [0.00001, 0.0001, 0.001]}
]
print(param_grid)
# Cross-validation grid-search (for finding best possible accuracy)
clf = GridSearchCV( MLPClassifier(activation='tanh', alpha=1e-07, batch_size=4, beta_1=0.9,
beta_2=0.999, early_stopping=True, epsilon=1e-08,
hidden_layer_sizes=(10,), learning_rate='constant',
learning_rate_init=0.005, max_iter=500, momentum=0.8,
nesterovs_momentum=True, power_t=0.5, random_state=11, shuffle=True,
solver='adam', tol=1e-05, validation_fraction=0.3, verbose=False,
warm_start=False),
param_grid, cv=3, scoring='accuracy')
# NOTE THAT YOU CAN USE ANOTHER VALUE FOR cv (number of folds) AND SCORING CRITERIA OTHER THAN 'accuracy'
clf.fit(Xmoon_train, Ymoon_train_OneHot)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = Ymoon_test_OneHot, clf.predict(Xmoon_test)
print(classification_report(y_true, y_pred))
print()
[{'hidden_layer_sizes': [(5,), (10,), (20,), (40,)], 'learning_rate_init': [0.003, 0.01, 0.03, 0.1], 'alpha': [1e-05, 0.0001, 0.001]}]

Best parameters set found on development set:

{'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}

Grid scores on development set:

0.830 (+/-0.121) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.003}
0.859 (+/-0.117) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.911 (+/-0.048) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.03}
0.904 (+/-0.021) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.1}
0.844 (+/-0.158) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.003}
0.859 (+/-0.151) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.911 (+/-0.031) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.03}
0.915 (+/-0.028) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.1}
0.863 (+/-0.069) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.003}
0.881 (+/-0.064) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.01}
0.941 (+/-0.064) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}
0.930 (+/-0.042) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.1}
0.844 (+/-0.127) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.003}
0.885 (+/-0.052) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.01}
0.904 (+/-0.076) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.03}
0.911 (+/-0.036) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.1}
0.830 (+/-0.121) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.003}
0.859 (+/-0.117) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.911 (+/-0.048) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.03}
0.904 (+/-0.021) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.1}
0.844 (+/-0.158) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.003}
0.859 (+/-0.151) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.915 (+/-0.021) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.03}
0.915 (+/-0.028) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.1}
0.863 (+/-0.069) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.003}
0.881 (+/-0.064) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.01}
0.941 (+/-0.064) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}
0.926 (+/-0.038) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.1}
0.844 (+/-0.127) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.003}
0.885 (+/-0.052) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.01}
0.907 (+/-0.069) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.03}
0.930 (+/-0.073) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.1}
0.830 (+/-0.121) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.003}
0.859 (+/-0.117) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.904 (+/-0.028) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.03}
0.907 (+/-0.010) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.1}
0.844 (+/-0.158) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.003}
0.852 (+/-0.137) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.926 (+/-0.046) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.03}
0.933 (+/-0.048) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.1}
0.863 (+/-0.069) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.003}
0.881 (+/-0.064) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.01}
0.937 (+/-0.064) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}
0.896 (+/-0.055) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.1}
0.844 (+/-0.127) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.003}
0.885 (+/-0.052) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.01}
0.885 (+/-0.010) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.03}
0.922 (+/-0.073) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.1}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       323
           1       0.93      0.97      0.95       307

   micro avg       0.95      0.95      0.95       630
   macro avg       0.95      0.95      0.95       630
weighted avg       0.95      0.95      0.95       630
 samples avg       0.95      0.95      0.95       630
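After the grid search, the refitted best model can be inspected and reused directly. Here is a minimal sketch (this cell is an addition; best_score_, best_params_ and best_estimator_ are standard GridSearchCV attributes, while best_mlp is just an illustrative variable name):
# Inspect the outcome of the grid search and reuse the best model
print("Best cross-validation accuracy = ", clf.best_score_)
print("Best hyper-parameters = ", clf.best_params_)
# best_estimator_ is the MLP refitted on the whole training set (refit=True by default)
best_mlp = clf.best_estimator_
print("Test-set accuracy of best MLP = ", best_mlp.score(Xmoon_test, Ymoon_test_OneHot))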
Please FIRST READ the Digits DATASET DESCRIPTION. In this classification problem, there are 10 classes, with a total of 1797 examples (each one being a 64D vector corresponding to an 8x8 pixmap).
Assignment #1: find out what learning hyper-parameters should be modified in order to obtain a satisfying MLP digits classifier
Assignment #2: modify the code below to use cross-validation, and find the best training hyper-parameters and MLP classifier you can for this handwritten digits classification task.
Assignment #3: compute and plot the precision-recall curve (for each class). NB: search the scikit-learn documentation to find the function for that, and then add a code cell that uses it (a hedged sketch is given at the end of this section).
Assignment #4: display the confusion matrix as a prettier and more easily understandable plot (cf. example on https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html); a sketch is also given at the end of this section.
Assignment #5 (optional): plot the first layer of weights as images (see explanations and example code at http://scikit-learn.org/stable/auto_examples/neural_networks/plot_mnist_filters.html)
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
digits = load_digits()
n_samples = len(digits.images)
print("Number_of-examples = ", n_samples)
import matplotlib.pyplot as plt
print("\n Plot of first example")
plt.gray()
plt.matshow(digits.images[0])
print("CLOSE PLOT WINDOW TO CONTINUE")
plt.ioff()
plt.show()
# Flatten the images, to turn the data into a (samples, features) matrix:
data = digits.images.reshape((n_samples, -1))
Xdigits = data
y_digits = digits.target
Xdigits_train, Xdigits_test, y_digits_train, y_digits_test = train_test_split(Xdigits, y_digits, test_size=0.5)
clf = MLPClassifier(hidden_layer_sizes=(10, ), activation='tanh', solver='sgd',
alpha=0.00001, batch_size=4, learning_rate='constant', learning_rate_init=0.01,
power_t=0.5, max_iter=9, shuffle=True, random_state=11, tol=0.00001,
verbose=True, warm_start=False, momentum=0.8, nesterovs_momentum=True,
early_stopping=False, validation_fraction=0.1,
beta_1=0.9, beta_2=0.999, epsilon=1e-08)
print(clf)
# Train the MLP classifier on training dataset
clf.fit(Xdigits_train, y_digits_train)
# Plot the LEARNING CURVE
plt.title("Evolution of TRAINING ERROR during training")
plt.xlabel("Iterations (epochs)")
plt.ylabel("TRAINING ERROR")
plt.plot(clf.loss_curve_)
plt.show()
# Evaluate accuracy on test data
score = clf.score(Xdigits_test, y_digits_test)
print("Accuracy (on test set) = ", score)
y_true, y_pred = y_digits_test, clf.predict(Xdigits_test)
print(classification_report(y_true, y_pred))
# Display CONFUSION MATRIX on TEST set
from sklearn.metrics import confusion_matrix
print("CONFUSION MATRIX below")
confusion_matrix(y_true, y_pred)
Number of examples =  1797

 Plot of first example
CLOSE PLOT WINDOW TO CONTINUE
MLPClassifier(activation='tanh', alpha=1e-05, batch_size=4, hidden_layer_sizes=(10,),
              learning_rate_init=0.01, max_iter=9, momentum=0.8, random_state=11,
              solver='sgd', tol=1e-05, verbose=True)
Iteration 1, loss = 2.04081277
Iteration 2, loss = 1.56330125
Iteration 3, loss = 1.23753682
Iteration 4, loss = 1.15332191
Iteration 5, loss = 0.91743745
Iteration 6, loss = 0.84968576
Iteration 7, loss = 0.79511375
Iteration 8, loss = 0.85375545
Iteration 9, loss = 0.75832428
C:\Users\fabien\anaconda3_2020-07\envs\envML2020\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (9) reached and the optimization hasn't converged yet. % self.max_iter, ConvergenceWarning)
Accuracy (on test set) =  0.7864293659621802

              precision    recall  f1-score   support

           0       0.81      0.99      0.89        91
           1       0.73      0.74      0.73        91
           2       0.91      0.92      0.91        84
           3       0.60      0.91      0.73        89
           4       0.86      0.90      0.88        88
           5       0.78      0.87      0.82        95
           6       0.82      0.91      0.86        94
           7       0.94      0.90      0.92        89
           8       0.57      0.14      0.22        94
           9       0.78      0.61      0.68        84

    accuracy                           0.79       899
   macro avg       0.78      0.79      0.77       899
weighted avg       0.78      0.79      0.76       899

CONFUSION MATRIX below
array([[90,  0,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 0, 67,  0,  0,  2,  3, 11,  0,  0,  8],
       [ 0,  1, 77,  4,  0,  0,  0,  1,  0,  1],
       [ 0,  0,  1, 81,  0,  2,  1,  1,  3,  0],
       [ 2,  1,  0,  0, 79,  0,  4,  1,  0,  1],
       [ 3,  0,  3,  1,  0, 83,  0,  1,  3,  1],
       [ 2,  0,  1,  0,  0,  4, 86,  0,  0,  1],
       [ 0,  0,  2,  0,  3,  0,  0, 80,  4,  0],
       [ 2, 21,  0, 39,  1, 13,  3,  0, 13,  2],
       [12,  2,  1,  9,  6,  2,  0,  1,  0, 51]], dtype=int64)
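As announced in assignments #3 and #4, here is a minimal sketch of one possible solution outline (this cell is an addition, not the official solution): it uses the trained clf above, the scikit-learn function precision_recall_curve for per-class precision-recall curves, and the ConfusionMatrixDisplay class (available in scikit-learn >= 0.22) for a color-coded confusion-matrix plot; variable names such as probas and cm_digits are just illustrative.
########################################################################
# OPTIONAL SKETCH for assignments #3 and #4
########################################################################
from sklearn.metrics import precision_recall_curve, confusion_matrix, ConfusionMatrixDisplay

# Assignment #3: precision-recall curve for each of the 10 classes,
# using the per-class probabilities predicted by the MLP on the test set
probas = clf.predict_proba(Xdigits_test)          # shape (n_samples, 10)
for k in range(10):
    precision, recall, _ = precision_recall_curve(y_digits_test == k, probas[:, k])
    plt.plot(recall, precision, label="class %d" % k)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Per-class precision-recall curves (test set)")
plt.legend(fontsize='small')
plt.show()

# Assignment #4: confusion matrix displayed as a color-coded plot
cm_digits = confusion_matrix(y_digits_test, clf.predict(Xdigits_test))
ConfusionMatrixDisplay(confusion_matrix=cm_digits, display_labels=np.arange(10)).plot(cmap=plt.cm.Blues)
plt.show()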