(Notebook prepared by Pr Fabien MOUTARDE, Center for Robotics, MINES ParisTech, PSL Université Paris)
###########################################################################################
# Author: Pr Fabien MOUTARDE, Center for Robotics, MINES ParisTech, PSL Research University
###########################################################################################
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
# Create artificial dataset (classification problem with 2 classes in a R^2 input space)
Xmoon, y_moon = make_moons(n_samples=900, noise=0.2, random_state=0)
# Preprocess dataset, and split into training and test part
Xmoon = StandardScaler().fit_transform(Xmoon)
Xmoon_train, Xmoon_test, y_moon_train, y_moon_test = train_test_split(Xmoon, y_moon, test_size=0.7)
# Encode class labels as binary vector (with exactly ONE bit set to 1, and all others to 0)
Ymoon_train_OneHot = np.eye(2)[y_moon_train]
Ymoon_test_OneHot = np.eye(2)[y_moon_test]
# Print beginning of training dataset (for verification)
print("Number of training examples = ", y_moon_train.size)
print()
print(" first ", round(y_moon_train.size/10), "training examples" )
print("[ Input_features ] [Target_output]")
for i in range( int(round(y_moon_train.size/10)) ):
    print( Xmoon_train[i], Ymoon_train_OneHot[i])
# Plot training+testing dataset
################################
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
# Plot the training points...
plt.scatter(Xmoon_train[:, 0], Xmoon_train[:, 1], c=y_moon_train, cmap=cm_bright)
# ...and testing points
plt.scatter(Xmoon_test[:, 0], Xmoon_test[:, 1], marker='x', c=y_moon_test, cmap=cm_bright, alpha=0.3)
# Define limits/scale of plot axis
x_min, x_max = Xmoon[:, 0].min() - .5, Xmoon[:, 0].max() + .5
y_min, y_max = Xmoon[:, 1].min() - .5, Xmoon[:, 1].max() + .5
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
# Actually render the plot
print()
print("PLOT OF TRAINING EXAMPLES AND TEST DATASET")
print("Datasets: circles=training, light-crosses=test [and red=class_1, blue=class_2]")
plt.ioff()
plt.show()
Number of training examples =  270

 first  27 training examples
[ Input_features ] [Target_output]
[-0.3679812  -1.86009484] [0. 1.]
[0.62800901 1.19257404] [1. 0.]
[0.33051721 0.40740697] [1. 0.]
[-0.25971931 -0.15739371] [0. 1.]
[-0.02145466 -1.02559593] [0. 1.]
[-0.68154896 -0.31988568] [0. 1.]
[0.18197496 1.34115123] [1. 0.]
[-0.2243102  -1.18873707] [0. 1.]
[-0.25900409  1.49419652] [1. 0.]
[ 1.40266943 -1.56227453] [0. 1.]
[-1.40154583  0.46345254] [1. 0.]
[ 0.51227218 -0.06210374] [1. 0.]
[-0.20894604 -0.11402918] [0. 1.]
[-1.06200566  1.53211713] [1. 0.]
[-0.27498973  1.699569  ] [1. 0.]
[0.08606107 0.23109958] [1. 0.]
[0.28524607 1.0570958 ] [1. 0.]
[ 1.23305107 -1.02088605] [0. 1.]
[ 1.26386763 -1.12944512] [0. 1.]
[ 0.79973754 -1.62096834] [0. 1.]
[ 0.64140736 -1.27886427] [0. 1.]
[-1.10724722 -0.09380121] [1. 0.]
[-1.28468144  1.16889587] [1. 0.]
[ 1.23761378 -1.23192948] [0. 1.]
[-0.11844324 -0.58011352] [0. 1.]
[0.58699126 0.7784088 ] [1. 0.]
[-0.4905587  -0.91391279] [0. 1.]

PLOT OF TRAINING EXAMPLES AND TEST DATASET
Datasets: circles=training, light-crosses=test [and red=class_1, blue=class_2]
Building, training and evaluating a simple Neural Network classifier (Multi Layer Perceptron, MLP)
The SciKit-learn class for MLPs is MLPClassifier. Please first read the MLPClassifier documentation to understand all the parameters of the constructor. You can then begin by running the code block below, in which an initial set of hyper-parameter values has been chosen. YOU MAY NEED TO CHANGE AT LEAST THE NUMBER OF HIDDEN NEURONS (and probably other hyper-parameters) IN ORDER TO BE ABLE TO LEARN A CORRECT CLASSIFIER.
#########################################################
# Create, fit and evaluate a MLP neural network classifier
#########################################################
from sklearn.neural_network import MLPClassifier
# Create the MLP (with specific values for hyper-parameters)
mlp = MLPClassifier(hidden_layer_sizes=(1, ), activation='tanh', solver='sgd',
alpha=0.0000001, batch_size=4, learning_rate='constant', learning_rate_init=0.005,
power_t=0.5, max_iter=9, shuffle=True, random_state=11, tol=0.00001,
verbose=True, warm_start=False, momentum=0.8, nesterovs_momentum=True,
early_stopping=False, validation_fraction=0.2,
beta_1=0.9, beta_2=0.999, epsilon=1e-08)
print(mlp)
# NB about syntax for hidden layers: hidden_layer_sizes=(H1, ) means ONE hidden layer containing H1 neurons,
# while hidden_layer_sizes=(H1,H2, ) would mean TWO hidden layers of respective sizes H1 and H2
# NB about iteration: max_iter specifies a number of EPOCHS (= going through all training examples)
# Train the MLP classifier on the training dataset
mlp.fit(Xmoon_train, Ymoon_train_OneHot)
print()
# Plot the LEARNING CURVE
plt.title("Evolution of TRAINING ERROR during training")
plt.xlabel("Iterations (epochs)")
plt.ylabel("TRAINING ERROR")
plt.plot(mlp.loss_curve_)
plt.show()
# Evaluate accuracy on TEST data
score = mlp.score(Xmoon_test, Ymoon_test_OneHot)
print("Accuracy (on test set) = ", score)
MLPClassifier(activation='tanh', alpha=1e-07, batch_size=4, hidden_layer_sizes=(1,),
              learning_rate_init=0.005, max_iter=9, momentum=0.8, random_state=11,
              solver='sgd', tol=1e-05, validation_fraction=0.2, verbose=True)
Iteration 1, loss = 1.68023624
Iteration 2, loss = 1.48509406
Iteration 3, loss = 1.38305513
Iteration 4, loss = 1.29878734
Iteration 5, loss = 1.20252111
Iteration 6, loss = 1.09824327
Iteration 7, loss = 1.00176541
Iteration 8, loss = 0.90904336
Iteration 9, loss = 0.81725922
C:\Users\fabien\anaconda3_2020-07\envs\envML2020\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (9) reached and the optimization hasn't converged yet. % self.max_iter, ConvergenceWarning)
Accuracy (on test set) =  0.8507936507936508
THIS SHOULD HELP YOU UNDERSTAND WHAT HAPPENS IF THERE ARE NOT ENOUGH HIDDEN NEURONS
Optional: add code that visualises, on the same plot, the straight line corresponding to each hidden neuron (you will need to dig into the MLPClassifier documentation to find the 2 input weights and the bias of each hidden neuron); a hedged sketch of this is given after the decision-boundary plot below. YOU SHOULD NOTICE THAT THE CLASSIFICATION BOUNDARY IS SOME INTERPOLATION BETWEEN THOSE STRAIGHT LINES.
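For comparison, here is a minimal sketch of a larger configuration trained for more epochs (the hidden-layer size, max_iter and other values below are illustrative choices, not prescriptive ones, and this cell is not part of the original notebook). The model is stored in a separate variable, mlp_big, so that the decision-boundary plot below still shows the under-sized mlp.
######################################################################
# OPTIONAL SKETCH: same kind of MLP, but with a larger hidden layer
# and more training epochs (illustrative hyper-parameter values only)
######################################################################
mlp_big = MLPClassifier(hidden_layer_sizes=(20, ), activation='tanh', solver='sgd',
                        alpha=0.0000001, batch_size=4, learning_rate='constant',
                        learning_rate_init=0.005, max_iter=200, shuffle=True,
                        random_state=11, tol=0.00001, momentum=0.8, verbose=False)
# Train on the same training set and evaluate on the same test set as above
mlp_big.fit(Xmoon_train, Ymoon_train_OneHot)
print("Accuracy of larger MLP (on test set) = ", mlp_big.score(Xmoon_test, Ymoon_test_OneHot))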
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
h = .02 # Step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# Compute class probabilities for each mesh point
Z = mlp.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8)
# Plot also the training points
plt.scatter(Xmoon_train[:, 0], Xmoon_train[:, 1], c=y_moon_train, cmap=cm_bright)
# and testing points
plt.scatter(Xmoon_test[:, 0], Xmoon_test[:, 1], marker='x', c=y_moon_test, cmap=cm_bright, alpha=0.3)
# Axis ranges
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
# Print accuracy on plot
plt.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
size=15, horizontalalignment='right')
# Actually plot
plt.ioff()
plt.show()
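Below is a minimal sketch of the optional visualization mentioned above (this cell is an addition, not part of the original notebook): each hidden neuron of the trained mlp defines a straight line where its pre-activation is zero. The input weights are in the scikit-learn attribute mlp.coefs_[0] and the biases in mlp.intercepts_[0]; other names (xs, w0, w1, b) are just illustrative.
#####################################################################
# OPTIONAL SKETCH: draw the straight line associated with each hidden neuron
# For hidden neuron j, the line is w0*x + w1*y + b = 0,
# with (w0, w1) = mlp.coefs_[0][:, j] and b = mlp.intercepts_[0][j]
#####################################################################
xs = np.linspace(x_min, x_max, 100)
# Show the training points for reference
plt.scatter(Xmoon_train[:, 0], Xmoon_train[:, 1], c=y_moon_train, cmap=cm_bright)
for j in range(mlp.coefs_[0].shape[1]):
    w0, w1 = mlp.coefs_[0][0, j], mlp.coefs_[0][1, j]
    b = mlp.intercepts_[0][j]
    if abs(w1) > 1e-12:
        # Non-vertical line: y = -(w0*x + b) / w1
        plt.plot(xs, -(w0 * xs + b) / w1, 'k--')
    else:
        # (Near-)vertical line: x = -b / w0
        plt.axvline(-b / w0, color='k', linestyle='--')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()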
Now check the impact of the main learning hyper-parameters by changing the MLPClassifier parameters above and then rerunning training + evaluation + plots:
Because the values of learning hyper-parameters can DRASTICALLY change the outcome of training, it is ESSENTIAL THAT YOU ALWAYS TRY TO FIND OPTIMIZED VALUES FOR THE ALGORITHM HYPER-PARAMETERS. And this ABSOLUTELY NEEDS TO BE DONE USING "VALIDATION", either with a validation set separate from the training set, or using cross-validation. CROSS-VALIDATION is the MOST ROBUST WAY OF FINDING OPTIMIZED HYPER-PARAMETER VALUES, and the GridSearchCV class of scikit-learn makes this rather straightforward.
WARNING: GridSearchCV launches many successive training sessions, so it can take rather long to execute if you compare too many hyper-parameter combinations (passing n_jobs=-1 to GridSearchCV lets it run the different trainings in parallel on all CPU cores).
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
param_grid = [
{'hidden_layer_sizes': [(5,), (10,), (20,), (40,)],
'learning_rate_init':[0.003, 0.01, 0.03, 0.1],
'alpha': [0.00001, 0.0001, 0.001]}
]
print(param_grid)
# Cross-validation grid-search (for finding best possible accuracy)
clf = GridSearchCV( MLPClassifier(activation='tanh', alpha=1e-07, batch_size=4, beta_1=0.9,
beta_2=0.999, early_stopping=True, epsilon=1e-08,
hidden_layer_sizes=(10,), learning_rate='constant',
learning_rate_init=0.005, max_iter=500, momentum=0.8,
nesterovs_momentum=True, power_t=0.5, random_state=11, shuffle=True,
solver='adam', tol=1e-05, validation_fraction=0.3, verbose=False,
warm_start=False),
param_grid, cv=3, scoring='accuracy')
# NOTE THAT YOU CAN USE ANOTHER VALUE FOR cv (number of folds) AND SCORING CRITERIA OTHER THAN 'accuracy'
clf.fit(Xmoon_train, Ymoon_train_OneHot)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = Ymoon_test_OneHot, clf.predict(Xmoon_test)
print(classification_report(y_true, y_pred))
print()
[{'hidden_layer_sizes': [(5,), (10,), (20,), (40,)], 'learning_rate_init': [0.003, 0.01, 0.03, 0.1], 'alpha': [1e-05, 0.0001, 0.001]}]

Best parameters set found on development set:

{'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}

Grid scores on development set:

0.830 (+/-0.121) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.003}
0.859 (+/-0.117) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.911 (+/-0.048) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.03}
0.904 (+/-0.021) for {'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.1}
0.844 (+/-0.158) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.003}
0.859 (+/-0.151) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.911 (+/-0.031) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.03}
0.915 (+/-0.028) for {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.1}
0.863 (+/-0.069) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.003}
0.881 (+/-0.064) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.01}
0.941 (+/-0.064) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}
0.930 (+/-0.042) for {'alpha': 1e-05, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.1}
0.844 (+/-0.127) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.003}
0.885 (+/-0.052) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.01}
0.904 (+/-0.076) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.03}
0.911 (+/-0.036) for {'alpha': 1e-05, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.1}
0.830 (+/-0.121) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.003}
0.859 (+/-0.117) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.911 (+/-0.048) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.03}
0.904 (+/-0.021) for {'alpha': 0.0001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.1}
0.844 (+/-0.158) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.003}
0.859 (+/-0.151) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.915 (+/-0.021) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.03}
0.915 (+/-0.028) for {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.1}
0.863 (+/-0.069) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.003}
0.881 (+/-0.064) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.01}
0.941 (+/-0.064) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}
0.926 (+/-0.038) for {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.1}
0.844 (+/-0.127) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.003}
0.885 (+/-0.052) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.01}
0.907 (+/-0.069) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.03}
0.930 (+/-0.073) for {'alpha': 0.0001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.1}
0.830 (+/-0.121) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.003}
0.859 (+/-0.117) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.904 (+/-0.028) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.03}
0.907 (+/-0.010) for {'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.1}
0.844 (+/-0.158) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.003}
0.852 (+/-0.137) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.926 (+/-0.046) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.03}
0.933 (+/-0.048) for {'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.1}
0.863 (+/-0.069) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.003}
0.881 (+/-0.064) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.01}
0.937 (+/-0.064) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.03}
0.896 (+/-0.055) for {'alpha': 0.001, 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.1}
0.844 (+/-0.127) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.003}
0.885 (+/-0.052) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.01}
0.885 (+/-0.010) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.03}
0.922 (+/-0.073) for {'alpha': 0.001, 'hidden_layer_sizes': (40,), 'learning_rate_init': 0.1}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       323
           1       0.93      0.97      0.95       307

   micro avg       0.95      0.95      0.95       630
   macro avg       0.95      0.95      0.95       630
weighted avg       0.95      0.95      0.95       630
 samples avg       0.95      0.95      0.95       630
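After the grid search, the refitted best model can be inspected and reused directly. Here is a minimal sketch (this cell is an addition; best_score_, best_params_ and best_estimator_ are standard GridSearchCV attributes, while best_mlp is just an illustrative variable name):
# Inspect the outcome of the grid search and reuse the best model
print("Best cross-validation accuracy = ", clf.best_score_)
print("Best hyper-parameters = ", clf.best_params_)
# best_estimator_ is the MLP refitted on the whole training set (refit=True by default)
best_mlp = clf.best_estimator_
print("Test-set accuracy of best MLP = ", best_mlp.score(Xmoon_test, Ymoon_test_OneHot))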
Please FIRST READ the Digits DATASET DESCRIPTION. In this classification problem, there are 10 classes, with a total of 1797 examples (each one being a 64D vector corresponding to an 8x8 pixmap).
Assignment #1: find out what learning hyper-parameters should be modified in order to obtain a satisfying MLP digits classifier
Assignment #2: modify the code below to use cross-validation, and find the best training hyper-parameters and MLP classifier you can for this handwritten digits classification task.
Assignment #3: compute and plot the precision-recall curve (for each class). NB: search the scikit-learn documentation to find the function for that, and then add a code cell that uses it (a hedged sketch is given at the end of this section).
Assignment #4: display the confusion matrix as a prettier and more easily understandable plot (cf. example on https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html); a sketch is also given at the end of this section.
Assignment #5 (optional): plot the first layer of weights as images (see explanations and example code at http://scikit-learn.org/stable/auto_examples/neural_networks/plot_mnist_filters.html)
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
digits = load_digits()
n_samples = len(digits.images)
print("Number_of-examples = ", n_samples)
import matplotlib.pyplot as plt
print("\n Plot of first example")
plt.gray()
plt.matshow(digits.images[0])
print("CLOSE PLOT WINDOW TO CONTINUE")
plt.ioff()
plt.show()
# Flatten the images, to turn the data into a (samples, features) matrix:
data = digits.images.reshape((n_samples, -1))
Xdigits = data
y_digits = digits.target
Xdigits_train, Xdigits_test, y_digits_train, y_digits_test = train_test_split(Xdigits, y_digits, test_size=0.5)
clf = MLPClassifier(hidden_layer_sizes=(10, ), activation='tanh', solver='sgd',
alpha=0.00001, batch_size=4, learning_rate='constant', learning_rate_init=0.01,
power_t=0.5, max_iter=9, shuffle=True, random_state=11, tol=0.00001,
verbose=True, warm_start=False, momentum=0.8, nesterovs_momentum=True,
early_stopping=False, validation_fraction=0.1,
beta_1=0.9, beta_2=0.999, epsilon=1e-08)
print(clf)
# Train the MLP classifier on training dataset
clf.fit(Xdigits_train, y_digits_train)
# Plot the LEARNING CURVE
plt.title("Evolution of TRAINING ERROR during training")
plt.xlabel("Iterations (epochs)")
plt.ylabel("TRAINING ERROR")
plt.plot(clf.loss_curve_)
plt.show()
# Evaluate accuracy on test data
score = clf.score(Xdigits_test, y_digits_test)
print("Accuracy (on test set) = ", score)
y_true, y_pred = y_digits_test, clf.predict(Xdigits_test)
print(classification_report(y_true, y_pred))
# Display CONFUSION MATRIX on TEST set
from sklearn.metrics import confusion_matrix
print("CONFUSION MATRIX below")
confusion_matrix(y_true, y_pred)
Number of examples =  1797

 Plot of first example
CLOSE PLOT WINDOW TO CONTINUE
MLPClassifier(activation='tanh', alpha=1e-05, batch_size=4, hidden_layer_sizes=(10,),
              learning_rate_init=0.01, max_iter=9, momentum=0.8, random_state=11,
              solver='sgd', tol=1e-05, verbose=True)
Iteration 1, loss = 2.04081277
Iteration 2, loss = 1.56330125
Iteration 3, loss = 1.23753682
Iteration 4, loss = 1.15332191
Iteration 5, loss = 0.91743745
Iteration 6, loss = 0.84968576
Iteration 7, loss = 0.79511375
Iteration 8, loss = 0.85375545
Iteration 9, loss = 0.75832428
C:\Users\fabien\anaconda3_2020-07\envs\envML2020\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (9) reached and the optimization hasn't converged yet. % self.max_iter, ConvergenceWarning)
Accuracy (on test set) =  0.7864293659621802

              precision    recall  f1-score   support

           0       0.81      0.99      0.89        91
           1       0.73      0.74      0.73        91
           2       0.91      0.92      0.91        84
           3       0.60      0.91      0.73        89
           4       0.86      0.90      0.88        88
           5       0.78      0.87      0.82        95
           6       0.82      0.91      0.86        94
           7       0.94      0.90      0.92        89
           8       0.57      0.14      0.22        94
           9       0.78      0.61      0.68        84

    accuracy                           0.79       899
   macro avg       0.78      0.79      0.77       899
weighted avg       0.78      0.79      0.76       899

CONFUSION MATRIX below
array([[90,  0,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 0, 67,  0,  0,  2,  3, 11,  0,  0,  8],
       [ 0,  1, 77,  4,  0,  0,  0,  1,  0,  1],
       [ 0,  0,  1, 81,  0,  2,  1,  1,  3,  0],
       [ 2,  1,  0,  0, 79,  0,  4,  1,  0,  1],
       [ 3,  0,  3,  1,  0, 83,  0,  1,  3,  1],
       [ 2,  0,  1,  0,  0,  4, 86,  0,  0,  1],
       [ 0,  0,  2,  0,  3,  0,  0, 80,  4,  0],
       [ 2, 21,  0, 39,  1, 13,  3,  0, 13,  2],
       [12,  2,  1,  9,  6,  2,  0,  1,  0, 51]], dtype=int64)
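As announced in assignments #3 and #4, here is a minimal sketch of one possible solution outline (this cell is an addition, not the official solution): it uses the trained clf above, the scikit-learn function precision_recall_curve for per-class precision-recall curves, and the ConfusionMatrixDisplay class (available in scikit-learn >= 0.22) for a color-coded confusion-matrix plot; variable names such as probas and cm_digits are just illustrative.
########################################################################
# OPTIONAL SKETCH for assignments #3 and #4
########################################################################
from sklearn.metrics import precision_recall_curve, confusion_matrix, ConfusionMatrixDisplay

# Assignment #3: precision-recall curve for each of the 10 classes,
# using the per-class probabilities predicted by the MLP on the test set
probas = clf.predict_proba(Xdigits_test)          # shape (n_samples, 10)
for k in range(10):
    precision, recall, _ = precision_recall_curve(y_digits_test == k, probas[:, k])
    plt.plot(recall, precision, label="class %d" % k)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Per-class precision-recall curves (test set)")
plt.legend(fontsize='small')
plt.show()

# Assignment #4: confusion matrix displayed as a color-coded plot
cm_digits = confusion_matrix(y_digits_test, clf.predict(Xdigits_test))
ConfusionMatrixDisplay(confusion_matrix=cm_digits, display_labels=np.arange(10)).plot(cmap=plt.cm.Blues)
plt.show()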