diff --git a/chapter05/mlpClass.m b/chapter05/mlpClass.m
new file mode 100644
index 0000000..0a5d645
--- /dev/null
+++ b/chapter05/mlpClass.m
@@ -0,0 +1,63 @@
+function [model, L] = mlpClass(X,y,k,lambda)
+% Train a multilayer perceptron neural network for classification with backpropagation
+% logistic activation is used for hidden layers; softmax for the output layer.
+% Input:
+%   X: d x n data matrix
+%   y: 1 x n label vector
+%   k: T x 1 vector to specify number of hidden nodes in each layer
+%   lambda: regularization parameter
+% Output:
+%   model: model structure
+%   L: (regularized cross entropy) loss
+% Written by Mo Chen (sth4nth@gmail.com).
+if nargin < 4
+    lambda = 1e-2;
+end
+eta = 1e-3;
+tol = 1e-4;
+maxiter = 50000;
+L = inf(1,maxiter);
+
+Y = sparse(y,1:numel(y),1);
+k = [size(X,1);k(:);size(Y,1)];
+T = numel(k)-1;
+W = cell(T,1);
+b = cell(T,1);
+for t = 1:T
+    W{t} = randn(k(t),k(t+1));
+    b{t} = randn(k(t+1),1);
+end
+R = cell(T,1);
+Z = cell(T+1,1);
+Z{1} = X;
+for iter = 2:maxiter
+%     forward
+    for t = 1:T-1
+        Z{t+1} = sigmoid(W{t}'*Z{t}+b{t});   % 5.10 5.113
+    end
+    Z{T+1} = softmax(W{T}'*Z{T}+b{T});
+
+%     loss
+    E = Z{T+1};
+    Wn = cellfun(@(x) dot(x(:),x(:)),W);     % |W|^2
+    L(iter) = -dot(Y(:),log(E(:)))+0.5*lambda*sum(Wn);
+    if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end
+
+%     backward
+    R{T} = Z{T+1}-Y;
+    for t = T-1:-1:1
+        df = Z{t+1}.*(1-Z{t+1});             % h'(a)
+        R{t} = df.*(W{t+1}*R{t+1});          % 5.66
+    end
+
+%     gradient descent
+    for t=1:T
+        dW = Z{t}*R{t}'+lambda*W{t};         % 5.67
+        db = sum(R{t},2);
+        W{t} = W{t}-eta*dW;                  % 5.43
+        b{t} = b{t}-eta*db;
+    end
+end
+L = L(2:iter);
+model.W = W;
+model.b = b;
diff --git a/chapter05/mlpClassPred.m b/chapter05/mlpClassPred.m
new file mode 100644
index 0000000..0c94742
--- /dev/null
+++ b/chapter05/mlpClassPred.m
@@ -0,0 +1,19 @@
+function [y, P] = mlpClassPred(model, X)
+% Multilayer perceptron classification prediction
+% logistic activation is used for hidden layers; softmax for the output layer.
+% Input:
+%   model: model structure
+%   X: d x n data matrix
+% Output:
+%   y: 1 x n label vector
+%   P: k x n probability matrix
+% Written by Mo Chen (sth4nth@gmail.com).
+W = model.W;
+b = model.b;
+T = length(W);
+Z = X;
+for t = 1:T-1
+    Z = sigmoid(W{t}'*Z+b{t});
+end
+P = softmax(W{T}'*Z+b{T});
+[~,y] = max(P,[],1);
\ No newline at end of file
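Note that mlpClass.m and mlpClassPred.m call sigmoid and softmax, which are expected to be helper functions already on the MATLAB path (neither is a base MATLAB built-in for plain numeric matrices); the % 5.xx comments appear to reference equation numbers in PRML chapter 5. For readers running these two files standalone, a minimal sketch of what the two calls need to compute is given below, each saved in its own file. This is an assumption about the helpers' behavior, not the toolbox's own implementation.

function s = sigmoid(a)
% Logistic sigmoid, applied elementwise (assumed helper, not part of this patch).
s = 1./(1+exp(-a));

function p = softmax(a)
% Column-wise softmax (assumed helper, not part of this patch);
% subtract the column max before exponentiating for numerical stability.
a = bsxfun(@minus, a, max(a,[],1));
p = exp(a);
p = bsxfun(@rdivide, p, sum(p,1));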
diff --git a/chapter05/mlpReg.m b/chapter05/mlpReg.m
index caf42d1..d3759eb 100644
--- a/chapter05/mlpReg.m
+++ b/chapter05/mlpReg.m
@@ -1,22 +1,24 @@
-function [model, L] = mlpReg(X,Y,k,lambda)
-% Train a multilayer perceptron neural network
+function [model, L] = mlpReg(X,y,k,lambda)
+% Train a multilayer perceptron neural network for regression with backpropagation
+% tanh activation is used for hidden layers; the output layer is linear.
 % Input:
 %   X: d x n data matrix
-%   Y: p x n response matrix
+%   y: p x n response matrix
 %   k: T x 1 vector to specify number of hidden nodes in each layer
 %   lambda: regularization parameter
 % Ouput:
 %   model: model structure
-%   L: loss
+%   L: (regularized least squares) loss
 % Written by Mo Chen (sth4nth@gmail.com).
 if nargin < 4
     lambda = 1e-2;
 end
-eta = 1e-3;
+eta = 1e-5;
+tol = 1e-5;
 maxiter = 50000;
 L = inf(1,maxiter);
 
-k = [size(X,1);k(:);size(Y,1)];
+k = [size(X,1);k(:);size(y,1)];
 T = numel(k)-1;
 W = cell(T,1);
 b = cell(T,1);
@@ -30,30 +32,31 @@
 for iter = 2:maxiter
 %     forward
     for t = 1:T-1
-        Z{t+1} = tanh(W{t}'*Z{t}+b{t});
+        Z{t+1} = tanh(W{t}'*Z{t}+b{t});      % 5.10 5.113
     end
-    Z{T+1} = W{T}'*Z{T}+b{T};
+    Z{T+1} = W{T}'*Z{T}+b{T};                % 5.114
 
 %     loss
-    E = Z{T+1}-Y;
+    E = Z{T+1}-y;
     Wn = cellfun(@(x) dot(x(:),x(:)),W);     % |W|^2
     L(iter) = dot(E(:),E(:))+lambda*sum(Wn);
-
+    if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end
+
 %     backward
-    R{T} = E;    % delta
+    R{T} = E;
     for t = T-1:-1:1
         df = 1-Z{t+1}.^2;                    % h'(a)
-        R{t} = df.*(W{t+1}*R{t+1});          % delta
+        R{t} = df.*(W{t+1}*R{t+1});          % 5.66
     end
 
 %     gradient descent
     for t=1:T
-        dW = Z{t}*R{t}'+lambda*W{t};
+        dW = Z{t}*R{t}'+lambda*W{t};         % 5.67
         db = sum(R{t},2);
-        W{t} = W{t}-eta*dW;
+        W{t} = W{t}-eta*dW;                  % 5.43
         b{t} = b{t}-eta*db;
     end
 end
-L = L(1,2:iter);
+L = L(2:iter);
 model.W = W;
 model.b = b;
diff --git a/chapter05/mlpRegPred.m b/chapter05/mlpRegPred.m
index e3bba3f..d2e67f9 100644
--- a/chapter05/mlpRegPred.m
+++ b/chapter05/mlpRegPred.m
@@ -1,5 +1,6 @@
 function Y = mlpRegPred(model, X)
-% Multilayer perceptron prediction
+% Multilayer perceptron regression prediction
+% tanh activation is used for hidden layers; the output layer is linear.
 % Input:
 %   model: model structure
 %   X: d x n data matrix
diff --git a/demo/ch05/mlp_demo.m b/demo/ch05/mlp_demo.m
index 75c170a..70b57b3 100644
--- a/demo/ch05/mlp_demo.m
+++ b/demo/ch05/mlp_demo.m
@@ -1,15 +1,32 @@
-clear; close all;
+clear; close all
+%% Regression
 n = 200;
 x = linspace(0,2*pi,n);
 y = sin(x);
-k = [3,4]; % two hidden layers with 3 and 4 hidden nodes
+h = [10,6]; % two hidden layers with 10 and 6 neurons
 lambda = 1e-2;
-[model, L] = mlpReg(x,y,k);
+[model, L] = mlpReg(x,y,h,lambda);
 t = mlpRegPred(model,x);
 plot(L);
 
 figure;
 hold on
 plot(x,y,'.');
 plot(x,t);
-hold off
\ No newline at end of file
+hold off
+%% Classification
+clear;
+k = 2;
+n = 200;
+[X,y] = kmeansRnd(2,k,n);
+figure;
+plotClass(X,y);
+
+h = 3;
+lambda = 1e-2;
+[model, L] = mlpClass(X,y,h,lambda);
+[t,p] = mlpClassPred(model,X);
+figure;
+plotClass(X,t);
+figure;
+plot(L);
\ No newline at end of file
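The classification half of mlp_demo.m relies on the toolbox helpers kmeansRnd and plotClass. A quick smoke test of the new classification code that avoids them might look like the sketch below, assuming mlpClass.m, mlpClassPred.m, and the activation helpers above are on the path; the data and layer size here are made up for illustration.

% Hypothetical check: two well-separated Gaussian blobs with labels 1 and 2.
n = 100;
X = [randn(2,n)-2, randn(2,n)+2];       % d x n data matrix (d = 2)
y = [ones(1,n), 2*ones(1,n)];           % 1 x n label vector
[model, L] = mlpClass(X, y, 3, 1e-2);   % one hidden layer with 3 nodes
t = mlpClassPred(model, X);
fprintf('training accuracy: %.2f\n', mean(t == y));
plot(L);                                % regularized cross-entropy per iteration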