
Commit

Merge pull request #4 from seldon-code/refactor
Refactor
MSallermann authored Nov 18, 2023
2 parents 1d50a8a + 19ee4de commit 37a4e51
Showing 9 changed files with 145 additions and 98 deletions.
5 changes: 4 additions & 1 deletion examples/mnist/main.cpp
@@ -67,7 +67,10 @@ int main()
// No. of trainable params
network.summary();
network.loss_tol = 2e-2;
network.fit( x_train, y_train, 1000, 0.01, true );

auto opt = Robbie::Optimizers::StochasticGradientDescent( 0.01 );
network.set_optimizer( &opt );
network.fit( x_train, y_train, 1000, true );

fmt::print( "Loss on test set = {:.3e}\n", network.loss( x_test, y_test ) );

37 changes: 29 additions & 8 deletions include/fc_layer.hpp
@@ -13,13 +13,18 @@ class FCLayer : public Layer<scalar>
{
protected:
Matrix<scalar> weights;
Vector<scalar> bias;
Matrix<scalar> weights_error;

Matrix<scalar> bias;
Matrix<scalar> bias_error;

public:
FCLayer( size_t input_size, size_t output_size )
: Layer<scalar>( input_size, output_size ),
weights( Matrix<scalar>( output_size, input_size ) ),
bias( Vector<scalar>( output_size ) )
weights_error( Matrix<scalar>( output_size, input_size ) ),
bias( Vector<scalar>( output_size ) ),
bias_error( Vector<scalar>( output_size ) )
{
auto rd = std::random_device();
auto gen = std::mt19937( rd() );
@@ -39,18 +44,20 @@
// returns output for a given input
Matrix<scalar> forward_propagation( const Matrix<scalar> & input_data ) override
{
this->input = input_data;
this->output = ( weights * input_data ).colwise() + bias;
this->input = input_data;
this->output
= ( weights * input_data ).colwise()
+ bias.col(
0 ); // We use .col(0), so the bias can be treated as a matrix with fixed columns at compile time
return this->output;
}

// computes dE/dW, dE/dB for a given output_error=dE/dY. Returns input_error=dE/dX.
Matrix<scalar> backward_propagation( const Matrix<scalar> & output_error ) override
{
auto input_error = weights.transpose() * output_error;
Matrix<scalar> weights_error = output_error * this->input.transpose() / output_error.cols();
Vector<scalar> bias_error = ( output_error ).rowwise().mean();
this->opt->optimize( &weights, &weights_error, &bias, &bias_error );
auto input_error = weights.transpose() * output_error;
weights_error = output_error * this->input.transpose() / output_error.cols();
bias_error = ( output_error ).rowwise().mean();
return input_error;
}

@@ -60,6 +67,20 @@ class FCLayer : public Layer<scalar>
return this->weights.size() + this->bias.size();
}

// Get ref to trainable parameters
std::vector<Eigen::Ref<Matrix<scalar>>> variables() override
{
return std::vector<Eigen::Ref<Matrix<scalar>>>{ Eigen::Ref<Matrix<scalar>>( weights ),
Eigen::Ref<Matrix<scalar>>( bias ) };
};

// Get ref to gradients of the trainable parameters
std::vector<Eigen::Ref<Matrix<scalar>>> gradients() override
{
return std::vector<Eigen::Ref<Matrix<scalar>>>{ Eigen::Ref<Matrix<scalar>>( weights_error ),
Eigen::Ref<Matrix<scalar>>( bias_error ) };
};

// Access the current weights
Matrix<scalar> get_weights()
{
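
The variables() and gradients() accessors return Eigen::Ref views rather than copies, which is what lets an optimizer write updates straight into the layer's storage. A minimal standalone sketch of that behaviour (not part of this commit; plain Eigen::MatrixXd stands in for Robbie::Matrix<scalar>):

#include <Eigen/Dense>
#include <iostream>

int main()
{
    Eigen::MatrixXd weights = Eigen::MatrixXd::Ones( 2, 2 );
    Eigen::Ref<Eigen::MatrixXd> view( weights ); // shares storage with 'weights'

    view -= 0.5 * Eigen::MatrixXd::Ones( 2, 2 ); // the kind of in-place step SGD performs
    std::cout << weights << "\n";                // prints 0.5 everywhere: 'weights' itself changed
}
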
32 changes: 18 additions & 14 deletions include/layer.hpp
@@ -21,21 +21,10 @@ class Layer
public:
Layer() = default;
Layer( std::optional<size_t> input_size, std::optional<size_t> output_size )
: input_size( input_size ),
output_size( output_size ),
opt( std::make_unique<Optimizers::StochasticGradientDescent<scalar>>( 0.1 ) )
: input_size( input_size ), output_size( output_size )
{
}

std::unique_ptr<Optimizers::Optimizer<scalar>> opt;

// TODO: figure out how to implement copy constructor
// Layer( const Layer & l )
// : input( l.input ), output( l.output ), input_size( l.input_size ), output_size( l.output_size )
// {
// opt = std::make_unique<Optimizer<scalar>>( l.opt );
// }

virtual std::string name() = 0;

std::optional<size_t> get_input_size()
@@ -60,8 +49,23 @@
// computes dE/dX for a given dE/dY (and update parameters if any)
virtual Matrix<scalar> backward_propagation( const Matrix<scalar> & output_error ) = 0;

// Get trainable parameters
virtual size_t get_trainable_params() = 0;
// Get number of trainable parameters
virtual size_t get_trainable_params()
{
return 0;
};

// Get ref to trainable parameters
virtual std::vector<Eigen::Ref<Matrix<scalar>>> variables()
{
return {}; // Standard behaviour is to return an empty vector, i.e. no trainable params
};

// Get ref to gradients of parameters
virtual std::vector<Eigen::Ref<Matrix<scalar>>> gradients()
{
return {};
};

virtual ~Layer() = default;
};
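
With get_trainable_params(), variables() and gradients() no longer pure virtual, a layer without trainable parameters only has to implement name() and the two propagation methods. A hypothetical parameter-free layer illustrating this (not part of this commit; it assumes Layer lives in the Robbie namespace and exposes the protected input/output members that FCLayer uses):

#include "layer.hpp"

#include <optional>
#include <string>

template<typename scalar>
class IdentityLayer : public Robbie::Layer<scalar>
{
public:
    IdentityLayer() : Robbie::Layer<scalar>( std::nullopt, std::nullopt ) {}

    std::string name() override
    {
        return "IdentityLayer";
    }

    // y = x
    Robbie::Matrix<scalar> forward_propagation( const Robbie::Matrix<scalar> & input_data ) override
    {
        this->input  = input_data;
        this->output = input_data;
        return this->output;
    }

    // dE/dX = dE/dY for the identity map
    Robbie::Matrix<scalar> backward_propagation( const Robbie::Matrix<scalar> & output_error ) override
    {
        return output_error;
    }

    // get_trainable_params(), variables() and gradients() fall back to the new
    // defaults: zero parameters and empty vectors, so the optimizer skips this layer.
};
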
39 changes: 33 additions & 6 deletions include/network.hpp
@@ -9,6 +9,7 @@
#include <cstddef>
#include <memory>
#include <optional>
#include <stdexcept>
#include <type_traits>
#include <vector>

@@ -64,18 +65,40 @@ class Network
return loss;
}

void set_optimizer( Optimizers::Optimizer<scalar> * opt )
{
this->opt = opt;
}

void register_optimizer_variables()
{
this->opt->clear();

for( auto & layer : layers )
{
for( size_t iv = 0; iv < layer->variables().size(); iv++ )
{
auto v = layer->variables()[iv];
auto g = layer->gradients()[iv];

this->opt->register_variable( v, g );
}
}
}

void
fit( const std::vector<Matrix<scalar>> & x_train, const std::vector<Matrix<scalar>> & y_train, size_t epochs,
scalar learning_rate, bool print_progress = false )
bool print_progress = false )
{

if( this->opt == nullptr )
throw std::runtime_error( "Optimizer has not been set!" );

register_optimizer_variables();

auto n_samples = x_train.size();
auto batch_size = x_train[0].cols();

for( auto & l : layers )
{
l->opt = std::move( std::make_unique<Optimizers::StochasticGradientDescent<scalar>>( learning_rate ) );
}

fmt::print(
"Fitting with {} samples of batchsize {} ({} total)\n\n", n_samples, batch_size, n_samples * batch_size );

@@ -86,6 +109,7 @@
{
auto t_epoch_start = std::chrono::high_resolution_clock::now();
err = 0;

for( size_t j = 0; j < n_samples; j++ )
{
// forward propagation
@@ -104,6 +128,8 @@
auto & layer = layers[i_layer];
error = layer->backward_propagation( error );
}

opt->optimize();
}

auto t_epoch_end = std::chrono::high_resolution_clock::now();
@@ -167,6 +193,7 @@ class Network

private:
std::vector<std::unique_ptr<Layer<scalar>>> layers;
Optimizers::Optimizer<scalar> * opt = nullptr;
};

} // namespace Robbie
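
Since Network now holds only a raw Optimizer pointer, the optimizer is constructed by the caller and must outlive fit(); register_optimizer_variables() then wires every layer's parameter/gradient pairs into it. A sketch of that wiring done by hand for a single layer (not part of this commit; it assumes FCLayer and the optimizers live in the Robbie namespace, as the other files in this diff suggest):

#include "fc_layer.hpp"
#include "optimizers.hpp"

#include <cstddef>

int main()
{
    Robbie::FCLayer<double> layer( 4, 3 );
    auto opt = Robbie::Optimizers::StochasticGradientDescent<double>( 0.01 );

    // What Network::register_optimizer_variables() does for each layer:
    // the refs returned by variables()/gradients() point into the layer's own storage.
    for( std::size_t iv = 0; iv < layer.variables().size(); iv++ )
    {
        opt.register_variable( layer.variables()[iv], layer.gradients()[iv] );
    }

    // After a backward pass has filled the gradients, a single call
    // updates the layer's weights and bias in place:
    // opt.optimize();
}
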
111 changes: 50 additions & 61 deletions include/optimizers.hpp
@@ -1,33 +1,50 @@
#pragma once
#include "defines.hpp"
#include <fmt/ostream.h>
#include <cstddef>
#include <optional>
#include <stdexcept>
#include <vector>

namespace Robbie::Optimizers
{

template<typename scalar>
class Optimizer
{
protected:
std::vector<Eigen::Ref<Robbie::Matrix<scalar>>> variables;
std::vector<Eigen::Ref<Robbie::Matrix<scalar>>> gradients;

public:
Optimizer() = default;
virtual ~Optimizer() = default;
Optimizer() = default;

virtual void clear()
{
variables.clear();
gradients.clear();
}

virtual void register_variable( Eigen::Ref<Robbie::Matrix<scalar>> var, Eigen::Ref<Robbie::Matrix<scalar>> grad )
{
if( !( ( var.rows() == grad.rows() ) && ( var.cols() == grad.cols() ) ) )
{
throw std::runtime_error( "Tried to use variable and gradient of different shapes!" );
}
variables.push_back( var );
gradients.push_back( grad );
};

virtual void optimize(
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient )
= 0;
virtual ~Optimizer() = default;
virtual void optimize() = 0;
};

template<typename scalar>
class DoNothing : public Optimizer<scalar>
{
public:
DoNothing() = default;

virtual void optimize(
[[maybe_unused]] Matrix<scalar> * matrix_variable, [[maybe_unused]] Matrix<scalar> * matrix_gradient,
[[maybe_unused]] Vector<scalar> * vector_variable, [[maybe_unused]] Vector<scalar> * vector_gradient ){};
virtual void optimize(){};
};

template<typename scalar>
@@ -40,18 +57,11 @@ class StochasticGradientDescent : public Optimizer<scalar>
public:
StochasticGradientDescent( scalar learning_rate ) : learning_rate( learning_rate ) {}

void optimize(
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient ) override
void optimize() override
{
if( matrix_variable != nullptr )
{
*( matrix_variable ) -= learning_rate * ( *matrix_gradient );
}

if( vector_variable != nullptr )
for( size_t iv = 0; iv < this->variables.size(); iv++ )
{
*( vector_variable ) -= learning_rate * ( *vector_gradient );
this->variables[iv] -= learning_rate * this->gradients[iv];
}
}
};
@@ -67,70 +77,49 @@ class Adam : public Optimizer<scalar>
scalar epsilon = 1e-8;

// first moments
Matrix<scalar> m_matrix;
Vector<scalar> m_vector;

std::vector<Matrix<scalar>> m_matrix;
// second moments
Matrix<scalar> v_matrix;
Vector<scalar> v_vector;

std::vector<Matrix<scalar>> v_matrix;
size_t timestep = 0;

void initialize( Matrix<scalar> * matrix_variable, Vector<scalar> * vector_variable )
public:
void register_variable( Eigen::Ref<Robbie::Matrix<scalar>> var, Eigen::Ref<Robbie::Matrix<scalar>> grad ) override
{
if( matrix_variable != nullptr )
{
m_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
v_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
}
Optimizer<scalar>::register_variable( var, grad );
m_matrix.push_back( Matrix<scalar>::Zero( var.rows(), var.cols() ) );
v_matrix.push_back( Matrix<scalar>::Zero( var.rows(), var.cols() ) );
}

if( vector_variable != nullptr )
{
m_vector = Vector<scalar>::Zero( vector_variable->size() );
v_vector = Vector<scalar>::Zero( vector_variable->size() );
}
void clear() override
{
Optimizer<scalar>::clear();
m_matrix.clear();
v_matrix.clear();
timestep = 0;
}

public:
Adam() = default;
Adam( scalar alpha ) : alpha( alpha ) {}
Adam( scalar alpha, scalar beta1, scalar beta2, scalar epsilon )
: alpha( alpha ), beta1( beta1 ), beta2( beta2 ), epsilon( epsilon )
{
}

void optimize(
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient ) override
void optimize() override
{
if( timestep == 0 )
{
initialize( matrix_variable, vector_variable );
}

scalar beta_1_t = std::pow( beta1, timestep + 1 );
scalar beta_2_t = std::pow( beta2, timestep + 1 );
scalar alpha_t = alpha * std::sqrt( 1.0 - beta_2_t ) / ( 1.0 - beta_1_t );

if( matrix_variable != nullptr )
for( size_t iv = 0; iv < this->variables.size(); iv++ )
{
// Update first moments
m_matrix = m_matrix * beta1 + ( 1.0 - beta1 ) * ( *matrix_gradient );
m_matrix[iv] = m_matrix[iv] * beta1 + ( 1.0 - beta1 ) * ( this->gradients[iv] );
// Update second moments
v_matrix = v_matrix * beta2 + ( 1.0 - beta2 ) * matrix_gradient->array().pow( 2 ).matrix();
*matrix_variable -= alpha_t * ( m_matrix.array() / ( v_matrix.array().sqrt() + epsilon ) ).matrix();
v_matrix[iv] = v_matrix[iv] * beta2 + ( 1.0 - beta2 ) * this->gradients[iv].array().pow( 2 ).matrix();
this->variables[iv]
-= alpha_t * ( m_matrix[iv].array() / ( v_matrix[iv].array().sqrt() + epsilon ) ).matrix();
}

if( vector_variable != nullptr )
{
// Update first moments
m_vector = m_vector * beta1 + ( 1.0 - beta1 ) * ( *vector_gradient );
// Update second moments
v_vector = v_vector * beta2 + ( 1.0 - beta2 ) * vector_gradient->array().pow( 2 ).matrix();
*vector_variable -= alpha_t * ( m_vector.array() / ( v_vector.array().sqrt() + epsilon ) ).matrix();
}

timestep++;
}
};

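
After the refactor, an optimizer knows nothing about layers: it only holds the Eigen::Ref views registered with it and updates them in place on optimize(). A small sketch driving the refactored StochasticGradientDescent directly (not part of this commit; it assumes Robbie::Matrix<scalar> is a dynamic Eigen matrix, as defined in defines.hpp):

#include "optimizers.hpp"

#include <iostream>

int main()
{
    Robbie::Matrix<double> w = Robbie::Matrix<double>::Constant( 2, 2, 1.0 );
    Robbie::Matrix<double> g = Robbie::Matrix<double>::Constant( 2, 2, 0.5 );

    auto opt = Robbie::Optimizers::StochasticGradientDescent<double>( 0.1 );
    opt.register_variable( w, g ); // keeps Eigen::Ref views; throws if the shapes differ
    opt.optimize();                // w -= 0.1 * g, applied through the stored view

    std::cout << w << "\n";        // every entry is now 0.95
}

Adam follows the same registration pattern but additionally keeps one first-moment and one second-moment matrix per registered variable, both of which are reset together with the timestep in clear().
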
3 changes: 2 additions & 1 deletion include/robbie.hpp
@@ -6,4 +6,5 @@
#include "fc_layer.hpp"
#include "layer.hpp"
#include "loss_functions.hpp"
#include "network.hpp"
#include "network.hpp"
#include "optimizers.hpp"

