From ae0b3294def374d839669ec95cd68d27bca1e663 Mon Sep 17 00:00:00 2001
From: Moritz Sallermann
Date: Thu, 16 Nov 2023 18:17:06 +0000
Subject: [PATCH] Started optimizer refactoring

---
 include/fc_layer.hpp                |  37 ++++--
 include/layer.hpp                   |  32 +++---
 include/network.hpp                 |  37 +++++-
 include/optimizers.hpp              | 172 +++++++++++++---------------
 test/test_backward_propagations.cpp |   4 -
 5 files changed, 161 insertions(+), 121 deletions(-)

diff --git a/include/fc_layer.hpp b/include/fc_layer.hpp
index 584d143..bfd2668 100644
--- a/include/fc_layer.hpp
+++ b/include/fc_layer.hpp
@@ -13,13 +13,18 @@ class FCLayer : public Layer<scalar>
 {
 protected:
     Matrix<scalar> weights;
-    Vector<scalar> bias;
+    Matrix<scalar> weights_error;
+
+    Matrix<scalar> bias;
+    Matrix<scalar> bias_error;
 
 public:
     FCLayer( size_t input_size, size_t output_size )
         : Layer<scalar>( input_size, output_size ),
          weights( Matrix<scalar>( output_size, input_size ) ),
-          bias( Vector<scalar>( output_size ) )
+          weights_error( Matrix<scalar>( output_size, input_size ) ),
+          bias( Vector<scalar>( output_size ) ),
+          bias_error( Vector<scalar>( output_size ) )
     {
         auto rd  = std::random_device();
         auto gen = std::mt19937( rd() );
@@ -39,18 +44,20 @@ class FCLayer : public Layer<scalar>
     // returns output for a given input
     Matrix<scalar> forward_propagation( const Matrix<scalar> & input_data ) override
     {
-        this->input  = input_data;
-        this->output = ( weights * input_data ).colwise() + bias;
+        this->input = input_data;
+        this->output
+            = ( weights * input_data ).colwise()
+              + bias.col(
+                  0 ); // We use .col(0), so the bias can be treated as a matrix with fixed columns at compile time
         return this->output;
     }
 
     // computes dE/dW, dE/dB for a given output_error=dE/dY. Returns input_error=dE/dX.
     Matrix<scalar> backward_propagation( const Matrix<scalar> & output_error ) override
     {
-        auto input_error             = weights.transpose() * output_error;
-        Matrix<scalar> weights_error = output_error * this->input.transpose() / output_error.cols();
-        Vector<scalar> bias_error    = ( output_error ).rowwise().mean();
-        this->opt->optimize( &weights, &weights_error, &bias, &bias_error );
+        auto input_error = weights.transpose() * output_error;
+        weights_error    = output_error * this->input.transpose() / output_error.cols();
+        bias_error       = ( output_error ).rowwise().mean();
         return input_error;
     }
 
@@ -60,6 +67,20 @@ class FCLayer : public Layer<scalar>
         return this->weights.size() + this->bias.size();
     }
 
+    // Get ref to trainable parameters
+    std::vector<Eigen::Ref<Matrix<scalar>>> variables() override
+    {
+        return std::vector<Eigen::Ref<Matrix<scalar>>>{ Eigen::Ref<Matrix<scalar>>( weights ),
+                                                        Eigen::Ref<Matrix<scalar>>( bias ) };
+    };
+
+    // Get ref to gradients of parameters
+    std::vector<Eigen::Ref<Matrix<scalar>>> gradients() override
+    {
+        return std::vector<Eigen::Ref<Matrix<scalar>>>{ Eigen::Ref<Matrix<scalar>>( weights_error ),
+                                                        Eigen::Ref<Matrix<scalar>>( bias_error ) };
+    };
+
     // Access the current weights
     Matrix<scalar> get_weights()
     {
diff --git a/include/layer.hpp b/include/layer.hpp
index c34af4a..9fe0677 100644
--- a/include/layer.hpp
+++ b/include/layer.hpp
@@ -21,21 +21,10 @@ class Layer
 public:
     Layer() = default;
     Layer( std::optional<size_t> input_size, std::optional<size_t> output_size )
-        : input_size( input_size ),
-          output_size( output_size ),
-          opt( std::make_unique<Optimizers::StochasticGradientDescent<scalar>>( 0.1 ) )
+        : input_size( input_size ), output_size( output_size )
     {
     }
 
-    std::unique_ptr<Optimizers::Optimizer<scalar>> opt;
-
-    // TODO: figure out how to implement copy constructor
-    // Layer( const Layer & l )
-    //     : input( l.input ), output( l.output ), input_size( l.input_size ), output_size( l.output_size )
-    // {
-    //     opt = std::make_unique<Optimizers::Optimizer<scalar>>( l.opt );
-    // }
-
     virtual std::string name() = 0;
 
     std::optional<size_t> get_input_size()
@@ -60,8 +49,23 @@ class Layer
     // computes dE/dX for a given dE/dY (and update parameters if any)
     virtual Matrix<scalar> backward_propagation( const Matrix<scalar> & output_error ) = 0;
 
-    // Get trainable parameters
-    virtual size_t get_trainable_params() = 0;
+    // Get number of trainable parameters
+    virtual size_t get_trainable_params()
+    {
+        return 0;
+    };
+
+    // Get ref to trainable parameters
+    virtual std::vector<Eigen::Ref<Matrix<scalar>>> variables()
+    {
+        return {}; // Standard behaviour is to return an empty vector, i.e. no trainable params
+    };
+
+    // Get ref to gradients of parameters
+    virtual std::vector<Eigen::Ref<Matrix<scalar>>> gradients()
+    {
+        return {};
+    };
 
     virtual ~Layer() = default;
 };
diff --git a/include/network.hpp b/include/network.hpp
index 0f7c14e..84dd16d 100644
--- a/include/network.hpp
+++ b/include/network.hpp
@@ -22,6 +22,9 @@ class Network
 public:
     std::optional<scalar> loss_tol = std::nullopt;
 
+    std::unique_ptr<Optimizers::Optimizer<scalar>> opt
+        = std::make_unique<Optimizers::StochasticGradientDescent<scalar>>( 0.00001 );
+
     Network() = default;
 
     template
@@ -64,18 +67,39 @@ class Network
         return loss;
     }
 
+    // void set_optimizer( const Optimizers::Optimizer<scalar> & opt )
+    // {
+    //     this->opt = s
+    // }
+
+    void register_optimizer_variables()
+    {
+        this->opt->variables.clear();
+        this->opt->gradients.clear();
+
+        for( auto & layer : layers )
+        {
+            for( auto & v : layer->variables() )
+            {
+                this->opt->variables.push_back( v );
+            }
+
+            for( auto & g : layer->gradients() )
+            {
+                this->opt->gradients.push_back( g );
+            }
+        }
+    }
+
     void fit(
         const std::vector<Matrix<scalar>> & x_train, const std::vector<Matrix<scalar>> & y_train, size_t epochs,
         scalar learning_rate, bool print_progress = false )
     {
+        register_optimizer_variables();
+
         auto n_samples  = x_train.size();
         auto batch_size = x_train[0].cols();
 
-        for( auto & l : layers )
-        {
-            l->opt = std::move( std::make_unique<Optimizers::StochasticGradientDescent<scalar>>( learning_rate ) );
-        }
-
         fmt::print(
             "Fitting with {} samples of batchsize {} ({} total)\n\n", n_samples, batch_size,
             n_samples * batch_size );
@@ -86,6 +110,7 @@ class Network
         {
             auto t_epoch_start = std::chrono::high_resolution_clock::now();
             err                = 0;
+
             for( size_t j = 0; j < n_samples; j++ )
             {
                 // forward propagation
@@ -104,6 +129,8 @@ class Network
                     auto & layer = layers[i_layer];
                     error        = layer->backward_propagation( error );
                 }
+
+                opt->optimize();
             }
 
             auto t_epoch_end = std::chrono::high_resolution_clock::now();
diff --git a/include/optimizers.hpp b/include/optimizers.hpp
index f6aeeb6..f8fd272 100644
--- a/include/optimizers.hpp
+++ b/include/optimizers.hpp
@@ -1,7 +1,9 @@
 #pragma once
 #include "defines.hpp"
 #include
+#include
 #include
+#include
 
 namespace Robbie::Optimizers
 {
@@ -13,10 +15,10 @@ class Optimizer
     Optimizer()          = default;
     virtual ~Optimizer() = default;
 
-    virtual void optimize(
-        Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
-        Vector<scalar> * vector_gradient )
-        = 0;
+    std::vector<Eigen::Ref<Matrix<scalar>>> variables;
+    std::vector<Eigen::Ref<Matrix<scalar>>> gradients;
+
+    virtual void optimize() = 0;
 };
 
 template<typename scalar>
@@ -24,10 +26,7 @@ class DoNothing : public Optimizer<scalar>
 {
 public:
     DoNothing() = default;
-
-    virtual void optimize(
-        [[maybe_unused]] Matrix<scalar> * matrix_variable, [[maybe_unused]] Matrix<scalar> * matrix_gradient,
-        [[maybe_unused]] Vector<scalar> * vector_variable, [[maybe_unused]] Vector<scalar> * vector_gradient ){};
+    virtual void optimize(){};
 };
 
 template<typename scalar>
@@ -40,18 +39,11 @@ class StochasticGradientDescent : public Optimizer<scalar>
 public:
     StochasticGradientDescent( scalar learning_rate ) : learning_rate( learning_rate ) {}
 
-    void optimize(
-        Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
-        Vector<scalar> * vector_gradient ) override
+    void optimize() override
     {
-        if( matrix_variable != nullptr )
-        {
-            *( matrix_variable ) -= learning_rate * ( *matrix_gradient );
-        }
-
-        if( vector_variable != nullptr )
+        for( size_t iv = 0; iv < this->variables.size(); iv++ )
         {
-            *( vector_variable ) -= learning_rate * ( *vector_gradient );
+            this->variables[iv] -= learning_rate * this->gradients[iv];
         }
     }
 };
 
@@ -60,78 +52,78 @@
 template<typename scalar>
 class Adam : public Optimizer<scalar>
 {
-private:
-    scalar alpha   = 0.001;
-    scalar beta1   = 0.9;
-    scalar beta2   = 0.999;
-    scalar epsilon = 1e-8;
-
-    // first moments
-    Matrix<scalar> m_matrix;
-    Vector<scalar> m_vector;
-
-    // second moments
-    Matrix<scalar> v_matrix;
-    Vector<scalar> v_vector;
-
-    size_t timestep = 0;
-
-    void initialize( Matrix<scalar> * matrix_variable, Vector<scalar> * vector_variable )
-    {
-        if( matrix_variable != nullptr )
-        {
-            m_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
-            v_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
-        }
-
-        if( vector_variable != nullptr )
-        {
-            m_vector = Vector<scalar>::Zero( vector_variable->size() );
-            v_vector = Vector<scalar>::Zero( vector_variable->size() );
-        }
-    }
-
-public:
-    Adam() = default;
-    Adam( scalar alpha ) : alpha( alpha ) {}
-    Adam( scalar alpha, scalar beta1, scalar beta2, scalar epsilon )
-        : alpha( alpha ), beta1( beta1 ), beta2( beta2 ), epsilon( epsilon )
-    {
-    }
-
-    void optimize(
-        Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
-        Vector<scalar> * vector_gradient ) override
-    {
-        if( timestep == 0 )
-        {
-            initialize( matrix_variable, vector_variable );
-        }
-
-        scalar beta_1_t = std::pow( beta1, timestep + 1 );
-        scalar beta_2_t = std::pow( beta2, timestep + 1 );
-        scalar alpha_t  = alpha * std::sqrt( 1.0 - beta_2_t ) / ( 1.0 - beta_1_t );
-
-        if( matrix_variable != nullptr )
-        {
-            // Update first moments
-            m_matrix = m_matrix * beta1 + ( 1.0 - beta1 ) * ( *matrix_gradient );
-            // Update second moments
-            v_matrix = v_matrix * beta2 + ( 1.0 - beta2 ) * matrix_gradient->array().pow( 2 ).matrix();
-            *matrix_variable -= alpha_t * ( m_matrix.array() / ( v_matrix.array().sqrt() + epsilon ) ).matrix();
-        }
-
-        if( vector_variable != nullptr )
-        {
-            // Update first moments
-            m_vector = m_vector * beta1 + ( 1.0 - beta1 ) * ( *vector_gradient );
-            // Update second moments
-            v_vector = v_vector * beta2 + ( 1.0 - beta2 ) * vector_gradient->array().pow( 2 ).matrix();
-            *vector_variable -= alpha_t * ( m_vector.array() / ( v_vector.array().sqrt() + epsilon ) ).matrix();
-        }
-
-        timestep++;
-    }
+    // private:
+    //     scalar alpha   = 0.001;
+    //     scalar beta1   = 0.9;
+    //     scalar beta2   = 0.999;
+    //     scalar epsilon = 1e-8;
+
+    //     // first moments
+    //     Matrix<scalar> m_matrix;
+    //     Vector<scalar> m_vector;
+
+    //     // second moments
+    //     Matrix<scalar> v_matrix;
+    //     Vector<scalar> v_vector;
+
+    //     size_t timestep = 0;
+
+    //     void initialize( Matrix<scalar> * matrix_variable, Vector<scalar> * vector_variable )
+    //     {
+    //         if( matrix_variable != nullptr )
+    //         {
+    //             m_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
+    //             v_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
+    //         }
+
+    //         if( vector_variable != nullptr )
+    //         {
+    //             m_vector = Vector<scalar>::Zero( vector_variable->size() );
+    //             v_vector = Vector<scalar>::Zero( vector_variable->size() );
+    //         }
+    //     }
+
+    // public:
+    //     Adam() = default;
+    //     Adam( scalar alpha ) : alpha( alpha ) {}
+    //     Adam( scalar alpha, scalar beta1, scalar beta2, scalar epsilon )
+    //         : alpha( alpha ), beta1( beta1 ), beta2( beta2 ), epsilon( epsilon )
+    //     {
+    //     }
+
+    //     void optimize(
+    //         Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
+    //         Vector<scalar> * vector_gradient ) override
+    //     {
+    //         if( timestep == 0 )
+    //         {
+    //             initialize( matrix_variable, vector_variable );
+    //         }
+
+    //         scalar beta_1_t = std::pow( beta1, timestep + 1 );
+    //         scalar beta_2_t = std::pow( beta2, timestep + 1 );
+    //         scalar alpha_t  = alpha * std::sqrt( 1.0 - beta_2_t ) / ( 1.0 - beta_1_t );
+
+    //         if( matrix_variable != nullptr )
+    //         {
+    //             // Update first moments
+    //             m_matrix = m_matrix * beta1 + ( 1.0 - beta1 ) * ( *matrix_gradient );
+    //             // Update second moments
+    //             v_matrix = v_matrix * beta2 + ( 1.0 - beta2 ) * matrix_gradient->array().pow( 2 ).matrix();
+    //             *matrix_variable -= alpha_t * ( m_matrix.array() / ( v_matrix.array().sqrt() + epsilon ) ).matrix();
+    //         }
+
+    //         if( vector_variable != nullptr )
+    //         {
+    //             // Update first moments
+    //             m_vector = m_vector * beta1 + ( 1.0 - beta1 ) * ( *vector_gradient );
+    //             // Update second moments
+    //             v_vector = v_vector * beta2 + ( 1.0 - beta2 ) * vector_gradient->array().pow( 2 ).matrix();
+    //             *vector_variable -= alpha_t * ( m_vector.array() / ( v_vector.array().sqrt() + epsilon ) ).matrix();
+    //         }
+
+    //         timestep++;
+    //     }
 };
 
 } // namespace Robbie::Optimizers
\ No newline at end of file
diff --git a/test/test_backward_propagations.cpp b/test/test_backward_propagations.cpp
index f21fb3c..7407d1f 100644
--- a/test/test_backward_propagations.cpp
+++ b/test/test_backward_propagations.cpp
@@ -14,10 +14,6 @@ template<typename scalar>
 void test_backward_propagation( Robbie::Layer<scalar> * layer, const Robbie::Matrix<scalar> & x0, size_t output_size )
 {
-    // Use the DoNothing optimizer
-    // Otherwise the weights will change on the backward propagation and we cannot compare to subsequent forward optimizations
-    layer->opt = std::move( std::make_unique<Robbie::Optimizers::DoNothing<scalar>>() );
-
     // The loss function is the sum of outputs E = y1 + ... + yN
     auto loss0 = layer->forward_propagation( x0 ).colwise().sum().eval();
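
Note (illustration only, not part of the patch): after this refactoring a layer no longer owns its optimizer. It exposes its parameters and their gradients as Eigen::Ref handles, the network collects them once in register_optimizer_variables(), and a single optimizer updates everything in place in optimize(). The standalone sketch below mirrors that mechanism with plain Eigen types; the alias Matrix = Eigen::MatrixXd, the struct name SgdSketch and the concrete numbers are illustrative assumptions and are not code from this repository.

#include <Eigen/Dense>
#include <iostream>
#include <vector>

using Matrix = Eigen::MatrixXd; // stand-in for the library's scalar-templated Matrix

// Minimal stand-in for an optimizer after the refactoring: it owns no parameters,
// only non-owning references into the layers' storage.
struct SgdSketch
{
    double learning_rate;
    std::vector<Eigen::Ref<Matrix>> variables; // refs to weights / bias
    std::vector<Eigen::Ref<Matrix>> gradients; // refs to weights_error / bias_error

    void optimize()
    {
        // Same loop shape as the patched StochasticGradientDescent::optimize()
        for( size_t iv = 0; iv < variables.size(); iv++ )
            variables[iv] -= learning_rate * gradients[iv];
    }
};

int main()
{
    // Plays the role of FCLayer::weights and FCLayer::weights_error
    Matrix weights       = Matrix::Ones( 2, 2 );
    Matrix weights_error = Matrix::Constant( 2, 2, 0.5 );

    SgdSketch opt{ 0.1 };
    // What Network::register_optimizer_variables() does for every layer
    opt.variables.push_back( Eigen::Ref<Matrix>( weights ) );
    opt.gradients.push_back( Eigen::Ref<Matrix>( weights_error ) );

    opt.optimize(); // updates `weights` in place through the Ref

    std::cout << weights << "\n"; // every entry is now 1.0 - 0.1 * 0.5 = 0.95
}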