
Commit

Merge pull request #4 from seldon-code/refactor
Refactor
MSallermann authored Nov 18, 2023
2 parents 1d50a8a + 19ee4de commit 37a4e51
Showing 9 changed files with 145 additions and 98 deletions.
5 changes: 4 additions & 1 deletion examples/mnist/main.cpp
@@ -67,7 +67,10 @@ int main()
// No. of trainable params
network.summary();
network.loss_tol = 2e-2;
network.fit( x_train, y_train, 1000, 0.01, true );

auto opt = Robbie::Optimizers::StochasticGradientDescent( 0.01 );
network.set_optimizer( &opt );
network.fit( x_train, y_train, 1000, true );

fmt::print( "Loss on test set = {:.3e}\n", network.loss( x_test, y_test ) );

37 changes: 29 additions & 8 deletions include/fc_layer.hpp
@@ -13,13 +13,18 @@ class FCLayer : public Layer<scalar>
{
protected:
Matrix<scalar> weights;
Vector<scalar> bias;
Matrix<scalar> weights_error;

Matrix<scalar> bias;
Matrix<scalar> bias_error;

public:
FCLayer( size_t input_size, size_t output_size )
: Layer<scalar>( input_size, output_size ),
weights( Matrix<scalar>( output_size, input_size ) ),
bias( Vector<scalar>( output_size ) )
weights_error( Matrix<scalar>( output_size, input_size ) ),
bias( Vector<scalar>( output_size ) ),
bias_error( Vector<scalar>( output_size ) )
{
auto rd = std::random_device();
auto gen = std::mt19937( rd() );
@@ -39,18 +44,20 @@
// returns output for a given input
Matrix<scalar> forward_propagation( const Matrix<scalar> & input_data ) override
{
this->input = input_data;
this->output = ( weights * input_data ).colwise() + bias;
this->input = input_data;
this->output
= ( weights * input_data ).colwise()
+ bias.col(
0 ); // We use .col(0), so the bias can be treated as a matrix with fixed columns at compile time
return this->output;
}

// computes dE/dW, dE/dB for a given output_error=dE/dY. Returns input_error=dE/dX.
Matrix<scalar> backward_propagation( const Matrix<scalar> & output_error ) override
{
auto input_error = weights.transpose() * output_error;
Matrix<scalar> weights_error = output_error * this->input.transpose() / output_error.cols();
Vector<scalar> bias_error = ( output_error ).rowwise().mean();
this->opt->optimize( &weights, &weights_error, &bias, &bias_error );
auto input_error = weights.transpose() * output_error;
weights_error = output_error * this->input.transpose() / output_error.cols();
bias_error = ( output_error ).rowwise().mean();
return input_error;
}

@@ -60,6 +67,20 @@ class FCLayer : public Layer<scalar>
return this->weights.size() + this->bias.size();
}

// Get ref to trainable parameters
std::vector<Eigen::Ref<Matrix<scalar>>> variables() override
{
return std::vector<Eigen::Ref<Matrix<scalar>>>{ Eigen::Ref<Matrix<scalar>>( weights ),
Eigen::Ref<Matrix<scalar>>( bias ) };
};

// Get ref to gradients of the trainable parameters
std::vector<Eigen::Ref<Matrix<scalar>>> gradients() override
{
return std::vector<Eigen::Ref<Matrix<scalar>>>{ Eigen::Ref<Matrix<scalar>>( weights_error ),
Eigen::Ref<Matrix<scalar>>( bias_error ) };
};

// Access the current weights
Matrix<scalar> get_weights()
{
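
The variables() and gradients() accessors return Eigen::Ref views rather than copies, which is what lets an optimizer write updates straight into the layer's storage. A minimal standalone sketch of that behaviour (not part of this commit; plain Eigen::MatrixXd stands in for Robbie::Matrix<scalar>):

#include <Eigen/Dense>
#include <iostream>

int main()
{
    Eigen::MatrixXd weights = Eigen::MatrixXd::Ones( 2, 2 );
    Eigen::Ref<Eigen::MatrixXd> view( weights ); // shares storage with 'weights'

    view -= 0.5 * Eigen::MatrixXd::Ones( 2, 2 ); // the kind of in-place step SGD performs
    std::cout << weights << "\n";                // prints 0.5 everywhere: 'weights' itself changed
}
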
32 changes: 18 additions & 14 deletions include/layer.hpp
@@ -21,21 +21,10 @@ class Layer
public:
Layer() = default;
Layer( std::optional<size_t> input_size, std::optional<size_t> output_size )
: input_size( input_size ),
output_size( output_size ),
opt( std::make_unique<Optimizers::StochasticGradientDescent<scalar>>( 0.1 ) )
: input_size( input_size ), output_size( output_size )
{
}

std::unique_ptr<Optimizers::Optimizer<scalar>> opt;

// TODO: figure out how to implement copy constructor
// Layer( const Layer & l )
// : input( l.input ), output( l.output ), input_size( l.input_size ), output_size( l.output_size )
// {
// opt = std::make_unique<Optimizer<scalar>>( l.opt );
// }

virtual std::string name() = 0;

std::optional<size_t> get_input_size()
@@ -60,8 +49,23 @@
// computes dE/dX for a given dE/dY (and update parameters if any)
virtual Matrix<scalar> backward_propagation( const Matrix<scalar> & output_error ) = 0;

// Get trainable parameters
virtual size_t get_trainable_params() = 0;
// Get number of trainable parameters
virtual size_t get_trainable_params()
{
return 0;
};

// Get ref to trainable parameters
virtual std::vector<Eigen::Ref<Matrix<scalar>>> variables()
{
return {}; // Standard behaviour is to return an empty vector, i.e. no trainable params
};

// Get ref to gradients of parameters
virtual std::vector<Eigen::Ref<Matrix<scalar>>> gradients()
{
return {};
};

virtual ~Layer() = default;
};
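
With get_trainable_params(), variables() and gradients() no longer pure virtual, a layer without trainable parameters only has to implement name() and the two propagation methods. A hypothetical parameter-free layer illustrating this (not part of this commit; it assumes Layer lives in the Robbie namespace and exposes the protected input/output members that FCLayer uses):

#include "layer.hpp"

#include <optional>
#include <string>

template<typename scalar>
class IdentityLayer : public Robbie::Layer<scalar>
{
public:
    IdentityLayer() : Robbie::Layer<scalar>( std::nullopt, std::nullopt ) {}

    std::string name() override
    {
        return "IdentityLayer";
    }

    // y = x
    Robbie::Matrix<scalar> forward_propagation( const Robbie::Matrix<scalar> & input_data ) override
    {
        this->input  = input_data;
        this->output = input_data;
        return this->output;
    }

    // dE/dX = dE/dY for the identity map
    Robbie::Matrix<scalar> backward_propagation( const Robbie::Matrix<scalar> & output_error ) override
    {
        return output_error;
    }

    // get_trainable_params(), variables() and gradients() fall back to the new
    // defaults: zero parameters and empty vectors, so the optimizer skips this layer.
};
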
39 changes: 33 additions & 6 deletions include/network.hpp
@@ -9,6 +9,7 @@
#include <cstddef>
#include <memory>
#include <optional>
#include <stdexcept>
#include <type_traits>
#include <vector>

@@ -64,18 +65,40 @@ class Network
return loss;
}

void set_optimizer( Optimizers::Optimizer<scalar> * opt )
{
this->opt = opt;
}

void register_optimizer_variables()
{
this->opt->clear();

for( auto & layer : layers )
{
for( size_t iv = 0; iv < layer->variables().size(); iv++ )
{
auto v = layer->variables()[iv];
auto g = layer->gradients()[iv];

this->opt->register_variable( v, g );
}
}
}

void
fit( const std::vector<Matrix<scalar>> & x_train, const std::vector<Matrix<scalar>> & y_train, size_t epochs,
scalar learning_rate, bool print_progress = false )
bool print_progress = false )
{

if( this->opt == nullptr )
throw std::runtime_error( "Optimizer has not been set!" );

register_optimizer_variables();

auto n_samples = x_train.size();
auto batch_size = x_train[0].cols();

for( auto & l : layers )
{
l->opt = std::move( std::make_unique<Optimizers::StochasticGradientDescent<scalar>>( learning_rate ) );
}

fmt::print(
"Fitting with {} samples of batchsize {} ({} total)\n\n", n_samples, batch_size, n_samples * batch_size );

@@ -86,6 +109,7 @@
{
auto t_epoch_start = std::chrono::high_resolution_clock::now();
err = 0;

for( size_t j = 0; j < n_samples; j++ )
{
// forward propagation
@@ -104,6 +128,8 @@
auto & layer = layers[i_layer];
error = layer->backward_propagation( error );
}

opt->optimize();
}

auto t_epoch_end = std::chrono::high_resolution_clock::now();
@@ -167,6 +193,7 @@ class Network

private:
std::vector<std::unique_ptr<Layer<scalar>>> layers;
Optimizers::Optimizer<scalar> * opt = nullptr;
};

} // namespace Robbie
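
Since Network now holds only a raw Optimizer pointer, the optimizer is constructed by the caller and must outlive fit(); register_optimizer_variables() then wires every layer's parameter/gradient pairs into it. A sketch of that wiring done by hand for a single layer (not part of this commit; it assumes FCLayer and the optimizers live in the Robbie namespace, as the other files in this diff suggest):

#include "fc_layer.hpp"
#include "optimizers.hpp"

#include <cstddef>

int main()
{
    Robbie::FCLayer<double> layer( 4, 3 );
    auto opt = Robbie::Optimizers::StochasticGradientDescent<double>( 0.01 );

    // What Network::register_optimizer_variables() does for each layer:
    // the refs returned by variables()/gradients() point into the layer's own storage.
    for( std::size_t iv = 0; iv < layer.variables().size(); iv++ )
    {
        opt.register_variable( layer.variables()[iv], layer.gradients()[iv] );
    }

    // After a backward pass has filled the gradients, a single call
    // updates the layer's weights and bias in place:
    // opt.optimize();
}
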
111 changes: 50 additions & 61 deletions include/optimizers.hpp
@@ -1,33 +1,50 @@
#pragma once
#include "defines.hpp"
#include <fmt/ostream.h>
#include <cstddef>
#include <optional>
#include <stdexcept>
#include <vector>

namespace Robbie::Optimizers
{

template<typename scalar>
class Optimizer
{
protected:
std::vector<Eigen::Ref<Robbie::Matrix<scalar>>> variables;
std::vector<Eigen::Ref<Robbie::Matrix<scalar>>> gradients;

public:
Optimizer() = default;
virtual ~Optimizer() = default;
Optimizer() = default;

virtual void clear()
{
variables.clear();
gradients.clear();
}

virtual void register_variable( Eigen::Ref<Robbie::Matrix<scalar>> var, Eigen::Ref<Robbie::Matrix<scalar>> grad )
{
if( !( ( var.rows() == grad.rows() ) && ( var.cols() == grad.cols() ) ) )
{
throw std::runtime_error( "Tried to use variable and gradient of different shapes!" );
}
variables.push_back( var );
gradients.push_back( grad );
};

virtual void optimize(
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient )
= 0;
virtual ~Optimizer() = default;
virtual void optimize() = 0;
};

template<typename scalar>
class DoNothing : public Optimizer<scalar>
{
public:
DoNothing() = default;

virtual void optimize(
[[maybe_unused]] Matrix<scalar> * matrix_variable, [[maybe_unused]] Matrix<scalar> * matrix_gradient,
[[maybe_unused]] Vector<scalar> * vector_variable, [[maybe_unused]] Vector<scalar> * vector_gradient ){};
virtual void optimize(){};
};

template<typename scalar>
@@ -40,18 +57,11 @@ class StochasticGradientDescent : public Optimizer<scalar>
public:
StochasticGradientDescent( scalar learning_rate ) : learning_rate( learning_rate ) {}

void optimize(
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient ) override
void optimize() override
{
if( matrix_variable != nullptr )
{
*( matrix_variable ) -= learning_rate * ( *matrix_gradient );
}

if( vector_variable != nullptr )
for( size_t iv = 0; iv < this->variables.size(); iv++ )
{
*( vector_variable ) -= learning_rate * ( *vector_gradient );
this->variables[iv] -= learning_rate * this->gradients[iv];
}
}
};
@@ -67,70 +77,49 @@ class Adam : public Optimizer<scalar>
scalar epsilon = 1e-8;

// first moments
Matrix<scalar> m_matrix;
Vector<scalar> m_vector;

std::vector<Matrix<scalar>> m_matrix;
// second moments
Matrix<scalar> v_matrix;
Vector<scalar> v_vector;

std::vector<Matrix<scalar>> v_matrix;
size_t timestep = 0;

void initialize( Matrix<scalar> * matrix_variable, Vector<scalar> * vector_variable )
public:
void register_variable( Eigen::Ref<Robbie::Matrix<scalar>> var, Eigen::Ref<Robbie::Matrix<scalar>> grad ) override
{
if( matrix_variable != nullptr )
{
m_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
v_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
}
Optimizer<scalar>::register_variable( var, grad );
m_matrix.push_back( Matrix<scalar>::Zero( var.rows(), var.cols() ) );
v_matrix.push_back( Matrix<scalar>::Zero( var.rows(), var.cols() ) );
}

if( vector_variable != nullptr )
{
m_vector = Vector<scalar>::Zero( vector_variable->size() );
v_vector = Vector<scalar>::Zero( vector_variable->size() );
}
void clear() override
{
Optimizer<scalar>::clear();
m_matrix.clear();
v_matrix.clear();
timestep = 0;
}

public:
Adam() = default;
Adam( scalar alpha ) : alpha( alpha ) {}
Adam( scalar alpha, scalar beta1, scalar beta2, scalar epsilon )
: alpha( alpha ), beta1( beta1 ), beta2( beta2 ), epsilon( epsilon )
{
}

void optimize(
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient ) override
void optimize() override
{
if( timestep == 0 )
{
initialize( matrix_variable, vector_variable );
}

scalar beta_1_t = std::pow( beta1, timestep + 1 );
scalar beta_2_t = std::pow( beta2, timestep + 1 );
scalar alpha_t = alpha * std::sqrt( 1.0 - beta_2_t ) / ( 1.0 - beta_1_t );

if( matrix_variable != nullptr )
for( size_t iv = 0; iv < this->variables.size(); iv++ )
{
// Update first moments
m_matrix = m_matrix * beta1 + ( 1.0 - beta1 ) * ( *matrix_gradient );
m_matrix[iv] = m_matrix[iv] * beta1 + ( 1.0 - beta1 ) * ( this->gradients[iv] );
// Update second moments
v_matrix = v_matrix * beta2 + ( 1.0 - beta2 ) * matrix_gradient->array().pow( 2 ).matrix();
*matrix_variable -= alpha_t * ( m_matrix.array() / ( v_matrix.array().sqrt() + epsilon ) ).matrix();
v_matrix[iv] = v_matrix[iv] * beta2 + ( 1.0 - beta2 ) * this->gradients[iv].array().pow( 2 ).matrix();
this->variables[iv]
-= alpha_t * ( m_matrix[iv].array() / ( v_matrix[iv].array().sqrt() + epsilon ) ).matrix();
}

if( vector_variable != nullptr )
{
// Update first moments
m_vector = m_vector * beta1 + ( 1.0 - beta1 ) * ( *vector_gradient );
// Update second moments
v_vector = v_vector * beta2 + ( 1.0 - beta2 ) * vector_gradient->array().pow( 2 ).matrix();
*vector_variable -= alpha_t * ( m_vector.array() / ( v_vector.array().sqrt() + epsilon ) ).matrix();
}

timestep++;
}
};

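
After the refactor, an optimizer knows nothing about layers: it only holds the Eigen::Ref views registered with it and updates them in place on optimize(). A small sketch driving the refactored StochasticGradientDescent directly (not part of this commit; it assumes Robbie::Matrix<scalar> is a dynamic Eigen matrix, as defined in defines.hpp):

#include "optimizers.hpp"

#include <iostream>

int main()
{
    Robbie::Matrix<double> w = Robbie::Matrix<double>::Constant( 2, 2, 1.0 );
    Robbie::Matrix<double> g = Robbie::Matrix<double>::Constant( 2, 2, 0.5 );

    auto opt = Robbie::Optimizers::StochasticGradientDescent<double>( 0.1 );
    opt.register_variable( w, g ); // keeps Eigen::Ref views; throws if the shapes differ
    opt.optimize();                // w -= 0.1 * g, applied through the stored view

    std::cout << w << "\n";        // every entry is now 0.95
}

Adam follows the same registration pattern but additionally keeps one first-moment and one second-moment matrix per registered variable, both of which are reset together with the timestep in clear().
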
3 changes: 2 additions & 1 deletion include/robbie.hpp
@@ -6,4 +6,5 @@
#include "fc_layer.hpp"
#include "layer.hpp"
#include "loss_functions.hpp"
#include "network.hpp"
#include "network.hpp"
#include "optimizers.hpp"

