
Commit

Merge pull request #3 from seldon-code/adam_optimizer
Adam optimizer
MSallermann authored Nov 16, 2023
2 parents 7a9ec85 + c9de534 commit 1d50a8a
Showing 4 changed files with 97 additions and 21 deletions.
9 changes: 5 additions & 4 deletions include/activation_functions.hpp
@@ -48,15 +48,16 @@ class Softmax
template<typename Derived>
static auto f( const Eigen::MatrixBase<Derived> & x )
{
// The static_cast is necessary to make different scalar types than double compile
return x.array().exp() / x.array().exp().sum();
// Shift by max(x) to avoid numerical problems with large exponents
auto sum = ( x.array() - x.maxCoeff() ).exp().sum();
return ( x.array() - x.maxCoeff() ).exp() / sum;
}

template<typename Derived>
static auto df( const Eigen::MatrixBase<Derived> & x )
{
auto sum = x.array().exp().sum();
return 1.0 / sum * ( 1.0 - 1.0 / sum ) * x.array().exp();
auto sum = ( x.array() - x.maxCoeff() ).exp().sum();
return 1.0 / sum * ( 1.0 - 1.0 / sum ) * ( x.array() - x.maxCoeff() ).exp();
}
};
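Note on the change above: softmax is invariant under subtracting the same constant from every component, because the factor exp(-max(x)) cancels between numerator and denominator. Shifting by max(x) keeps every exponent at or below zero, so exp() cannot overflow for large inputs. A minimal standalone sketch of the difference (assumes Eigen is available; this snippet is not part of the repository):

#include <Eigen/Dense>
#include <iostream>

int main()
{
    Eigen::VectorXd x( 3 );
    x << 1000.0, 1001.0, 1002.0; // large logits

    // Naive softmax: exp(1000) overflows to inf, so every entry becomes NaN.
    Eigen::ArrayXd naive = x.array().exp() / x.array().exp().sum();

    // Shifted softmax: all exponents are <= 0, so every intermediate stays finite.
    Eigen::ArrayXd shifted
        = ( x.array() - x.maxCoeff() ).exp() / ( x.array() - x.maxCoeff() ).exp().sum();

    std::cout << naive.transpose() << "\n";   // nan nan nan
    std::cout << shifted.transpose() << "\n"; // ~0.090 0.245 0.665
}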

7 changes: 5 additions & 2 deletions include/loss_functions.hpp
@@ -43,13 +43,16 @@ class CrossEntropy
template<typename Derived, typename Derived2>
static auto f( const Eigen::MatrixBase<Derived> & y_true, const Eigen::MatrixBase<Derived2> & y_pred )
{
return -( y_true.array() * y_pred.array().log() ).colwise().sum();
constexpr typename Derived::Scalar epsilon = 1e-12;
return -( y_true.array() * ( y_pred.array() + epsilon ).log2() ).colwise().sum();
}

template<typename Derived, typename Derived2>
static auto df( const Eigen::MatrixBase<Derived> & y_true, const Eigen::MatrixBase<Derived2> & y_pred )
{
return -( y_true.array() * y_pred.array().pow( -1 ) ).matrix();
constexpr typename Derived::Scalar log2e = 1.44269504089;
constexpr typename Derived::Scalar epsilon = 1e-12;
return -( y_true.array() * ( y_pred.array() + epsilon ).pow( -1 ) ).matrix() * log2e;
}
};
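Note on the change above: adding a small epsilon before taking the logarithm keeps a predicted probability of exactly zero from producing log(0) = -inf (and 0 * -inf = NaN) in the loss. Because f now uses log base 2, the derivative picks up the constant factor log2(e) ≈ 1.44269504089, since d/dy log2(y) = log2(e) / y; that is the log2e constant in df. A small sketch of the NaN problem the clamp avoids (assumes an Eigen version providing Array::log2(), as the code above does; not part of the repository):

#include <Eigen/Dense>
#include <iostream>

int main()
{
    Eigen::VectorXd y_true( 3 ), y_pred( 3 );
    y_true << 0.0, 1.0, 0.0;
    y_pred << 0.0, 0.8, 0.2; // an exact zero probability in a non-target class

    constexpr double epsilon = 1e-12;

    // Without the clamp: log2(0) = -inf and 0 * -inf = NaN poisons the sum.
    double naive = -( y_true.array() * y_pred.array().log2() ).sum();

    // With the clamp: the zero entry contributes 0 * log2(1e-12) = 0 as intended.
    double clamped = -( y_true.array() * ( y_pred.array() + epsilon ).log2() ).sum();

    std::cout << naive << " vs " << clamped << "\n"; // nan vs ~0.3219
}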

3 changes: 1 addition & 2 deletions include/network.hpp
@@ -69,7 +69,6 @@ class Network
scalar learning_rate, bool print_progress = false )
{
auto n_samples = x_train.size();
auto input_size = x_train[0].rows();
auto batch_size = x_train[0].cols();

for( auto & l : layers )
@@ -78,7 +77,7 @@
}

fmt::print(
"Fitting with n_samples = {}, input_size = {}, batch_size = {}\n\n", n_samples, input_size, batch_size );
"Fitting with {} samples of batchsize {} ({} total)\n\n", n_samples, batch_size, n_samples * batch_size );

auto t_fit_start = std::chrono::high_resolution_clock::now();

99 changes: 86 additions & 13 deletions include/optimizers.hpp
@@ -1,5 +1,6 @@
#pragma once
#include "defines.hpp"
#include <fmt/ostream.h>
#include <optional>

namespace Robbie::Optimizers
@@ -11,33 +12,27 @@ class Optimizer
public:
Optimizer() = default;
virtual ~Optimizer() = default;
using opt_matrix_t = Matrix<scalar> *;
using opt_vector_t = Vector<scalar> *;

virtual void optimize(
opt_matrix_t matrix_variable, opt_matrix_t matrix_gradient, opt_vector_t vector_variable,
opt_vector_t vector_gradient )
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient )
= 0;
};

template<typename scalar>
class DoNothing : public Optimizer<scalar>
{
public:
DoNothing() = default;
using opt_matrix_t = typename Optimizer<scalar>::opt_matrix_t;
using opt_vector_t = typename Optimizer<scalar>::opt_vector_t;
DoNothing() = default;

virtual void optimize(
[[maybe_unused]] opt_matrix_t matrix_variable, [[maybe_unused]] opt_matrix_t matrix_gradient,
[[maybe_unused]] opt_vector_t vector_variable, [[maybe_unused]] opt_vector_t vector_gradient ){};
[[maybe_unused]] Matrix<scalar> * matrix_variable, [[maybe_unused]] Matrix<scalar> * matrix_gradient,
[[maybe_unused]] Vector<scalar> * vector_variable, [[maybe_unused]] Vector<scalar> * vector_gradient ){};
};

template<typename scalar>
class StochasticGradientDescent : public Optimizer<scalar>
{
using opt_matrix_t = typename Optimizer<scalar>::opt_matrix_t;
using opt_vector_t = typename Optimizer<scalar>::opt_vector_t;

private:
scalar learning_rate = 0.0;
@@ -46,8 +41,8 @@ class StochasticGradientDescent : public Optimizer<scalar>
StochasticGradientDescent( scalar learning_rate ) : learning_rate( learning_rate ) {}

void optimize(
opt_matrix_t matrix_variable, opt_matrix_t matrix_gradient, opt_vector_t vector_variable,
opt_vector_t vector_gradient ) override
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient ) override
{
if( matrix_variable != nullptr )
{
@@ -61,4 +56,82 @@
}
};
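For contrast with the Adam optimizer added below: plain stochastic gradient descent scales the raw gradient by a fixed learning rate. A hedged sketch of that conventional rule in the interface used here (the body is collapsed in this view and may differ in detail):

// Conventional SGD step: theta <- theta - learning_rate * gradient
if( matrix_variable != nullptr )
    *matrix_variable -= learning_rate * ( *matrix_gradient );
if( vector_variable != nullptr )
    *vector_variable -= learning_rate * ( *vector_gradient );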

template<typename scalar>
class Adam : public Optimizer<scalar>
{

private:
scalar alpha = 0.001;
scalar beta1 = 0.9;
scalar beta2 = 0.999;
scalar epsilon = 1e-8;

// first moments
Matrix<scalar> m_matrix;
Vector<scalar> m_vector;

// second moments
Matrix<scalar> v_matrix;
Vector<scalar> v_vector;

size_t timestep = 0;

void initialize( Matrix<scalar> * matrix_variable, Vector<scalar> * vector_variable )
{
if( matrix_variable != nullptr )
{
m_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
v_matrix = Matrix<scalar>::Zero( matrix_variable->rows(), matrix_variable->cols() );
}

if( vector_variable != nullptr )
{
m_vector = Vector<scalar>::Zero( vector_variable->size() );
v_vector = Vector<scalar>::Zero( vector_variable->size() );
}
}

public:
Adam() = default;
Adam( scalar alpha ) : alpha( alpha ) {}
Adam( scalar alpha, scalar beta1, scalar beta2, scalar epsilon )
: alpha( alpha ), beta1( beta1 ), beta2( beta2 ), epsilon( epsilon )
{
}

void optimize(
Matrix<scalar> * matrix_variable, Matrix<scalar> * matrix_gradient, Vector<scalar> * vector_variable,
Vector<scalar> * vector_gradient ) override
{
if( timestep == 0 )
{
initialize( matrix_variable, vector_variable );
}

scalar beta_1_t = std::pow( beta1, timestep + 1 );
scalar beta_2_t = std::pow( beta2, timestep + 1 );
scalar alpha_t = alpha * std::sqrt( 1.0 - beta_2_t ) / ( 1.0 - beta_1_t );

if( matrix_variable != nullptr )
{
// Update first moments
m_matrix = m_matrix * beta1 + ( 1.0 - beta1 ) * ( *matrix_gradient );
// Update second moments
v_matrix = v_matrix * beta2 + ( 1.0 - beta2 ) * matrix_gradient->array().pow( 2 ).matrix();
*matrix_variable -= alpha_t * ( m_matrix.array() / ( v_matrix.array().sqrt() + epsilon ) ).matrix();
}

if( vector_variable != nullptr )
{
// Update first moments
m_vector = m_vector * beta1 + ( 1.0 - beta1 ) * ( *vector_gradient );
// Update second moments
v_vector = v_vector * beta2 + ( 1.0 - beta2 ) * vector_gradient->array().pow( 2 ).matrix();
*vector_variable -= alpha_t * ( m_vector.array() / ( v_vector.array().sqrt() + epsilon ) ).matrix();
}

timestep++;
}
};

} // namespace Robbie::Optimizers
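The new Adam optimizer keeps per-parameter first and second moment estimates (m and v), zero-initialized lazily on the first optimize() call, and folds both bias corrections into a single step size alpha_t = alpha * sqrt(1 - beta2^t) / (1 - beta1^t), so each element is updated as theta <- theta - alpha_t * m / (sqrt(v) + epsilon). A hypothetical usage sketch follows; it assumes Robbie::Matrix and Robbie::Vector are Eigen-style dynamic types from defines.hpp, and the parameter and gradient values are made up for illustration (in practice they come from a layer and its back-propagated gradients):

#include "optimizers.hpp"

int main()
{
    using scalar = double;

    // Hypothetical layer parameters and gradients (illustrative values only).
    Robbie::Matrix<scalar> weights = Robbie::Matrix<scalar>::Random( 4, 3 );
    Robbie::Vector<scalar> bias    = Robbie::Vector<scalar>::Zero( 4 );
    Robbie::Matrix<scalar> grad_w  = Robbie::Matrix<scalar>::Random( 4, 3 );
    Robbie::Vector<scalar> grad_b  = Robbie::Vector<scalar>::Random( 4 );

    // Defaults are alpha = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8;
    // the one-argument constructor overrides only alpha.
    Robbie::Optimizers::Adam<scalar> adam( 0.01 );

    // Each call advances the internal timestep and updates both variables in place.
    for( int step = 0; step < 3; step++ )
        adam.optimize( &weights, &grad_w, &bias, &grad_b );
}

Since the moment buffers are sized from whatever is passed on the first call, each parameter set (for example, each layer's weights and bias) needs its own Adam instance.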
