Saturday, March 3, 2018

AI II -- gradient descent

Today I present a C++ code sample demonstrating gradient descent and back propagation.

Here I assume you have already studied (or are studying) the Stanford course CS231n, as recommended in the previous blog post, and that you want to try your hand at coding a neural network implementation. You can use my code for reference.

Be sure you understand what back propagation is before reading the following code.
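
As a quick refresher, this is the standard chain-rule bookkeeping the code follows (nothing specific to this implementation). Writing E for the error, t for the target, a = f(z) for a neuron's output, z = Σ_i w_i·x_i + b for its weighted input, and δ for the quantity the code stores in pdrvErrorWrtInput_:

    E = 0.5 * (t - a)^2
    ∂E/∂w_i = (a - t) · f'(z) · x_i = δ · x_i
    ∂E/∂b   = δ
    δ_j = ( Σ_k w_jk · δ_k ) · f'(z_j)    for a hidden neuron j, summing over the next layer's neurons k

Each sample's ∂E/∂w_i terms are accumulated into pdrvErrorWrtWeights_ during the backward pass.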


/* 
 * This example uses gradient descent to find a solution to the XOR problem
 */

#include <vector>
#include <deque>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cassert>
#include <ctime>
#include "MesenneTwister.h"

using namespace std;

#define INIT_WEIGHT_SCALE   1.0
#define LEAKY_RELU_CONSTANT 0.005
#define MAX_FITNESS         1000000.0
const double network_error_epsilon = 1e-3;
const double epsilon = 1e-8;

MTRand g_rng(time(NULL));
int g_generation = 0;
    

double error_func (double expect, double actual)
{
    return 0.5 * (expect - actual) * (expect - actual);
}

double error_drv_func(double target, double actual)
{
    // By the chain rule, the derivative of the loss 0.5 * (actual - target)^2 with respect
    // to 'actual' is (actual - target); equivalently, d/d(actual) of 0.5 * (target - actual)^2
    // equals (target - actual) * -1 = actual - target.
    return actual - target;
}

//--------------------------------

// Primary templates: placeholders only, specialized below for each
// supported activation function type.
template <int ACTIVATE_FUNC_TYPE>
double activate (double x)
{
    return 0;
}

// Note: the derivative functions take the neuron's OUTPUT y = activate(x),
// not the pre-activation x; that is why value_ is passed to them later on.
template <int ACTIVATE_FUNC_TYPE>
double activate_drv (double y)
{
    return 0;
}

enum {
    SIGMOID,
    TANH,
    RELU
};

template<>
double activate<SIGMOID>(double x)
{
    return 1.0 / (1.0 + exp(-x));
}

template<>
double activate_drv<SIGMOID> (double y)
{
    return y * (1.0 - y);
}

template<>
double activate<TANH>(double x)
{
    return tanh (x);
}

template<>
double activate_drv<TANH> (double y)
{
    return 1.0 - y * y;
}

template<>
double activate<RELU>(double x)
{
    return (x > 0) ? x : LEAKY_RELU_CONSTANT * x;
}

template<>
double activate_drv<RELU>(double y)
{
    return (y > 0) ? 1.0 : LEAKY_RELU_CONSTANT;
}

//--------- try using different activation functions
double activate_func (double x)
{
    return activate<RELU>(x);
//    return activate<TANH>(x);
//    return activate<SIGMOID>(x);
}

double activate_drv_func (double y)
{
    return activate_drv<RELU>(y);
//    return activate_drv<TANH>(y);
//    return activate_drv<SIGMOID>(y);
}

//-----------
class Layer;

class Neuron
{
public:
    Neuron (int numInputs=0);

    void init_weights (double mean, double variance);    
    double calculate_output_from (vector<double> const&inputs);
    void calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double target);
    void calculate_pdrv_of_error_wrt_input_weights(int indexToOutputLayer, vector<double> const&inputs, const Layer &output_layer);
    const vector<double> & weights() const;
    size_t set_input_weights(const double *weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    
public:
    vector<double> inputWeights_;
    double         value_;
    
    vector<double> deltaWeight_;
    vector<double> momentum_;
    double         pdrvErrorWrtInput_;
    vector<double> pdrvErrorWrtWeights_;
};

class Layer
{
public:
    Layer(int numNeurons, int numInputs);
    ~Layer();

    void init_weights (double mean, double variance);
    void forward_pass(const Layer &last_layer);
    void calculate_output_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, vector<double> const&targets);
    void calculate_middle_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, const Layer &output_layer);
    void save_weights_to(vector<double> &weights) const;
    size_t set_input_weights(const double *weights);
    const vector<double> & get_outputs() const;
    void set_outputs(const vector<double> &value);
    size_t num_of_neurons () const;
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    
public:
    vector<Neuron> nodes_;
    vector<double> outputs_;
    
};


Neuron::Neuron(int numInputs)
{
    inputWeights_.resize(numInputs+1); // plus one weight for the bias
}

void Neuron::init_weights(double mean, double variance)
{
    // scale the random initial weights down by roughly 1/sqrt(fan-in)
    double sqrtn = sqrt(2.0 * inputWeights_.size());
    for (size_t i=0; i < inputWeights_.size()-1; ++i) {
        inputWeights_[i] = INIT_WEIGHT_SCALE * g_rng.randNorm(mean, variance) / sqrtn;
    }
    inputWeights_.back() = 0; // the bias weight starts at zero
}

double Neuron::calculate_output_from (const vector< double >& inputs)
{
    assert (inputs.size()+1 == inputWeights_.size() && "number of inputs wrong");
    double v=0;
    for (size_t i=0; i < inputs.size(); ++i) {
        v += inputs[i] * inputWeights_[i];
    }
    v += inputWeights_.back(); // bias
    value_ =  activate_func(v);
    return value_;
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double target)
{
    pdrvErrorWrtInput_ = error_drv_func (target, value_) * activate_drv_func(value_);
    size_t i=0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(int indexToOutputLayer, const vector< double >& inputs, const Layer& output_layer)
{
    double d = 0;
    for (size_t i=0; i < output_layer.num_of_neurons(); ++i) {
        d += output_layer.nodes_[i].inputWeights_[indexToOutputLayer] * output_layer.nodes_[i].pdrvErrorWrtInput_;
    }
    
    pdrvErrorWrtInput_ = d * activate_drv_func(value_);
    size_t i=0;
    for (; i< pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}


const vector< double >& Neuron::weights() const
{
    return inputWeights_;
}

size_t Neuron::set_input_weights(const double* weights)
{
    inputWeights_.assign(weights, weights + inputWeights_.size());
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
    deltaWeight_.resize(inputWeights_.size());
    momentum_.resize(inputWeights_.size());
    return inputWeights_.size();
}

void Neuron::clear_pdrvs()
{
    pdrvErrorWrtWeights_.clear();
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
}

void Neuron::update_weights(double learning_rate, double momentum)
{
    for (size_t i=0; i < inputWeights_.size(); ++i) {
        double dx = pdrvErrorWrtWeights_[i];
        /* Nesterov momentum
        double deltaWeightKeep = deltaWeight_[i];
        deltaWeight_[i] = momentum * deltaWeight_[i] - learning_rate * dx;
        inputWeights_[i] += -momentum * deltaWeightKeep + (1 + momentum) * deltaWeight_[i];
        //*/
        //* RMSprop
        double decay_rate = 0.99;
        deltaWeight_[i] = decay_rate * deltaWeight_[i] + (1.0 - decay_rate) * dx * dx;
        inputWeights_[i] += -learning_rate * dx / (sqrt(deltaWeight_[i]) + epsilon);
        //*/
    }
}



Layer::Layer(int numNeurons=0, int numInputsPerNeuron=0)
{
    nodes_.resize(numNeurons);
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i] = Neuron(numInputsPerNeuron);
    }
    outputs_.resize(numNeurons);
}

Layer::~Layer()
{
    nodes_.clear();
}

void Layer::init_weights(double mean, double variance)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].init_weights (mean, variance);
    }
}

void Layer::forward_pass(const Layer &last_layer)
{
    const vector<double> & inputs = last_layer.get_outputs();
    for (size_t i=0; i < nodes_.size(); ++i) {
        outputs_[i] = nodes_[i].calculate_output_from (inputs);
    }
}

void Layer::calculate_output_layer_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, const vector< double >& targets)
{
    assert (targets.size() == nodes_.size());
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (inputs, targets[i]);
    }
}

void Layer::calculate_middle_layer_pdrv_of_error_wrt_input_weights(const vector< double >& inputs, const Layer& output_layer)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (i, inputs, output_layer);
    }
}


void Layer::save_weights_to(vector< double >& weights) const
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        const vector<double> &w = nodes_[i].weights();
        weights.insert(weights.end(), w.begin(), w.end());
    }

}

size_t Layer::set_input_weights(const double* weights)
{
    size_t pos = 0;
    for (size_t i=0; i < nodes_.size(); ++i) {
        pos += nodes_[i].set_input_weights(weights + pos);
    }
    return pos;
}

void Layer::set_outputs(const vector< double >& value)
{
    outputs_ = value;
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].value_ = outputs_[i];
    }
}

const vector< double >& Layer::get_outputs() const
{
    return outputs_;
}


size_t Layer::num_of_neurons() const
{
    return nodes_.size();
}

void Layer::clear_pdrvs()
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].clear_pdrvs();
    }
}

void Layer::update_weights(double learning_rate, double momentum)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].update_weights(learning_rate, momentum);
    }
}



class NeuralNetwork
{
public:
    NeuralNetwork(int numInputs, vector<int> const& hiddenLayerTopology, int numOutpus);
    ~NeuralNetwork();
    void init_weights(double mean_value, double variance_value);
    size_t number_of_weights () const;
    void run (vector<double> const &inputs, vector<double> &outputs, vector<double> const &expects);
    void forward_pass (vector<double> const &inputs);
    void backward_pass (vector<double> &outputs, vector<double> const &expects);
    void save_weights_to(vector<double> &weights) const;
    void set_weights(vector<double> const &weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    double error () const { return last_error_; }
    
protected:
    void feed_forward ();
    
private:
    vector<Layer> layers_;
    double last_error_;
    NeuralNetwork(const NeuralNetwork &rhs) {}
    NeuralNetwork &operator=(const NeuralNetwork &rhs) { return *this; }
};

NeuralNetwork::NeuralNetwork(int nInputs, const vector< int >& hiddenLayerTopology, int nOutputs)
{
    layers_.resize(hiddenLayerTopology.size() + 2); // hidden + inputs + outputs
    int nNeuron = nInputs;
    layers_[0] = Layer(nNeuron, 0); // layer zero is inputs
    for (size_t i=0; i < hiddenLayerTopology.size(); ++i) {
        layers_[i+1] = Layer(hiddenLayerTopology[i], layers_[i].num_of_neurons());
    }
    nNeuron = nOutputs;
    layers_.back() = Layer(nNeuron, hiddenLayerTopology.back());
}

NeuralNetwork::~NeuralNetwork()
{
    layers_.clear();
}

void NeuralNetwork::init_weights(double mean, double variance)
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].init_weights(mean, variance);
    }    
}

size_t NeuralNetwork::number_of_weights() const
{
    vector<double> weights;
    this->save_weights_to(weights);
    return weights.size();
}

void NeuralNetwork::run(const vector< double >& inputs, vector< double >& outputs, vector<double> const &expects)
{
    forward_pass(inputs);
    outputs = layers_.back().get_outputs();
    backward_pass(outputs, expects);
}

void NeuralNetwork::forward_pass(const vector< double >& inputs)
{
    // feed forward
    layers_[0].set_outputs(inputs);
    feed_forward ();
}

void NeuralNetwork::backward_pass(vector< double >& outputs, const vector< double >& expects)
{
    // calculate errors
    last_error_ = 0;
    for (size_t i=0; i < outputs.size(); ++i) {
        last_error_ += error_func(expects[i], outputs[i]);
    }
    
    // calculate partial derivatives
    // first the last layer(output layer)
    vector< double > layer_inputs = layers_[layers_.size()-2].get_outputs();
    layers_.back().calculate_output_layer_pdrv_of_error_wrt_input_weights(layer_inputs, expects);
    
    // back propagate
    for (int i=layers_.size()-2; i > 0; --i) {
        vector< double > layer_inputs = layers_[i-1].get_outputs();
        layers_[i].calculate_middle_layer_pdrv_of_error_wrt_input_weights(layer_inputs, layers_[i+1]);
    }
    
}


void NeuralNetwork::feed_forward()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].forward_pass(layers_[i-1]);
    }    
}


void NeuralNetwork::clear_pdrvs()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].clear_pdrvs();
    }
}

void NeuralNetwork::update_weights(double learning_rate, double momentum)
{
    // update weights
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].update_weights(learning_rate, momentum);
    }
    
}

void NeuralNetwork::save_weights_to(vector< double >& weights) const
{
    weights.clear();
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].save_weights_to (weights);
    }

}

void NeuralNetwork::set_weights(const vector< double >& weights)
{
    int pos = 0;
    for (size_t i=1; i < layers_.size(); ++i) {
        pos += layers_[i].set_input_weights (&weights[pos]);
    }
}

struct Genome
{
    double fitness_;
    vector<double> genes_;
    
    Genome(size_t numGenes=0);
    void copy_genes_from (Genome const&source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate);
    void report () const;
    
    bool operator<(const Genome &rhs) const
    {
        return fitness_ > rhs.fitness_;
    }
};

Genome::Genome(size_t numGenes)
{
    assert (numGenes && "number of genes can't be zero");
    genes_.resize(numGenes);
}


void Genome::copy_genes_from(const Genome& source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate)
{
    for (size_t i=from_index,cgene=0; i < to_index; ++i,++cgene) {
        genes_[replace_start_index+cgene] = source.genes_[i];
    }
}

void Genome::report() const
{
    cout << "weights ";
    for (size_t i=0; i < genes_.size(); ++i) {
        cout << genes_[i] << " ";
    }
}

int main()
{
    // XOR test: two inputs, two outputs
    const int numSamples = 4;
    const int numInputPoints = 2;
    const int numOutputPoints = 2;

    int iSeed = time(NULL)%1000000;
    g_rng.seed(iSeed);
    cout << "use seed " << iSeed << endl;
        
    vector<int> hidden_layers_topology; // hidden layers, from input to output
    hidden_layers_topology.push_back(2);

    NeuralNetwork network(numInputPoints, hidden_layers_topology, numOutputPoints);
    network.init_weights(0, 1.0);
    
    const size_t numGenes = network.number_of_weights();

    Genome entity(numGenes);
    network.save_weights_to(entity.genes_);
    
    vector<double> inputs(numInputPoints);
    vector<double> outputs(numOutputPoints);
    
    vector<double> expects(2);
        
    vector<double> returns(numOutputPoints * numSamples);
        
    network.set_weights(entity.genes_);
    
    const double learning_rate = 0.001;
    const double momentum = 0.9; // not used when update weights using RMSprop.

    while (g_generation < 100000) {
        cout << "generation " << g_generation << "    ";
        ++g_generation;
        
        int rno = 0;
        /*
        inputs[0] = 0.05;
        inputs[1] = 0.1;
        expects[0] = 0.01;
        expects[1] = 0.99;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        */
        
        inputs[0] = 0;
        inputs[1] = 0;
        expects[0] = 1;
        expects[1] = 0;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];

        inputs[0] = 1;
        inputs[1] = 1;
        expects[0] = 1;
        expects[1] = 0;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        
        inputs[0] = 0;
        inputs[1] = 1;
        expects[0] = 0;
        expects[1] = 1;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        
        inputs[0] = 1;
        inputs[1] = 0;
        expects[0] = 0;
        expects[1] = 1;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        
        
        cout << "network error: " << network.error() << "    "; 
        

        network.save_weights_to(entity.genes_);
        entity.report();
        
        cout << "|| results ";
        for (size_t i=0; i < returns.size(); ++i) {
            cout << returns[i] << ", ";
        }
        cout << "\n";
        
        network.update_weights(learning_rate, momentum);
        
        network.clear_pdrvs();

        if (network.error() < network_error_epsilon) {
            cout << "solution found!" << endl;
            return 0;
        }
    }

    cout << "solution not found! adjust learning rate, etc. and try again!" << endl;
    
    return 0;
}


Actually, the only thing I want to emphasize is that you keep accumulating the partial derivatives after running each sample; here every neuron stores them in a vector, pdrvErrorWrtWeights_. For the XOR network there are only 4 input combinations, so you run all the samples and then update the weights. For a more realistic case you would handle this with mini-batches; we will see an implementation in the next article. Be careful to clear the accumulated partial derivatives after each weight update.
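
For illustration, here is a minimal sketch of how the same accumulate-then-update pattern extends to mini-batches, built on the classes above. The function name and the samples/targets containers are hypothetical, not part of the program:

void train_one_epoch(NeuralNetwork &network,
                     const vector<vector<double> > &samples,
                     const vector<vector<double> > &targets,
                     size_t batch_size, double learning_rate, double momentum)
{
    vector<double> outputs;
    for (size_t i = 0; i < samples.size(); ++i) {
        // forward + backward pass; gradients accumulate into each neuron's pdrvErrorWrtWeights_
        network.run(samples[i], outputs, targets[i]);

        // once a full mini-batch (or the leftover tail) has been accumulated, apply the update
        if ((i + 1) % batch_size == 0 || i + 1 == samples.size()) {
            network.update_weights(learning_rate, momentum);
            network.clear_pdrvs(); // clear the accumulated gradients before the next batch
        }
    }
}

Note that, like the XOR loop above, this sums the per-sample gradients instead of averaging them; if you want an average, fold the 1/batch_size factor into the learning rate.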

You can modify the code to try different activation functions and see how they affect performance. Here are some observations. If by chance the initialization puts you near a bad local minimum, you may not find a solution at all; by contrast, when using a GA you seldom fail. And although the sigmoid was the default activation function when neural networks were first proposed, it is seldom used nowadays as the technology has 'evolved'. Just like nature, technology evolves over time. And you should be amazed that Nature ever came up with this neural network system in the first place!
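
For example, switching the whole network to tanh only requires changing the two selector functions, and they must be changed together so that the derivative matches the forward activation (this is just the existing code with a different line uncommented):

double activate_func (double x)
{
    return activate<TANH>(x);
}

double activate_drv_func (double y)
{
    return activate_drv<TANH>(y);
}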

