Saturday, March 3, 2018

Artificial Intelligence II -- gradient descent

Today I'd like to present a C++ example that uses gradient descent and back propagation to find the weights of a neural network.

This assumes you have read the first post and, following my suggestion, have finished (or are working through) the CS231n course, want to implement a network yourself, and would like a reference implementation to compare against.

Please make sure you understand what back propagation is before reading the code below.
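A quick refresher, in my own notation, just to map the math onto the code: for one output neuron with net input z = Σ w_i·x_i + b, output a = f(z) and target t, the loss used here is E = 0.5·(t − a)², so by the chain rule

∂E/∂w_i = ∂E/∂a · ∂a/∂z · ∂z/∂w_i = (a − t) · f′(z) · x_i

which corresponds to error_drv_func(...) * activate_drv_func(...) * input in the code. For a hidden neuron the first factor (a − t) is replaced by the weighted sum of the deltas of the next layer.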


/* 
 * This example uses gradient descent to find a solution to the XOR problem
 */

#include <vector>
#include <deque>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cassert>
#include <ctime>
#include "MesenneTwister.h"

using namespace std;

#define INIT_WEIGHT_SCALE   1.0
#define LEAKY_RELU_CONSTANT 0.005
#define MAX_FITNESS         1000000.0
const double network_error_epsilon = 1e-3;
const double epsilon = 1e-8;

MTRand g_rng(time(NULL));
int g_generation = 0;
    

double error_func (double expect, double actual)
{
    return 0.5 * (expect - actual) * (expect - actual);
}

double error_drv_func(double target, double actual)
{
    // By the chain rule, the derivative of the loss 0.5 * (actual - target)^2
    // with respect to 'actual' is (actual - target); writing the loss as
    // 0.5 * (target - actual)^2 gives (target - actual) * -1, which is the same thing.
    return actual - target;
}

//--------------------------------

template <int ACTIVATE_FUNC_TYPE>
double activate (double x)
{
    return 0;
}

template <int ACTIVATE_FUNC_TYPE>
double activate_drv (double y)
{
    return 0;
}
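
// Note: activate_drv takes the *output* y of the activation function (not the
// input x), so each derivative below is written in terms of y.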

enum {
    SIGMOID,
    TANH,
    RELU
};

template<>
double activate<SIGMOID>(double x)
{
    return 1.0 / (1.0 + exp(-x));
}

template<>
double activate_drv<SIGMOID> (double y)
{
    return y * (1.0 - y);
}

template<>
double activate<TANH>(double x)
{
    return tanh (x);
}

template<>
double activate_drv<TANH> (double y)
{
    return 1.0 - y * y;
}

template<>
double activate<RELU>(double x)
{
    return (x > 0) ? x : LEAKY_RELU_CONSTANT * x;
}

template<>
double activate_drv<RELU>(double y)
{
    return (y > 0) ? 1.0 : LEAKY_RELU_CONSTANT;
}

//--------- try using different activation functions
double activate_func (double x)
{
    return activate<RELU>(x);
//    return activate<TANH>(x);
//    return activate<SIGMOID>(x);
}

double activate_drv_func (double y)
{
    return activate_drv<RELU>(y);
//    return activate_drv<TANH>(y);
//    return activate_drv<SIGMOID>(y);
}

//-----------
class Layer;

class Neuron
{
public:
    Neuron (int numInputs=0);

    void init_weights (double mean, double variance);    
    double calculate_output_from (vector<double> const&inputs);
    void calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double target);
    void calculate_pdrv_of_error_wrt_input_weights(int indexToOutputLayer, vector<double> const&inputs, const Layer &output_layer);
    const vector<double> & weights() const;
    size_t set_input_weights(const double *weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    
public:
    vector<double> inputWeights_;
    double         value_;
    
    vector<double> deltaWeight_;
    vector<double> momentum_;
    double         pdrvErrorWrtInput_;
    vector<double> pdrvErrorWrtWeights_;
};

class Layer
{
public:
    Layer(int numNeurons, int numInputs);
    ~Layer();

    void init_weights (double mean, double variance);
    void forward_pass(const Layer &last_layer);
    void calculate_output_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, vector<double> const&targets);
    void calculate_middle_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, const Layer &output_layer);
    void save_weights_to(vector<double> &weights) const;
    size_t set_input_weights(const double *weights);
    const vector<double> & get_outputs() const;
    void set_outputs(const vector<double> &value);
    size_t num_of_neurons () const;
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    
public:
    vector<Neuron> nodes_;
    vector<double> outputs_;
    
};


Neuron::Neuron(int numInputs)
{
    inputWeights_.resize(numInputs+1); // plus one weight for the bias
}

void Neuron::init_weights(double mean, double variance)
{
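    // draw each weight from a normal distribution and scale it down by
    // sqrt(2 * number of weights); the bias weight (the last entry) starts at 0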
    double sqrtn = sqrt(2.0 * inputWeights_.size());
    for (size_t i=0; i < inputWeights_.size()-1; ++i) {
        inputWeights_[i] = INIT_WEIGHT_SCALE * g_rng.randNorm(mean, variance) / sqrtn;
    }
    inputWeights_.back() = 0;
}

double Neuron::calculate_output_from (const vector< double >& inputs)
{
    assert (inputs.size()+1 == inputWeights_.size() && "number of inputs wrong");
    double v=0;
    for (size_t i=0; i < inputs.size(); ++i) {
        v += inputs[i] * inputWeights_[i];
    }
    v += inputWeights_.back(); // bias
    value_ =  activate_func(v);
    return value_;
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double target)
{
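    // delta of an output neuron: dE/dnet = dE/dout * dout/dnet,
    // i.e. error_drv * activation_drv; then dE/dw_i = delta * input_i is
    // accumulated (+=) so that several samples can share one weight update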
    pdrvErrorWrtInput_ = error_drv_func (target, value_) * activate_drv_func(value_);
    size_t i=0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(int indexToOutputLayer, const vector< double >& inputs, const Layer& output_layer)
{
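    // delta of a hidden neuron: sum, over every neuron in the next layer,
    // the weight connecting this neuron to it times that neuron's delta,
    // then multiply by the derivative of this neuron's activation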
    double d = 0;
    for (size_t i=0; i < output_layer.num_of_neurons(); ++i) {
        d += output_layer.nodes_[i].inputWeights_[indexToOutputLayer] * output_layer.nodes_[i].pdrvErrorWrtInput_;
    }
    
    pdrvErrorWrtInput_ = d * activate_drv_func(value_);
    size_t i=0;
    for (; i< pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}


const vector< double >& Neuron::weights() const
{
    return inputWeights_;
}

size_t Neuron::set_input_weights(const double* weights)
{
    inputWeights_.assign(weights, weights + inputWeights_.size());
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
    deltaWeight_.resize(inputWeights_.size());
    momentum_.resize(inputWeights_.size());
    return inputWeights_.size();
}

void Neuron::clear_pdrvs()
{
    pdrvErrorWrtWeights_.clear();
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
}

void Neuron::update_weights(double learning_rate, double momentum)
{
    for (size_t i=0; i < inputWeights_.size(); ++i) {
        double dx = pdrvErrorWrtWeights_[i];
        /* Nesterov momentum
        double deltaWeightKeep = deltaWeight_[i];
        deltaWeight_[i] = momentum * deltaWeight_[i] - learning_rate * dx;
        inputWeights_[i] += -momentum * deltaWeightKeep + (1 + momentum) * deltaWeight_[i];
        //*/
        //* RMSprop
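        // deltaWeight_ doubles here as the RMSprop running average of squared gradients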
        double decay_rate = 0.99;
        deltaWeight_[i] = decay_rate * deltaWeight_[i] + (1.0 - decay_rate) * dx * dx;
        inputWeights_[i] += -learning_rate * dx / (sqrt(deltaWeight_[i]) + epsilon);
        //*/
    }
}



Layer::Layer(int numNeurons=0, int numInputsPerNeuron=0)
{
    nodes_.resize(numNeurons);
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i] = Neuron(numInputsPerNeuron);
    }
    outputs_.resize(numNeurons);
}

Layer::~Layer()
{
    nodes_.clear();
}

void Layer::init_weights(double mean, double variance)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].init_weights (mean, variance);
    }
}

void Layer::forward_pass(const Layer &last_layer)
{
    const vector<double> & inputs = last_layer.get_outputs();
    for (size_t i=0; i < nodes_.size(); ++i) {
        outputs_[i] = nodes_[i].calculate_output_from (inputs);
    }
}

void Layer::calculate_output_layer_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, const vector< double >& targets)
{
    assert (targets.size() == nodes_.size());
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (inputs, targets[i]);
    }
}

void Layer::calculate_middle_layer_pdrv_of_error_wrt_input_weights(const vector< double >& inputs, const Layer& output_layer)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (i, inputs, output_layer);
    }
}


void Layer::save_weights_to(vector< double >& weights) const
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        const vector<double> &w = nodes_[i].weights();
        weights.insert(weights.end(), w.begin(), w.end());
    }

}

size_t Layer::set_input_weights(const double* weights)
{
    size_t pos = 0;
    for (size_t i=0; i < nodes_.size(); ++i) {
        pos += nodes_[i].set_input_weights(weights + pos);
    }
    return pos;
}

void Layer::set_outputs(const vector< double >& value)
{
    outputs_ = value;
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].value_ = outputs_[i];
    }
}

const vector< double >& Layer::get_outputs() const
{
    return outputs_;
}


size_t Layer::num_of_neurons() const
{
    return nodes_.size();
}

void Layer::clear_pdrvs()
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].clear_pdrvs();
    }
}

void Layer::update_weights(double learning_rate, double momentum)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].update_weights(learning_rate, momentum);
    }
}



class NeuralNetwork
{
public:
    NeuralNetwork(int numInputs, vector<int> const& hiddenLayerTopology, int numOutputs);
    ~NeuralNetwork();
    void init_weights(double mean_value, double variance_value);
    size_t number_of_weights () const;
    void run (vector<double> const &inputs, vector<double> &outputs, vector<double> const &expects);
    void forward_pass (vector<double> const &inputs);
    void backward_pass (vector<double> &outputs, vector<double> const &expects);
    void save_weights_to(vector<double> &weights) const;
    void set_weights(vector<double> const &weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    double error () const { return last_error_; }
    
protected:
    void feed_forward ();
    
private:
    vector<Layer> layers_;
    double last_error_;
    NeuralNetwork(const NeuralNetwork &rhs) {}
    NeuralNetwork &operator=(const NeuralNetwork &rhs) { return *this; }
};

NeuralNetwork::NeuralNetwork(int nInputs, const vector< int >& hiddenLayerTopology, int nOutputs)
{
    layers_.resize(hiddenLayerTopology.size() + 2); // hidden + inputs + outputs
    int nNeuron = nInputs;
    layers_[0] = Layer(nNeuron, 0); // layer zero is inputs
    for (size_t i=0; i < hiddenLayerTopology.size(); ++i) {
        layers_[i+1] = Layer(hiddenLayerTopology[i], layers_[i].num_of_neurons());
    }
    nNeuron = nOutputs;
    layers_.back() = Layer(nNeuron, hiddenLayerTopology.back());
}

NeuralNetwork::~NeuralNetwork()
{
    layers_.clear();
}

void NeuralNetwork::init_weights(double mean, double variance)
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].init_weights(mean, variance);
    }    
}

size_t NeuralNetwork::number_of_weights() const
{
    vector<double> weights;
    this->save_weights_to(weights);
    return weights.size();
}

void NeuralNetwork::run(const vector< double >& inputs, vector< double >& outputs, vector<double> const &expects)
{
    forward_pass(inputs);
    outputs = layers_.back().get_outputs();
    backward_pass(outputs, expects);
}

void NeuralNetwork::forward_pass(const vector< double >& inputs)
{
    // feed forward
    layers_[0].set_outputs(inputs);
    feed_forward ();
}

void NeuralNetwork::backward_pass(vector< double >& outputs, const vector< double >& expects)
{
    // calculate errors
    last_error_ = 0;
    for (size_t i=0; i < outputs.size(); ++i) {
        last_error_ += error_func(expects[i], outputs[i]);
    }
    
    // calculate partial derivatives
    // first the last layer(output layer)
    vector< double > layer_inputs = layers_[layers_.size()-2].get_outputs();
    layers_.back().calculate_output_layer_pdrv_of_error_wrt_input_weights(layer_inputs, expects);
    
    // back propagate
    for (int i=layers_.size()-2; i > 0; --i) {
        vector< double > layer_inputs = layers_[i-1].get_outputs();
        layers_[i].calculate_middle_layer_pdrv_of_error_wrt_input_weights(layer_inputs, layers_[i+1]);
    }
    
}


void NeuralNetwork::feed_forward()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].forward_pass(layers_[i-1]);
    }    
}


void NeuralNetwork::clear_pdrvs()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].clear_pdrvs();
    }
}

void NeuralNetwork::update_weights(double learning_rate, double momentum)
{
    // update weights
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].update_weights(learning_rate, momentum);
    }
    
}

void NeuralNetwork::save_weights_to(vector< double >& weights) const
{
    weights.clear();
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i].save_weights_to (weights);
    }

}

void NeuralNetwork::set_weights(const vector< double >& weights)
{
    int pos = 0;
    for (size_t i=1; i < layers_.size(); ++i) {
        pos += layers_[i].set_input_weights (&weights[pos]);
    }
}

struct Genome
{
    double fitness_;
    vector<double> genes_;
    
    Genome(size_t numGenes=0);
    void copy_genes_from (Genome const&source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate);
    void report () const;
    
    bool operator<(const Genome &rhs) const
    {
        return fitness_ > rhs.fitness_;
    }
};

Genome::Genome(size_t numGenes)
{
    assert (numGenes && "number of genes can't be zero");
    genes_.resize(numGenes);
}


void Genome::copy_genes_from(const Genome& source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate)
{
    for (size_t i=from_index,cgene=0; i < to_index; ++i,++cgene) {
        genes_[replace_start_index+cgene] = source.genes_[i];
    }
}

void Genome::report() const
{
    cout << "weights ";
    for (size_t i=0; i < genes_.size(); ++i) {
        cout << genes_[i] << " ";
    }
}

int main()
{
    // XOR test: two inputs, two outputs
    const int numSamples = 4;
    const int numInputPoints = 2;
    const int numOutputPoints = 2;

    int iSeed = time(NULL)%1000000;
    g_rng.seed(iSeed);
    cout << "use seed " << iSeed << endl;
        
    vector<int> hidden_layers_topology; // hidden layers, from input to output
    hidden_layers_topology.push_back(2);

    NeuralNetwork network(numInputPoints, hidden_layers_topology, numOutputPoints);
    network.init_weights(0, 1.0);
    
    const size_t numGenes = network.number_of_weights();

    Genome entity(numGenes);
    network.save_weights_to(entity.genes_);
    
    vector<double> inputs(numInputPoints);
    vector<double> outputs(numOutputPoints);
    
    vector<double> expects(2);
        
    vector<double> returns(numOutputPoints * numSamples);
        
    network.set_weights(entity.genes_);
    
    const double learning_rate = 0.001;
    const double momentum = 0.9; // not used when updating weights with RMSprop.

    while (g_generation < 100000) {
        cout << "generation " << g_generation << "    ";
        ++g_generation;
        
        int rno = 0;
        /*
        inputs[0] = 0.05;
        inputs[1] = 0.1;
        expects[0] = 0.01;
        expects[1] = 0.99;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        */
        
        inputs[0] = 0;
        inputs[1] = 0;
        expects[0] = 1;
        expects[1] = 0;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];

        inputs[0] = 1;
        inputs[1] = 1;
        expects[0] = 1;
        expects[1] = 0;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        
        inputs[0] = 0;
        inputs[1] = 1;
        expects[0] = 0;
        expects[1] = 1;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        
        inputs[0] = 1;
        inputs[1] = 0;
        expects[0] = 0;
        expects[1] = 1;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        
        
        cout << "network error: " << network.error() << "    "; 
        

        network.save_weights_to(entity.genes_);
        entity.report();
        
        cout << "|| results ";
        for (size_t i=0; i < returns.size(); ++i) {
            cout << returns[i] << ", ";
        }
        cout << "\n";
        
        network.update_weights(learning_rate, momentum);
        
        network.clear_pdrvs();
        
        if (network.error() < network_error_epsilon) {
            cout << "solution found!" << endl;
            return 0;
        }
    }

    cout << "solution not found! adjust learning rate, etc. and try again!" << endl;
    
    return 0;
}


The only real implementation detail to watch for is that after running each sample we accumulate the partial derivatives coming back from back propagation, and only at the end call update_weights to update the weights. We can do this here because XOR has only four possible inputs; normally you would process the data in mini-batches instead. Remember to clear the accumulated partial derivatives after updating the weights, before running the next sample.
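For reference, here is a rough sketch of what a mini-batch version of the training loop could look like with the classes above. Sample, training_set, batch_size, and train_one_epoch are names I made up for illustration; they are not part of the example program.

struct Sample
{
    vector<double> inputs;
    vector<double> expects;
};

void train_one_epoch(NeuralNetwork &network, const vector<Sample> &training_set,
                     size_t batch_size, double learning_rate, double momentum)
{
    vector<double> outputs;
    size_t i = 0;
    while (i < training_set.size()) {
        // run one mini-batch; the partial derivatives of each sample are
        // accumulated (+=) inside the neurons by the backward pass
        for (size_t j = 0; j < batch_size && i < training_set.size(); ++j, ++i) {
            network.run(training_set[i].inputs, outputs, training_set[i].expects);
        }
        // one weight update per mini-batch, then clear the accumulated
        // derivatives before the next batch
        network.update_weights(learning_rate, momentum);
        network.clear_pdrvs();
    }
}

In this sense the XOR loop in main() is simply one "batch" of all four samples per generation.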

You can try switching the activation function and see what happens. One observation: with gradient descent, depending on how the weights are initialized, the network can unluckily get stuck in a local minimum and sometimes never reaches a solution at all... By comparison, the GA approach rarely fails. XD Although the sigmoid function was arguably the default activation function when these neural networks were born, once we had the basic framework in place, the technique (neural networks) gradually evolved down its own path... How did nature ever evolve this kind of solution? It's truly astonishing!


