As always, please follow the advice in the first article (http://zen747.blogspot.tw/2018/03/i.html) for learning machine learning, so that you don't waste your time.
/*
 * This sample uses gradient descent to classify a spiral dataset
 * http://cs231n.github.io/neural-networks-case-study/
 */
#include <vector>
#include <deque>
#include <algorithm>
#include <iostream>
#include <string>
#include <cmath>
#include <cassert>
#include <ctime>
#include <cfloat>
#include "MesenneTwister.h"

#define USE_SDL 1

#if USE_SDL
#include <SDL2/SDL.h>
#include <SDL2/SDL_image.h>
#include <SDL2/SDL_ttf.h>
#endif

using namespace std;

// The regularization scales the weights toward 0
#define REGULARIZATION_FACTOR 1e-2
#define DROPOUT_RATE 0.25
#define INPUT_DROPOUT_RATE 0.001
#define MAX_NORM 10
#define LEARNING_RATE 0.02
#define BATCH_SIZE 20
#define LEAKY_RELU_CONSTANT 0.005

const double epsilon = 1e-8;

MTRand g_rng(time(NULL));

// used in adam algorithm
int g_generation = 0;

//--------------------------------

template <int ACTIVATE_FUNC_TYPE>
double activate (double x) { return 0; }

template <int ACTIVATE_FUNC_TYPE>
double activate_drv (double y) { return 0; }

enum { SIGMOID, TANH, RELU };

template<> double activate<SIGMOID>(double x) { return 1.0 / (1.0 + exp(-x)); }
template<> double activate_drv<SIGMOID> (double y) { return y * (1.0 - y); }

template<> double activate<TANH>(double x) { return tanh (x); }
template<> double activate_drv<TANH> (double y) { return 1.0 - y * y; }

template<> double activate<RELU>(double x) { return (x > 0) ? x : LEAKY_RELU_CONSTANT * x; }
template<> double activate_drv<RELU>(double y) { return (y > 0) ? 1.0 : LEAKY_RELU_CONSTANT; }

//---------
double activate_func (double x) { return activate<RELU>(x); }
double activate_drv_func (double y) { return activate_drv<RELU>(y); }

//-----------
class Layer;

struct NetworkConfig {
    bool training_;
    NetworkConfig() : training_(false) {}
};

class Neuron {
public:
    Neuron (int numInputs=0);
    void init_weights (double mean, double variance, double magnitude);
    double calculate_output_from (vector<double> const&inputs);
    void calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double drvLoss);
    void calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, const Layer &output_layer);
    const vector<double> & weights() const;
    size_t set_input_weights(const double *weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum, bool reguon);

public:
    int idx_;
    vector<double> inputWeights_;
    double value_;
    bool droped_out_;
    vector<double> deltaWeight_;
    vector<double> momentum_;
    double pdrvErrorWrtInput_;
    vector<double> pdrvErrorWrtWeights_;
};

class Layer {
public:
    Layer(NetworkConfig *netconfig, int numNeurons, int numInputs);
    ~Layer();
    void init_weights (double mean, double variance, double magnitude);
    void forward_pass(const Layer &last_layer);
    void calculate_output_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, vector<double> const&outputsExp, double totalExp, int right_ans);
    void calculate_middle_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, const Layer &output_layer);
    void save_weights_to(vector<double> &weights) const;
    size_t set_input_weights(const double *weights);
    const vector<double> & get_outputs() const;
    void set_outputs_of_input_layer(const vector<double> &value);
    size_t num_of_neurons () const;
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum, bool reguon);
    void mark_dropout (double dropout_rate);
    void mark_no_dropout ();

public:
    vector<Neuron> nodes_;
    vector<double> outputs_;
    NetworkConfig *netconfig_;
    bool output_layer_;
};

Neuron::Neuron(int numInputs)
: droped_out_(false)
{
    inputWeights_.resize(numInputs+1); // plus weight for bias
}

void Neuron::init_weights(double mean, double variance, double magnitude)
{
    // This initialization technique helps a lot
    double sqrtn = sqrt(2.0 * inputWeights_.size());
    for (size_t i=0; i < inputWeights_.size()-1; ++i) {
        inputWeights_[i] = magnitude * g_rng.randNorm(mean, variance) / sqrtn;
    }
    inputWeights_.back() = 0;
}

double Neuron::calculate_output_from (const vector< double >& inputs)
{
    assert (inputs.size()+1 == inputWeights_.size() && "number of inputs wrong");
    if (droped_out_) {
        value_ = 0;
        return 0;
    }
    double v=0;
    for (size_t i=0; i < inputs.size(); ++i) {
        v += inputs[i] * inputWeights_[i];
    }
    v += inputWeights_.back(); // bias
    value_ = activate_func(v);
    return value_;
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double drvLoss)
{
    pdrvErrorWrtInput_ = drvLoss * activate_drv_func(value_);
    size_t i=0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(const vector< double >& inputs, const Layer& output_layer)
{
    if (droped_out_) {
        pdrvErrorWrtInput_ = 0;
        return;
    }
    double d = 0;
    for (size_t i=0; i < output_layer.num_of_neurons(); ++i) {
        d += output_layer.nodes_[i].inputWeights_[idx_] * output_layer.nodes_[i].pdrvErrorWrtInput_;
    }
    pdrvErrorWrtInput_ = d * activate_drv_func(value_);
    size_t i=0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}

const vector< double >& Neuron::weights() const
{
    return inputWeights_;
}

size_t Neuron::set_input_weights(const double* weights)
{
    inputWeights_.assign(weights, weights + inputWeights_.size());
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
    deltaWeight_.resize(inputWeights_.size());
    momentum_.resize(inputWeights_.size());
    return inputWeights_.size();
}

void Neuron::clear_pdrvs()
{
    pdrvErrorWrtWeights_.clear();
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
}

void Neuron::update_weights(double learning_rate, double momentum, bool reguon)
{
    double weight_norm = 0;
    for (size_t i=0; i < inputWeights_.size(); ++i) {
        double dx = pdrvErrorWrtWeights_[i];
        double regularization = 0;
        if (reguon) {
            regularization = learning_rate * REGULARIZATION_FACTOR * inputWeights_[i];
        }
        /* RMSprop
        double decay_rate = 0.99;
        deltaWeight_[i] = decay_rate * deltaWeight_[i] + (1.0 - decay_rate) * dx * dx;
        inputWeights_[i] += -learning_rate * dx / (sqrt(deltaWeight_[i]) + epsilon) - regularization;
        //*/
        //* Adam
        double beta1=0.9;
        double beta2=0.999;
        momentum_[i] = beta1*momentum_[i] + (1.0 - beta1) * dx;
        double mt = momentum_[i] / (1.0 - pow(beta1, g_generation));
        //double mt = momentum_[i];
        deltaWeight_[i] = beta2 * deltaWeight_[i] + (1.0 - beta2) * dx * dx;
        double vt = deltaWeight_[i] / (1.0 - pow(beta2, g_generation));
        //double vt = deltaWeight_[i];
        inputWeights_[i] += -learning_rate * mt / (sqrt(vt) + epsilon) - regularization;
        //*/
        weight_norm += inputWeights_[i] * inputWeights_[i];
    }
    weight_norm = sqrt(weight_norm);
    if (weight_norm > MAX_NORM) {
        double r = MAX_NORM / weight_norm;
        for (size_t i=0; i < inputWeights_.size(); ++i) {
            inputWeights_[i] *= r;
        }
    }
}

Layer::Layer(NetworkConfig *netconfig, int numNeurons=0, int numInputsPerNeuron=0)
: netconfig_(netconfig)
, output_layer_(false)
{
    nodes_.resize(numNeurons);
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i] = Neuron(numInputsPerNeuron);
        nodes_[i].idx_ = i;
    }
    outputs_.resize(numNeurons);
}

Layer::~Layer()
{
    nodes_.clear();
}

void Layer::init_weights(double mean, double variance, double magnitude)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].init_weights (mean, variance, magnitude);
    }
}

void Layer::forward_pass(const Layer &last_layer)
{
    double rate = 1.0 - DROPOUT_RATE;
    const vector<double> & inputs = last_layer.get_outputs();
    for (size_t i=0; i < nodes_.size(); ++i) {
        if (nodes_[i].droped_out_) {
            outputs_[i] = nodes_[i].value_ = 0;
        } else {
            nodes_[i].calculate_output_from (inputs);
            if (!output_layer_ && !netconfig_->training_) {
                nodes_[i].value_ *= rate;
            }
            outputs_[i] = nodes_[i].value_;
        }
    }
}

void Layer::calculate_output_layer_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, vector<double> const&outputsExp, double totalExp, int right_ans)
{
    assert (outputsExp.size() == nodes_.size());
    for (size_t i=0; i < nodes_.size(); ++i) {
        double drvLoss;
        if (i == right_ans) {
            drvLoss = outputsExp[i] / totalExp - 1.0;
        } else {
            drvLoss = outputsExp[i] / totalExp;
        }
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (inputs, drvLoss);
    }
}

void Layer::calculate_middle_layer_pdrv_of_error_wrt_input_weights(const vector< double >& inputs, const Layer& output_layer)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (inputs, output_layer);
    }
}

void Layer::save_weights_to(vector< double >& weights) const
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        const vector<double> &w = nodes_[i].weights();
        weights.insert(weights.end(), w.begin(), w.end());
    }
}

size_t Layer::set_input_weights(const double* weights)
{
    size_t pos = 0;
    for (size_t i=0; i < nodes_.size(); ++i) {
        pos += nodes_[i].set_input_weights(weights + pos);
    }
    return pos;
}

void Layer::set_outputs_of_input_layer(const vector< double >& value)
{
    outputs_ = value;
    for (size_t i=0; i < nodes_.size(); ++i) {
        if (netconfig_->training_ && nodes_[i].droped_out_) {
            nodes_[i].value_ = outputs_[i] = 0;
        } else {
            nodes_[i].value_ = outputs_[i];
        }
    }
}

const vector< double >& Layer::get_outputs() const
{
    return outputs_;
}

size_t Layer::num_of_neurons() const
{
    return nodes_.size();
}

void Layer::clear_pdrvs()
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].clear_pdrvs();
    }
}

void Layer::update_weights(double learning_rate, double momentum, bool reguon)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].update_weights(learning_rate, momentum, reguon);
    }
}

void Layer::mark_dropout(double dropout_rate)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].droped_out_ = (g_rng.rand() < dropout_rate);
    }
}

void Layer::mark_no_dropout()
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].droped_out_ = false;
    }
}

class NeuralNetwork {
public:
    NeuralNetwork(int numInputs, vector<int> const& hiddenLayerTopology, int numOutputs);
    ~NeuralNetwork();
    void init_weights(double mean_value, double variance_value, double magnitude);
    size_t number_of_weights () const;
    void run (vector<double> const &inputs, vector<double> &outputs);
    void train (vector<double> const &inputs, vector<double> &outputs);
    void forward_pass (vector<double> const &inputs);
    void backward_pass (vector<double> &outputs, int right_ans);
    void save_weights_to(vector<double> &weights) const;
    void set_weights(vector<double> const &weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    double error () const { return last_error_; }
    bool regularization_on () const { return regularization_on_; }
    void set_regularization_on () { regularization_on_ = true; }

protected:
    void feed_forward ();
    void mark_dropout ();
    void mark_no_dropout ();

private:
    vector<Layer *> layers_;
    double last_error_;
    NetworkConfig *netconfig_;
    bool regularization_on_;

    NeuralNetwork(const NeuralNetwork &rhs) {}
    NeuralNetwork &operator=(const NeuralNetwork &rhs) { return *this; }
};

NeuralNetwork::NeuralNetwork(int nInputs, const vector< int >& hiddenLayerTopology, int nOutputs)
: last_error_(0)
, netconfig_(new NetworkConfig)
, regularization_on_(false)
{
    layers_.resize(hiddenLayerTopology.size() + 2); // hidden + inputs + outputs
    int nNeuron = nInputs;
    layers_[0] = new Layer(netconfig_, nNeuron, 0); // layer zero is inputs
    for (size_t i=0; i < hiddenLayerTopology.size(); ++i) {
        layers_[i+1] = new Layer(netconfig_, hiddenLayerTopology[i], layers_[i]->num_of_neurons());
    }
    nNeuron = nOutputs;
    layers_.back() = new Layer(netconfig_, nNeuron, hiddenLayerTopology.back());
    layers_.back()->output_layer_ = true; // mark it so dropout scaling is not applied to the outputs
}

NeuralNetwork::~NeuralNetwork()
{
    delete netconfig_;
    for (size_t i=0; i < layers_.size(); ++i) {
        delete layers_[i];
    }
    layers_.clear();
}

void NeuralNetwork::init_weights(double mean, double variance, double magnitude)
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->init_weights(mean, variance, magnitude);
    }
}

size_t NeuralNetwork::number_of_weights() const
{
    vector<double> weights;
    this->save_weights_to(weights);
    return weights.size();
}

void NeuralNetwork::run(const vector< double >& inputs, vector< double >& outputs)
{
    if (netconfig_->training_) {
        netconfig_->training_ = false;
        mark_no_dropout ();
    }
    forward_pass(inputs);
    outputs = layers_.back()->get_outputs();
}

void NeuralNetwork::train(const vector< double >& inputs, vector< double >& outputs)
{
    if (!netconfig_->training_) {
        netconfig_->training_ = true;
        mark_dropout ();
    }
    forward_pass(inputs);
    outputs = layers_.back()->get_outputs();
}

void NeuralNetwork::forward_pass(const vector< double >& inputs)
{
    // feed forward
    // set inputs as outputs of first layer
    layers_[0]->set_outputs_of_input_layer(inputs);
    feed_forward ();
}

void NeuralNetwork::backward_pass(vector< double >& outputs, int right_ans)
{
    // calculate errors/loss
    double denom = 0;
    double numer = 0;
    last_error_ = 0;
    double max_out=-DBL_MAX;
    for (size_t i=0; i < outputs.size(); ++i) {
        if (outputs[i] > max_out) {
            max_out = outputs[i];
        }
    }
    vector<double> outputsExp;
    outputsExp.reserve(outputs.size());
    double totalExp = 0;
    for (size_t i=0; i < outputs.size(); ++i) {
        outputsExp.push_back(exp(outputs[i] - max_out));
        if (i == right_ans) {
            numer = outputsExp.back();
        }
        denom += outputsExp.back();
    }
    totalExp = denom;
    last_error_ = -log(numer / denom);
    // calculate partial derivatives
    // first the last layer (output layer)
    vector< double > layer_inputs = layers_[layers_.size()-2]->get_outputs();
    layers_.back()->calculate_output_layer_pdrv_of_error_wrt_input_weights(layer_inputs, outputsExp, totalExp, right_ans);
    // back propagate
    for (int i=layers_.size()-2; i > 0; --i) {
        vector< double > layer_inputs = layers_[i-1]->get_outputs();
        layers_[i]->calculate_middle_layer_pdrv_of_error_wrt_input_weights(layer_inputs, *layers_[i+1]);
    }
}

void NeuralNetwork::feed_forward()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->forward_pass(*layers_[i-1]);
    }
}

void NeuralNetwork::mark_dropout()
{
    for (size_t i=0; i < layers_.size()-1; ++i) {
        double r = DROPOUT_RATE;
        if (i==0) r = INPUT_DROPOUT_RATE;
        layers_[i]->mark_dropout(r);
    }
}

void NeuralNetwork::mark_no_dropout()
{
    for (size_t i=0; i < layers_.size()-1; ++i) {
        layers_[i]->mark_no_dropout();
    }
}

void NeuralNetwork::clear_pdrvs()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->clear_pdrvs();
    }
}

void NeuralNetwork::update_weights(double learning_rate, double momentum)
{
    assert (netconfig_->training_ && "why update weights when not training?");
    // update weights
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->update_weights(learning_rate, momentum, regularization_on_);
    }
    mark_dropout();
    clear_pdrvs();
}

void NeuralNetwork::save_weights_to(vector< double >& weights) const
{
    weights.clear();
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->save_weights_to (weights);
    }
}

void NeuralNetwork::set_weights(const vector< double >& weights)
{
    int pos = 0;
    for (size_t i=1; i < layers_.size(); ++i) {
        pos += layers_[i]->set_input_weights (&weights[pos]);
    }
}

struct Genome {
    double fitness_;
    vector<double> genes_;
    Genome(size_t numGenes=0);
    void copy_genes_from (Genome const&source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate);
    void report () const;
    bool operator<(const Genome &rhs) const {
        return fitness_ > rhs.fitness_;
    }
};

Genome::Genome(size_t numGenes)
{
    assert (numGenes && "number of genes can't be zero");
    genes_.resize(numGenes);
}

void Genome::copy_genes_from(const Genome& source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate)
{
    for (size_t i=from_index,cgene=0; i < to_index; ++i,++cgene) {
        genes_[replace_start_index+cgene] = source.genes_[i];
    }
}

void Genome::report() const
{
    cout << "weights ";
    for (size_t i=0; i < genes_.size(); ++i) {
        cout << genes_[i] << " ";
    }
}

struct Point {
    double x_, y_;
    int class_;
};

const int nClass = 3;

struct DataSet {
    vector<Point> samples_;
    void randomize () {
        vector<Point> sam;
        while (!samples_.empty()) {
            int idx = g_rng.randInt() % samples_.size();
            sam.push_back(samples_[idx]);
            samples_[idx] = samples_.back();
            samples_.pop_back();
        }
        sam.swap(samples_);
    }
};

#if USE_SDL

#define RENDER_WINDOW_WIDTH 600
#define RENDER_WINDOW_HEIGHT 600

struct Color {
    Uint8 r_, g_, b_;
    Color(Uint8 r, Uint8 g, Uint8 b)
    : r_(r), g_(g), b_(b)
    {}
};

class Renderer {
    SDL_Window *win_;
    SDL_Renderer *ren_;
    SDL_Texture *tex_;

    int init_sdl () {
        if (SDL_Init(SDL_INIT_VIDEO) != 0){
            std::cout << "SDL_Init Error: " << SDL_GetError() << std::endl;
            return 1;
        }
        win_ = SDL_CreateWindow("Hello World!", 100, 100, RENDER_WINDOW_WIDTH, RENDER_WINDOW_HEIGHT, SDL_WINDOW_SHOWN);
        if (win_ == 0){
            std::cout << "SDL_CreateWindow Error: " << SDL_GetError() << std::endl;
            SDL_Quit();
            return 1;
        }
        ren_ = SDL_CreateRenderer(win_, -1, SDL_RENDERER_ACCELERATED | SDL_RENDERER_PRESENTVSYNC);
        if (ren_ == 0){
            SDL_DestroyWindow(win_);
            std::cout << "SDL_CreateRenderer Error: " << SDL_GetError() << std::endl;
            SDL_Quit();
            return 1;
        }
        std::string imagePath = "circle.png";
        tex_ = IMG_LoadTexture(ren_, imagePath.c_str());
        if (tex_ == 0) {
            SDL_DestroyRenderer(ren_);
            SDL_DestroyWindow(win_);
            std::cout << "SDL_CreateTextureFromSurface Error: " << SDL_GetError() << std::endl;
            SDL_Quit();
            return 1;
        }
        return 0;
    }

public:
    Renderer ()
    : win_(0), ren_(0), tex_(0)
    {}

    ~Renderer () {
        if (tex_) SDL_DestroyTexture(tex_);
        if (ren_) SDL_DestroyRenderer(ren_);
        if (win_) SDL_DestroyWindow(win_);
        SDL_Quit();
    }

    int start () {
        return init_sdl ();
    }

    int render (DataSet const&sData) {
        SDL_SetRenderDrawColor(ren_, 255, 255, 255, 255);
        SDL_RenderClear(ren_);
        SDL_Rect dst;
        dst.w = dst.h = 16;
        int offX = dst.w/2, offY=dst.h/2;
        vector<Color> cvClass;
        cvClass.push_back(Color(255,0,0));
        cvClass.push_back(Color(255,255,0));
        cvClass.push_back(Color(0,255,0));
        cvClass.push_back(Color(0,0,255));
        for (size_t i=0; i < sData.samples_.size(); ++i) {
            const Color &color = cvClass[sData.samples_[i].class_];
            SDL_SetTextureColorMod(tex_, color.r_, color.g_, color.b_);
            dst.x = RENDER_WINDOW_WIDTH * (0.5 + sData.samples_[i].x_) - offX;
            dst.y = RENDER_WINDOW_HEIGHT * (0.5 + sData.samples_[i].y_) - offY;
            SDL_RenderCopy(ren_, tex_, NULL, &dst);
        }
        //Update the screen
        SDL_RenderPresent(ren_);
        //SDL_Delay(3000);
        return 0;
    }
};
#endif

void generate_data_set (DataSet &sData, int nPointsPerClass)
{
    int nC = nClass + 1;
    for (int iC=0; iC < nClass; ++iC) {
        for (int iP=0; iP < nPointsPerClass; ++iP) {
            Point p;
            p.class_ = iC;
            double radius = 0.001 + double(iP) / nPointsPerClass;
            double angle = nC * iC + 1.0 * nC * radius + g_rng.randNorm() * 0.2;
            p.x_ = 0.5 * radius * sin(angle);
            p.y_ = 0.5 * radius * cos(angle);
            sData.samples_.push_back(p);
        }
    }
}

bool answer_correct (vector<double> const&outputs, int right_ans)
{
    double max_term=-DBL_MAX;
    int ans = -1;
    for (size_t i=0; i < outputs.size(); ++i) {
        if (outputs[i] > max_term) {
            max_term = outputs[i];
            ans = i;
        }
    }
    return ans == right_ans;
}

const int numInputPoints = 2;
const int numOutputPoints = 4;

int test_check (NeuralNetwork &network)
{
    vector<double> inputs(numInputPoints);
    vector<double> outputs(numOutputPoints);
    cout << "test... ";
    DataSet sDataTest;
    generate_data_set (sDataTest, 100);
    int correct_answer = 0;
    for (size_t i=0; i < sDataTest.samples_.size(); ++i) {
        inputs.clear();
        inputs.push_back(sDataTest.samples_[i].x_);
        inputs.push_back(sDataTest.samples_[i].y_);
        network.run(inputs, outputs);
        //expects.clear();
        //load_expects (expects, sDataTest.samples_[i].class_);
        //network.backward_pass(outputs, sDataTest.samples_[i].class_);
        if (answer_correct(outputs, sDataTest.samples_[i].class_)) {
            ++correct_answer;
        } else {
            //cout << i << " at (" << sDataTest.samples_[i].x_ << "," << sDataTest.samples_[i].x_ << ") ";
            //cout << "value " << outputs[0] << "," << outputs[1] << "," << outputs[2] << " vs " << sDataTest.samples_[i].class_ << "\n";
        }
    }
    cout << "precision: " << (100.0 * double(correct_answer) / sDataTest.samples_.size()) << "%" << endl;
    return correct_answer;
}

int main()
{
    int iSeed = time(NULL)%1000000;
    //iSeed = 7;
    g_rng.seed(iSeed);
    cout << "use seed " << iSeed << endl;

    DataSet sData;
    generate_data_set (sData, 100);
    sData.randomize();

#if USE_SDL
    Renderer renderer;
    if (renderer.start() != 0) {
        cout << "SDL2 initialization failed." << endl;
        return 1;
    }
    renderer.render(sData);
#endif

    vector<int> hidden_layers_topology; // hidden layers, from input to output
    hidden_layers_topology.push_back(160);
    /*
    hidden_layers_topology.push_back(30);
    hidden_layers_topology.push_back(30);
    //*/

    NeuralNetwork network(numInputPoints, hidden_layers_topology, numOutputPoints);
    network.init_weights(0, 1.0, 1.0);
    const size_t numGenes = network.number_of_weights();
    Genome entity(numGenes);
    network.save_weights_to(entity.genes_);

    vector<double> inputs(numInputPoints);
    vector<double> outputs(numOutputPoints);
    network.set_weights(entity.genes_);

    const double learning_rate = LEARNING_RATE;
    const double momentum = 0.9;
    const int batch_size = BATCH_SIZE;

    while (g_generation < 500) {
        // cout << "generation " << g_generation << " ";
        ++g_generation;
        int run_count = 0;
        double max_error = 0;
        int correct_answer = 0;
        for (size_t i=0; i < sData.samples_.size(); ++i) {
            inputs.clear();
            inputs.push_back(sData.samples_[i].x_);
            inputs.push_back(sData.samples_[i].y_);
            network.train(inputs, outputs);
            if (answer_correct(outputs, sData.samples_[i].class_)) {
                ++correct_answer;
            }
            network.backward_pass(outputs, sData.samples_[i].class_);
            max_error = network.error() > max_error ? network.error() : max_error;
            ++run_count;
            if (run_count >= batch_size) {
                network.update_weights(learning_rate, momentum);
                run_count = 0;
            }
        }
        if (run_count != 0) {
            network.update_weights(learning_rate, momentum);
            run_count = 0;
        }
        if (g_generation % 10 == 0) {
            int correct = test_check(network);
            if (!network.regularization_on () && double(correct_answer - correct) / correct_answer > 0.05) {
                network.set_regularization_on ();
                cout << "\n\t\t**** regularization on ****\n" << endl;
            }
        }
    }
    test_check(network);

    network.save_weights_to(entity.genes_);
    double weights = 0;
    for (size_t i=0; i < entity.genes_.size(); ++i) {
        weights += fabs(entity.genes_[i]);
    }
    cout << "(" << entity.genes_.size() << ") weights\n";
    cout << "|totalWeights| = " << weights << "\n";
    entity.report();
    cout << "\n";
    return 0;
}
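A note on the loss used above: backward_pass computes the standard softmax cross-entropy. With raw outputs o_j and correct class y, the loss is L = -log( exp(o_y) / sum_j exp(o_j) ); subtracting the maximum output before exponentiating (max_out in the code) only guards against overflow and does not change the result. The partial derivative of L with respect to o_i is p_i - 1 when i == y and p_i otherwise, where p_i = exp(o_i) / sum_j exp(o_j); this is exactly the drvLoss that calculate_output_layer_pdrv_of_error_wrt_input_weights feeds to each output neuron.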
This toy example actually reaches very good accuracy within just a few epochs. If you don't have SDL, you can #define USE_SDL 0.
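For reference, a build line like the following should work on Linux, assuming g++ and the SDL2, SDL2_image, and SDL2_ttf development packages are installed (spiral.cpp is just a placeholder name for the source file): g++ spiral.cpp -o spiral -lSDL2 -lSDL2_image -lSDL2_ttf. Note that the renderer also expects a circle.png image in the working directory.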
/*
 * a fizz buzz example
 * http://joelgrus.com/2016/05/23/fizz-buzz-in-tensorflow/
 */
#include <vector>
#include <deque>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cassert>
#include <ctime>
#include <cfloat>
#include "MesenneTwister.h"

using namespace std;

// The regularization scales the weights toward 0
#define REGULARIZATION_FACTOR 1e-2
#define DROPOUT_RATE 0.1
#define INPUT_DROPOUT_RATE 0.0
#define MAX_NORM 10
#define LEARNING_RATE 0.001
#define BATCH_SIZE 32
#define LEAKY_RELU_CONSTANT 0.005

const double epsilon = 1e-8;

MTRand g_rng(time(NULL));

// used in adam algorithm
int g_generation = 0;

//--------------------------------
// The activation functions, NetworkConfig, Neuron, Layer, NeuralNetwork, and
// Genome are identical to the spiral sample above (only the #define values
// differ), so they are not repeated here.
//--------------------------------

const int numInputPoints = 11;
const int numOutputPoints = 4;

struct Data {
    int value_;
    vector<double> inputs_; // binary representation
    int class_; // classify into 4 classes. 0 - no-change, 1 - fizz, 2 - buzz, 3 - fizzbuzz
};

struct DataSet {
    vector<Data> samples_;
    void randomize () {
        vector<Data> sam;
        while (!samples_.empty()) {
            int idx = g_rng.randInt() % samples_.size();
            sam.push_back(samples_[idx]);
            samples_[idx] = samples_.back();
            samples_.pop_back();
        }
        sam.swap(samples_);
    }
};

DataSet sData;
DataSet sDataTest;

Data encode_binary (int value)
{
    Data d;
    d.value_ = value;
    d.inputs_.reserve(numInputPoints);
    for (int i=0; i < numInputPoints; ++i) {
        int bin = (1 << i);
        if (value & bin) {
            d.inputs_.push_back(1.0);
        } else {
            d.inputs_.push_back(0);
        }
    }
    bool fizz = (value % 3 == 0);
    bool buzz = (value % 5 == 0);
    if (fizz && buzz) {
        d.class_ = 3;
    } else if (buzz) {
        d.class_ = 2;
    } else if (fizz) {
        d.class_ = 1;
    } else {
        d.class_ = 0;
    }
    return d;
}

void generate_data_set (DataSet &sData, int iFrom, int iTo)
{
    for (int iC=iFrom; iC <= iTo; ++iC) {
        sData.samples_.push_back(encode_binary(iC));
    }
}

int answer_from_outputs (vector<double> const&outputs)
{
    double max_term=-DBL_MAX;
    int ans = -1;
    for (size_t i=0; i < outputs.size(); ++i) {
        if (outputs[i] > max_term) {
            max_term = outputs[i];
            ans = i;
        }
    }
    return ans;
}

bool answer_correct (vector<double> const&outputs, int right_class)
{
    return answer_from_outputs(outputs) == right_class;
}

int test_check (NeuralNetwork &network, bool print=false)
{
    vector<double> outputs(numOutputPoints);
    cout << "test... ";
    if (sDataTest.samples_.empty()) generate_data_set (sDataTest, 1, 100);
    int correct_answer = 0;
    for (size_t i=0; i < sDataTest.samples_.size(); ++i) {
        const Data &d = sDataTest.samples_[i];
        network.run(d.inputs_, outputs);
        int ans = answer_from_outputs(outputs);
        if (ans == d.class_) {
            ++correct_answer;
        }
        if (print) {
            if (ans == 0) {
                cout << d.value_ << " ";
            } else if (ans == 1) {
                cout << "fizz" << " ";
            } else if (ans == 2) {
                cout << "buzz" << " ";
            } else if (ans == 3) {
                cout << "fizzbuzz" << " ";
            } else {
                assert (0 && "what happened?");
            }
        }
    }
    cout << "precision: " << (100.0 * double(correct_answer) / sDataTest.samples_.size()) << "%" << endl;
    return correct_answer;
}

int main()
{
    int iSeed = time(NULL)%1000000;
    //iSeed = 587983;
    g_rng.seed(iSeed);
    cout << "use seed " << iSeed << endl;

    generate_data_set (sData, 101, 1000);
    sData.randomize();

    vector<int> hidden_layers_topology; // hidden layers, from input to output
    hidden_layers_topology.push_back(700);
    /*
    hidden_layers_topology.push_back(100);
    hidden_layers_topology.push_back(100);
    //*/

    NeuralNetwork network(numInputPoints, hidden_layers_topology, numOutputPoints);
    network.init_weights(0, 1.0, 1.0);
    const size_t numGenes = network.number_of_weights();
    Genome entity(numGenes);
    network.save_weights_to(entity.genes_);

    vector<double> outputs(numOutputPoints);
    network.set_weights(entity.genes_);

    const double learning_rate = LEARNING_RATE;
    const double momentum = 0.9;
    const int batch_size = BATCH_SIZE;

    while (g_generation < 2000) {
        // cout << "generation " << g_generation << " ";
        ++g_generation;
        int run_count = 0;
        double max_error = 0;
        int correct_answer = 0;
        for (size_t i=0; i < sData.samples_.size(); ++i) {
            network.train(sData.samples_[i].inputs_, outputs);
            if (answer_correct(outputs, sData.samples_[i].class_)) {
                ++correct_answer;
            }
            network.backward_pass(outputs, sData.samples_[i].class_);
            max_error = network.error() > max_error ? network.error() : max_error;
            ++run_count;
            if (run_count >= batch_size) {
                network.update_weights(learning_rate, momentum);
                run_count = 0;
            }
        }
        if (run_count != 0) {
            network.update_weights(learning_rate, momentum);
            run_count = 0;
        }
        if (g_generation % 10 == 0) {
            cout << "generation " << g_generation << " ";
            int correct = test_check(network);
            if (!network.regularization_on ()
                && double(correct_answer)/sData.samples_.size() - double(correct)/sDataTest.samples_.size() > 0.05) {
                network.set_regularization_on ();
                cout << "\t\t**** regularization on ****\n" << endl;
            }
            if (correct == sDataTest.samples_.size()) {
                cout << "done!\n";
                break;
            }
            sData.randomize();
        }
    }
    test_check(network, true);

    network.save_weights_to(entity.genes_);
    cout << "(" << entity.genes_.size() << ") weights\n";
    double weights = 0;
    for (size_t i=0; i < entity.genes_.size(); ++i) {
        weights += fabs(entity.genes_[i]);
    }
    cout << "|totalWeights| = " << weights << "\n";
    entity.report();
    cout << "\n";
    return 0;
}
The interesting thing about this example is that a single hidden layer performs better than multiple layers. We tend to assume that 'deep' learning must of course be stronger, and the author of the original blog post thought so too, but this case makes it clear that the right architecture depends on the problem being solved. The defining feature of deep learning is modularization. Image recognition is the typical example: the features the network ends up learning progress visibly from basic to complex, with the later layers handling the higher-level features. An example like fizz buzz, by contrast, is a poor fit for that; in fact the genetic algorithm introduced in the first article should suit it better. Interested readers are encouraged to try it themselves.
Some takeaways:
- What affects training the most is the setting of these hyperparameters. The values in the samples were found by trial and error. I believe there must be research on how to obtain good initial parameters; if I ever work in this field, I'll go look it up.
- Perhaps there is a problem with my implementation, but when I set the dropout rate to 0.5 the network simply failed to train... Orz. Setting it lower works, though, and it does help. My guess is that dropout only shows a clear benefit on much larger networks.
- The learning rate has to be adjusted to the size of the network: a large network needs a small learning rate, while only a small network can get away with a larger one.
- There are two common approaches to regularization: multiply the weights by a value slightly less than 1 at every update (L2), or subtract a fixed, very small value (L1). What I implemented here is L2 regularization. I found factors between 0.01 and 0.001 to be the effective range. Too large, and the changes from gradient descent are dwarfed by the regularization; too small, and it becomes mere noise to the system, and the overall weights won't necessarily shrink.
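To make the two concrete, here is a minimal sketch (the function names are mine, for illustration; lr and factor correspond to the learning rate and REGULARIZATION_FACTOR in the samples above):

double l2_decay (double w, double lr, double factor)
{
    // L2: the penalty is proportional to the weight itself, so each update
    // effectively multiplies w by (1 - lr * factor), a value slightly below 1.
    return w - lr * factor * w;
}

double l1_decay (double w, double lr, double factor)
{
    // L1: subtract a fixed small step toward zero regardless of the weight's
    // magnitude, which tends to drive small weights to exactly zero.
    return w - lr * factor * ((w > 0) ? 1.0 : -1.0);
}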
- Batch size also matters quite a bit. As before, neither too large nor too small will do.
- So, in the end, is there a more scientific way to set these parameters? Please leave a comment and tell me. Orz