As always, please follow the advice in the first article (http://zen747.blogspot.tw/2018/03/i.html) for learning machine learning, so that you don't waste your time.
/*
 * This sample uses gradient descent to classify a spiral dataset
 * http://cs231n.github.io/neural-networks-case-study/
 */
#include <vector>
#include <deque>
#include <algorithm>
#include <iostream>
#include <string>
#include <cmath>
#include <cassert>
#include <ctime>
#include <cfloat>
#include "MesenneTwister.h"

#define USE_SDL 1

#if USE_SDL
#include <SDL2/SDL.h>
#include <SDL2/SDL_image.h>
#include <SDL2/SDL_ttf.h>
#endif

using namespace std;

// The regularization scales the weights toward 0
#define REGULARIZATION_FACTOR 1e-2
#define DROPOUT_RATE 0.25
#define INPUT_DROPOUT_RATE 0.001
#define MAX_NORM 10
#define LEARNING_RATE 0.02
#define BATCH_SIZE 20
#define LEAKY_RELU_CONSTANT 0.005

const double epsilon = 1e-8;

MTRand g_rng(time(NULL));

// used in adam algorithm
int g_generation = 0;

//--------------------------------

template <int ACTIVATE_FUNC_TYPE>
double activate (double x) { return 0; }

template <int ACTIVATE_FUNC_TYPE>
double activate_drv (double y) { return 0; }

enum { SIGMOID, TANH, RELU };

template<> double activate<SIGMOID>(double x) { return 1.0 / (1.0 + exp(-x)); }
template<> double activate_drv<SIGMOID> (double y) { return y * (1.0 - y); }

template<> double activate<TANH>(double x) { return tanh (x); }
template<> double activate_drv<TANH> (double y) { return 1.0 - y * y; }

template<> double activate<RELU>(double x) { return (x > 0) ? x : LEAKY_RELU_CONSTANT * x; }
template<> double activate_drv<RELU>(double y) { return (y > 0) ? 1.0 : LEAKY_RELU_CONSTANT; }

//---------
double activate_func (double x) { return activate<RELU>(x); }
double activate_drv_func (double y) { return activate_drv<RELU>(y); }

//-----------
class Layer;

struct NetworkConfig {
    bool training_;
    NetworkConfig() : training_(false) {}
};

class Neuron {
public:
    Neuron (int numInputs=0);
    void init_weights (double mean, double variance, double magnitude);
    double calculate_output_from (vector<double> const&inputs);
    void calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double drvLoss);
    void calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, const Layer &output_layer);
    const vector<double> & weights() const;
    size_t set_input_weights(const double *weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum, bool reguon);

public:
    int idx_;
    vector<double> inputWeights_;
    double value_;
    bool droped_out_;
    vector<double> deltaWeight_;
    vector<double> momentum_;
    double pdrvErrorWrtInput_;
    vector<double> pdrvErrorWrtWeights_;
};

class Layer {
public:
    Layer(NetworkConfig *netconfig, int numNeurons, int numInputs);
    ~Layer();
    void init_weights (double mean, double variance, double magnitude);
    void forward_pass(const Layer &last_layer);
    void calculate_output_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, vector<double> const&outputsExp, double totalExp, int right_ans);
    void calculate_middle_layer_pdrv_of_error_wrt_input_weights (vector<double> const&inputs, const Layer &output_layer);
    void save_weights_to(vector<double> &weights) const;
    size_t set_input_weights(const double *weights);
    const vector<double> & get_outputs() const;
    void set_outputs_of_input_layer(const vector<double> &value);
    size_t num_of_neurons () const;
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum, bool reguon);
    void mark_dropout (double dropout_rate);
    void mark_no_dropout ();

public:
    vector<Neuron> nodes_;
    vector<double> outputs_;
    NetworkConfig *netconfig_;
    bool output_layer_;
};

Neuron::Neuron(int numInputs)
: droped_out_(false)
{
    inputWeights_.resize(numInputs+1); // plus weight for bias
}

void Neuron::init_weights(double mean, double variance, double magnitude)
{
    // This initialization technique helps a lot
    double sqrtn = sqrt(2.0 * inputWeights_.size());
    for (size_t i=0; i < inputWeights_.size()-1; ++i) {
        inputWeights_[i] = magnitude * g_rng.randNorm(mean, variance) / sqrtn;
    }
    inputWeights_.back() = 0;
}

double Neuron::calculate_output_from (const vector< double >& inputs)
{
    assert (inputs.size()+1 == inputWeights_.size() && "number of inputs wrong");
    if (droped_out_) {
        value_ = 0;
        return 0;
    }
    double v=0;
    for (size_t i=0; i < inputs.size(); ++i) {
        v += inputs[i] * inputWeights_[i];
    }
    v += inputWeights_.back(); // bias
    value_ = activate_func(v);
    return value_;
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, double drvLoss)
{
    pdrvErrorWrtInput_ = drvLoss * activate_drv_func(value_);
    size_t i=0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(const vector< double >& inputs, const Layer& output_layer)
{
    if (droped_out_) {
        pdrvErrorWrtInput_ = 0;
        return;
    }
    double d = 0;
    for (size_t i=0; i < output_layer.num_of_neurons(); ++i) {
        d += output_layer.nodes_[i].inputWeights_[idx_] * output_layer.nodes_[i].pdrvErrorWrtInput_;
    }
    pdrvErrorWrtInput_ = d * activate_drv_func(value_);
    size_t i=0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0; // and bias which is 1.0
}

const vector< double >& Neuron::weights() const
{
    return inputWeights_;
}

size_t Neuron::set_input_weights(const double* weights)
{
    inputWeights_.assign(weights, weights + inputWeights_.size());
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
    deltaWeight_.resize(inputWeights_.size());
    momentum_.resize(inputWeights_.size());
    return inputWeights_.size();
}

void Neuron::clear_pdrvs()
{
    pdrvErrorWrtWeights_.clear();
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
}

void Neuron::update_weights(double learning_rate, double momentum, bool reguon)
{
    double weight_norm = 0;
    for (size_t i=0; i < inputWeights_.size(); ++i) {
        double dx = pdrvErrorWrtWeights_[i];
        double regularization = 0;
        if (reguon) {
            regularization = learning_rate * REGULARIZATION_FACTOR * inputWeights_[i];
        }
        /* RMSprop
        double decay_rate = 0.99;
        deltaWeight_[i] = decay_rate * deltaWeight_[i] + (1.0 - decay_rate) * dx * dx;
        inputWeights_[i] += -learning_rate * dx / (sqrt(deltaWeight_[i]) + epsilon) - regularization;
        //*/
        //* Adam
        double beta1=0.9;
        double beta2=0.999;
        momentum_[i] = beta1*momentum_[i] + (1.0 - beta1) * dx;
        double mt = momentum_[i] / (1.0 - pow(beta1, g_generation));
        //double mt = momentum_[i];
        deltaWeight_[i] = beta2 * deltaWeight_[i] + (1.0 - beta2) * dx * dx;
        double vt = deltaWeight_[i] / (1.0 - pow(beta2, g_generation));
        //double vt = deltaWeight_[i];
        inputWeights_[i] += -learning_rate * mt / (sqrt(vt) + epsilon) - regularization;
        //*/
        weight_norm += inputWeights_[i] * inputWeights_[i];
    }
    weight_norm = sqrt(weight_norm);
    if (weight_norm > MAX_NORM) {
        double r = MAX_NORM / weight_norm;
        for (size_t i=0; i < inputWeights_.size(); ++i) {
            inputWeights_[i] *= r;
        }
    }
}

Layer::Layer(NetworkConfig *netconfig, int numNeurons=0, int numInputsPerNeuron=0)
: netconfig_(netconfig)
, output_layer_(false)
{
    nodes_.resize(numNeurons);
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i] = Neuron(numInputsPerNeuron);
        nodes_[i].idx_ = i;
    }
    outputs_.resize(numNeurons);
}

Layer::~Layer()
{
    nodes_.clear();
}

void Layer::init_weights(double mean, double variance, double magnitude)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].init_weights (mean, variance, magnitude);
    }
}

void Layer::forward_pass(const Layer &last_layer)
{
    double rate = 1.0 - DROPOUT_RATE;
    const vector<double> & inputs = last_layer.get_outputs();
    for (size_t i=0; i < nodes_.size(); ++i) {
        if (nodes_[i].droped_out_) {
            outputs_[i] = nodes_[i].value_ = 0;
        } else {
            nodes_[i].calculate_output_from (inputs);
            if (!output_layer_ && !netconfig_->training_) {
                nodes_[i].value_ *= rate;
            }
            outputs_[i] = nodes_[i].value_;
        }
    }
}

void Layer::calculate_output_layer_pdrv_of_error_wrt_input_weights(vector<double> const&inputs, vector<double> const&outputsExp, double totalExp, int right_ans)
{
    assert (outputsExp.size() == nodes_.size());
    for (size_t i=0; i < nodes_.size(); ++i) {
        double drvLoss;
        if (i == right_ans) {
            drvLoss = outputsExp[i] / totalExp - 1.0;
        } else {
            drvLoss = outputsExp[i] / totalExp;
        }
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (inputs, drvLoss);
    }
}

void Layer::calculate_middle_layer_pdrv_of_error_wrt_input_weights(const vector< double >& inputs, const Layer& output_layer)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights (inputs, output_layer);
    }
}

void Layer::save_weights_to(vector< double >& weights) const
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        const vector<double> &w = nodes_[i].weights();
        weights.insert(weights.end(), w.begin(), w.end());
    }
}

size_t Layer::set_input_weights(const double* weights)
{
    size_t pos = 0;
    for (size_t i=0; i < nodes_.size(); ++i) {
        pos += nodes_[i].set_input_weights(weights + pos);
    }
    return pos;
}

void Layer::set_outputs_of_input_layer(const vector< double >& value)
{
    outputs_ = value;
    for (size_t i=0; i < nodes_.size(); ++i) {
        if (netconfig_->training_ && nodes_[i].droped_out_) {
            nodes_[i].value_ = outputs_[i] = 0;
        } else {
            nodes_[i].value_ = outputs_[i];
        }
    }
}

const vector< double >& Layer::get_outputs() const
{
    return outputs_;
}

size_t Layer::num_of_neurons() const
{
    return nodes_.size();
}

void Layer::clear_pdrvs()
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].clear_pdrvs();
    }
}

void Layer::update_weights(double learning_rate, double momentum, bool reguon)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].update_weights(learning_rate, momentum, reguon);
    }
}

void Layer::mark_dropout(double dropout_rate)
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].droped_out_ = (g_rng.rand() < dropout_rate);
    }
}

void Layer::mark_no_dropout()
{
    for (size_t i=0; i < nodes_.size(); ++i) {
        nodes_[i].droped_out_ = false;
    }
}

class NeuralNetwork {
public:
    NeuralNetwork(int numInputs, vector<int> const& hiddenLayerTopology, int numOutputs);
    ~NeuralNetwork();
    void init_weights(double mean_value, double variance_value, double magnitude);
    size_t number_of_weights () const;
    void run (vector<double> const &inputs, vector<double> &outputs);
    void train (vector<double> const &inputs, vector<double> &outputs);
    void forward_pass (vector<double> const &inputs);
    void backward_pass (vector<double> &outputs, int right_ans);
    void save_weights_to(vector<double> &weights) const;
    void set_weights(vector<double> const &weights);
    void clear_pdrvs ();
    void update_weights (double learning_rate, double momentum);
    double error () const { return last_error_; }
    bool regularization_on () const { return regularization_on_; }
    void set_regularization_on () { regularization_on_ = true; }

protected:
    void feed_forward ();
    void mark_dropout ();
    void mark_no_dropout ();

private:
    vector<Layer *> layers_;
    double last_error_;
    NetworkConfig *netconfig_;
    bool regularization_on_;

    NeuralNetwork(const NeuralNetwork &rhs) {}
    NeuralNetwork &operator=(const NeuralNetwork &rhs) { return *this; }
};

NeuralNetwork::NeuralNetwork(int nInputs, const vector< int >& hiddenLayerTopology, int nOutputs)
: last_error_(0)
, netconfig_(new NetworkConfig)
, regularization_on_(false)
{
    layers_.resize(hiddenLayerTopology.size() + 2); // hidden + inputs + outputs
    int nNeuron = nInputs;
    layers_[0] = new Layer(netconfig_, nNeuron, 0); // layer zero is inputs
    for (size_t i=0; i < hiddenLayerTopology.size(); ++i) {
        layers_[i+1] = new Layer(netconfig_, hiddenLayerTopology[i], layers_[i]->num_of_neurons());
    }
    nNeuron = nOutputs;
    layers_.back() = new Layer(netconfig_, nNeuron, hiddenLayerTopology.back());
    layers_.back()->output_layer_ = true; // mark it so dropout scaling is not applied to the outputs
}

NeuralNetwork::~NeuralNetwork()
{
    delete netconfig_;
    for (size_t i=0; i < layers_.size(); ++i) {
        delete layers_[i];
    }
    layers_.clear();
}

void NeuralNetwork::init_weights(double mean, double variance, double magnitude)
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->init_weights(mean, variance, magnitude);
    }
}

size_t NeuralNetwork::number_of_weights() const
{
    vector<double> weights;
    this->save_weights_to(weights);
    return weights.size();
}

void NeuralNetwork::run(const vector< double >& inputs, vector< double >& outputs)
{
    if (netconfig_->training_) {
        netconfig_->training_ = false;
        mark_no_dropout ();
    }
    forward_pass(inputs);
    outputs = layers_.back()->get_outputs();
}

void NeuralNetwork::train(const vector< double >& inputs, vector< double >& outputs)
{
    if (!netconfig_->training_) {
        netconfig_->training_ = true;
        mark_dropout ();
    }
    forward_pass(inputs);
    outputs = layers_.back()->get_outputs();
}

void NeuralNetwork::forward_pass(const vector< double >& inputs)
{
    // feed forward
    // set inputs as outputs of first layer
    layers_[0]->set_outputs_of_input_layer(inputs);
    feed_forward ();
}

void NeuralNetwork::backward_pass(vector< double >& outputs, int right_ans)
{
    // calculate errors/loss
    double denom = 0;
    double numer = 0;
    last_error_ = 0;
    double max_out=-DBL_MAX;
    for (size_t i=0; i < outputs.size(); ++i) {
        if (outputs[i] > max_out) {
            max_out = outputs[i];
        }
    }
    vector<double> outputsExp;
    outputsExp.reserve(outputs.size());
    double totalExp = 0;
    for (size_t i=0; i < outputs.size(); ++i) {
        outputsExp.push_back(exp(outputs[i] - max_out));
        if (i == right_ans) {
            numer = outputsExp.back();
        }
        denom += outputsExp.back();
    }
    totalExp = denom;
    last_error_ = -log(numer / denom);
    // calculate partial derivatives
    // first the last layer (output layer)
    vector< double > layer_inputs = layers_[layers_.size()-2]->get_outputs();
    layers_.back()->calculate_output_layer_pdrv_of_error_wrt_input_weights(layer_inputs, outputsExp, totalExp, right_ans);
    // back propagate
    for (int i=layers_.size()-2; i > 0; --i) {
        vector< double > layer_inputs = layers_[i-1]->get_outputs();
        layers_[i]->calculate_middle_layer_pdrv_of_error_wrt_input_weights(layer_inputs, *layers_[i+1]);
    }
}

void NeuralNetwork::feed_forward()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->forward_pass(*layers_[i-1]);
    }
}

void NeuralNetwork::mark_dropout()
{
    for (size_t i=0; i < layers_.size()-1; ++i) {
        double r = DROPOUT_RATE;
        if (i==0) r = INPUT_DROPOUT_RATE;
        layers_[i]->mark_dropout(r);
    }
}

void NeuralNetwork::mark_no_dropout()
{
    for (size_t i=0; i < layers_.size()-1; ++i) {
        layers_[i]->mark_no_dropout();
    }
}

void NeuralNetwork::clear_pdrvs()
{
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->clear_pdrvs();
    }
}

void NeuralNetwork::update_weights(double learning_rate, double momentum)
{
    assert (netconfig_->training_ && "why update weights when not training?");
    // update weights
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->update_weights(learning_rate, momentum, regularization_on_);
    }
    mark_dropout();
    clear_pdrvs();
}

void NeuralNetwork::save_weights_to(vector< double >& weights) const
{
    weights.clear();
    for (size_t i=1; i < layers_.size(); ++i) {
        layers_[i]->save_weights_to (weights);
    }
}

void NeuralNetwork::set_weights(const vector< double >& weights)
{
    int pos = 0;
    for (size_t i=1; i < layers_.size(); ++i) {
        pos += layers_[i]->set_input_weights (&weights[pos]);
    }
}

struct Genome {
    double fitness_;
    vector<double> genes_;
    Genome(size_t numGenes=0);
    void copy_genes_from (Genome const&source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate);
    void report () const;
    bool operator<(const Genome &rhs) const {
        return fitness_ > rhs.fitness_;
    }
};

Genome::Genome(size_t numGenes)
{
    assert (numGenes && "number of genes can't be zero");
    genes_.resize(numGenes);
}

void Genome::copy_genes_from(const Genome& source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate)
{
    for (size_t i=from_index,cgene=0; i < to_index; ++i,++cgene) {
        genes_[replace_start_index+cgene] = source.genes_[i];
    }
}

void Genome::report() const
{
    cout << "weights ";
    for (size_t i=0; i < genes_.size(); ++i) {
        cout << genes_[i] << " ";
    }
}

struct Point {
    double x_, y_;
    int class_;
};

const int nClass = 3;

struct DataSet {
    vector<Point> samples_;
    void randomize () {
        vector<Point> sam;
        while (!samples_.empty()) {
            int idx = g_rng.randInt() % samples_.size();
            sam.push_back(samples_[idx]);
            samples_[idx] = samples_.back();
            samples_.pop_back();
        }
        sam.swap(samples_);
    }
};

#if USE_SDL

#define RENDER_WINDOW_WIDTH 600
#define RENDER_WINDOW_HEIGHT 600

struct Color {
    Uint8 r_, g_, b_;
    Color(Uint8 r, Uint8 g, Uint8 b)
    : r_(r), g_(g), b_(b)
    {}
};

class Renderer {
    SDL_Window *win_;
    SDL_Renderer *ren_;
    SDL_Texture *tex_;

    int init_sdl () {
        if (SDL_Init(SDL_INIT_VIDEO) != 0){
            std::cout << "SDL_Init Error: " << SDL_GetError() << std::endl;
            return 1;
        }
        win_ = SDL_CreateWindow("Hello World!", 100, 100, RENDER_WINDOW_WIDTH, RENDER_WINDOW_HEIGHT, SDL_WINDOW_SHOWN);
        if (win_ == 0){
            std::cout << "SDL_CreateWindow Error: " << SDL_GetError() << std::endl;
            SDL_Quit();
            return 1;
        }
        ren_ = SDL_CreateRenderer(win_, -1, SDL_RENDERER_ACCELERATED | SDL_RENDERER_PRESENTVSYNC);
        if (ren_ == 0){
            SDL_DestroyWindow(win_);
            std::cout << "SDL_CreateRenderer Error: " << SDL_GetError() << std::endl;
            SDL_Quit();
            return 1;
        }
        std::string imagePath = "circle.png";
        tex_ = IMG_LoadTexture(ren_, imagePath.c_str());
        if (tex_ == 0) {
            SDL_DestroyRenderer(ren_);
            SDL_DestroyWindow(win_);
            std::cout << "SDL_CreateTextureFromSurface Error: " << SDL_GetError() << std::endl;
            SDL_Quit();
            return 1;
        }
        return 0;
    }

public:
    Renderer ()
    : win_(0), ren_(0), tex_(0)
    {}

    ~Renderer () {
        if (tex_) SDL_DestroyTexture(tex_);
        if (ren_) SDL_DestroyRenderer(ren_);
        if (win_) SDL_DestroyWindow(win_);
        SDL_Quit();
    }

    int start () {
        return init_sdl ();
    }

    int render (DataSet const&sData) {
        SDL_SetRenderDrawColor(ren_, 255, 255, 255, 255);
        SDL_RenderClear(ren_);
        SDL_Rect dst;
        dst.w = dst.h = 16;
        int offX = dst.w/2, offY=dst.h/2;
        vector<Color> cvClass;
        cvClass.push_back(Color(255,0,0));
        cvClass.push_back(Color(255,255,0));
        cvClass.push_back(Color(0,255,0));
        cvClass.push_back(Color(0,0,255));
        for (size_t i=0; i < sData.samples_.size(); ++i) {
            const Color &color = cvClass[sData.samples_[i].class_];
            SDL_SetTextureColorMod(tex_, color.r_, color.g_, color.b_);
            dst.x = RENDER_WINDOW_WIDTH * (0.5 + sData.samples_[i].x_) - offX;
            dst.y = RENDER_WINDOW_HEIGHT * (0.5 + sData.samples_[i].y_) - offY;
            SDL_RenderCopy(ren_, tex_, NULL, &dst);
        }
        //Update the screen
        SDL_RenderPresent(ren_);
        //SDL_Delay(3000);
        return 0;
    }
};
#endif

void generate_data_set (DataSet &sData, int nPointsPerClass)
{
    int nC = nClass + 1;
    for (int iC=0; iC < nClass; ++iC) {
        for (int iP=0; iP < nPointsPerClass; ++iP) {
            Point p;
            p.class_ = iC;
            double radius = 0.001 + double(iP) / nPointsPerClass;
            double angle = nC * iC + 1.0 * nC * radius + g_rng.randNorm() * 0.2;
            p.x_ = 0.5 * radius * sin(angle);
            p.y_ = 0.5 * radius * cos(angle);
            sData.samples_.push_back(p);
        }
    }
}

bool answer_correct (vector<double> const&outputs, int right_ans)
{
    double max_term=-DBL_MAX;
    int ans = -1;
    for (size_t i=0; i < outputs.size(); ++i) {
        if (outputs[i] > max_term) {
            max_term = outputs[i];
            ans = i;
        }
    }
    return ans == right_ans;
}

const int numInputPoints = 2;
const int numOutputPoints = 4;

int test_check (NeuralNetwork &network)
{
    vector<double> inputs(numInputPoints);
    vector<double> outputs(numOutputPoints);
    cout << "test... ";
    DataSet sDataTest;
    generate_data_set (sDataTest, 100);
    int correct_answer = 0;
    for (size_t i=0; i < sDataTest.samples_.size(); ++i) {
        inputs.clear();
        inputs.push_back(sDataTest.samples_[i].x_);
        inputs.push_back(sDataTest.samples_[i].y_);
        network.run(inputs, outputs);
        //expects.clear();
        //load_expects (expects, sDataTest.samples_[i].class_);
        //network.backward_pass(outputs, sDataTest.samples_[i].class_);
        if (answer_correct(outputs, sDataTest.samples_[i].class_)) {
            ++correct_answer;
        } else {
            //cout << i << " at (" << sDataTest.samples_[i].x_ << "," << sDataTest.samples_[i].x_ << ") ";
            //cout << "value " << outputs[0] << "," << outputs[1] << "," << outputs[2] << " vs " << sDataTest.samples_[i].class_ << "\n";
        }
    }
    cout << "precision: " << (100.0 * double(correct_answer) / sDataTest.samples_.size()) << "%" << endl;
    return correct_answer;
}

int main()
{
    int iSeed = time(NULL)%1000000;
    //iSeed = 7;
    g_rng.seed(iSeed);
    cout << "use seed " << iSeed << endl;

    DataSet sData;
    generate_data_set (sData, 100);
    sData.randomize();

#if USE_SDL
    Renderer renderer;
    if (renderer.start() != 0) {
        cout << "SDL2 initialization failed." << endl;
        return 1;
    }
    renderer.render(sData);
#endif

    vector<int> hidden_layers_topology; // hidden layers, from input to output
    hidden_layers_topology.push_back(160);
    /*
    hidden_layers_topology.push_back(30);
    hidden_layers_topology.push_back(30);
    //*/

    NeuralNetwork network(numInputPoints, hidden_layers_topology, numOutputPoints);
    network.init_weights(0, 1.0, 1.0);
    const size_t numGenes = network.number_of_weights();
    Genome entity(numGenes);
    network.save_weights_to(entity.genes_);

    vector<double> inputs(numInputPoints);
    vector<double> outputs(numOutputPoints);
    network.set_weights(entity.genes_);

    const double learning_rate = LEARNING_RATE;
    const double momentum = 0.9;
    const int batch_size = BATCH_SIZE;

    while (g_generation < 500) {
        // cout << "generation " << g_generation << " ";
        ++g_generation;
        int run_count = 0;
        double max_error = 0;
        int correct_answer = 0;
        for (size_t i=0; i < sData.samples_.size(); ++i) {
            inputs.clear();
            inputs.push_back(sData.samples_[i].x_);
            inputs.push_back(sData.samples_[i].y_);
            network.train(inputs, outputs);
            if (answer_correct(outputs, sData.samples_[i].class_)) {
                ++correct_answer;
            }
            network.backward_pass(outputs, sData.samples_[i].class_);
            max_error = network.error() > max_error ? network.error() : max_error;
            ++run_count;
            if (run_count >= batch_size) {
                network.update_weights(learning_rate, momentum);
                run_count = 0;
            }
        }
        if (run_count != 0) {
            network.update_weights(learning_rate, momentum);
            run_count = 0;
        }
        if (g_generation % 10 == 0) {
            int correct = test_check(network);
            if (!network.regularization_on () && double(correct_answer - correct) / correct_answer > 0.05) {
                network.set_regularization_on ();
                cout << "\n\t\t**** regularization on ****\n" << endl;
            }
        }
    }
    test_check(network);

    network.save_weights_to(entity.genes_);
    double weights = 0;
    for (size_t i=0; i < entity.genes_.size(); ++i) {
        weights += fabs(entity.genes_[i]);
    }
    cout << "(" << entity.genes_.size() << ") weights\n";
    cout << "|totalWeights| = " << weights << "\n";
    entity.report();
    cout << "\n";
    return 0;
}
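A note on the loss used above: backward_pass computes the standard softmax cross-entropy. With raw outputs o_j and correct class y, the loss is L = -log( exp(o_y) / sum_j exp(o_j) ); subtracting the maximum output before exponentiating (max_out in the code) only guards against overflow and does not change the result. The partial derivative of L with respect to o_i is p_i - 1 when i == y and p_i otherwise, where p_i = exp(o_i) / sum_j exp(o_j); this is exactly the drvLoss that calculate_output_layer_pdrv_of_error_wrt_input_weights feeds to each output neuron.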
This toy example actually reaches very good accuracy within just a few epochs. If you don't have SDL, you can #define USE_SDL 0.
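For reference, a build line like the following should work on Linux, assuming g++ and the SDL2, SDL2_image, and SDL2_ttf development packages are installed (spiral.cpp is just a placeholder name for the source file): g++ spiral.cpp -o spiral -lSDL2 -lSDL2_image -lSDL2_ttf. Note that the renderer also expects a circle.png image in the working directory.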
/*
 * a fizz buzz example
 * http://joelgrus.com/2016/05/23/fizz-buzz-in-tensorflow/
 */
#include <vector>
#include <deque>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cassert>
#include <ctime>
#include <cfloat>
#include "MesenneTwister.h"

using namespace std;

// The regularization scales the weights toward 0
#define REGULARIZATION_FACTOR 1e-2
#define DROPOUT_RATE 0.1
#define INPUT_DROPOUT_RATE 0.0
#define MAX_NORM 10
#define LEARNING_RATE 0.001
#define BATCH_SIZE 32
#define LEAKY_RELU_CONSTANT 0.005

const double epsilon = 1e-8;

MTRand g_rng(time(NULL));

// used in adam algorithm
int g_generation = 0;

//--------------------------------
// The activation functions, NetworkConfig, Neuron, Layer, NeuralNetwork, and
// Genome are identical to the spiral sample above (only the #define values
// differ), so they are not repeated here.
//--------------------------------

const int numInputPoints = 11;
const int numOutputPoints = 4;

struct Data {
    int value_;
    vector<double> inputs_; // binary representation
    int class_; // classify into 4 classes. 0 - no-change, 1 - fizz, 2 - buzz, 3 - fizzbuzz
};

struct DataSet {
    vector<Data> samples_;
    void randomize () {
        vector<Data> sam;
        while (!samples_.empty()) {
            int idx = g_rng.randInt() % samples_.size();
            sam.push_back(samples_[idx]);
            samples_[idx] = samples_.back();
            samples_.pop_back();
        }
        sam.swap(samples_);
    }
};

DataSet sData;
DataSet sDataTest;

Data encode_binary (int value)
{
    Data d;
    d.value_ = value;
    d.inputs_.reserve(numInputPoints);
    for (int i=0; i < numInputPoints; ++i) {
        int bin = (1 << i);
        if (value & bin) {
            d.inputs_.push_back(1.0);
        } else {
            d.inputs_.push_back(0);
        }
    }
    bool fizz = (value % 3 == 0);
    bool buzz = (value % 5 == 0);
    if (fizz && buzz) {
        d.class_ = 3;
    } else if (buzz) {
        d.class_ = 2;
    } else if (fizz) {
        d.class_ = 1;
    } else {
        d.class_ = 0;
    }
    return d;
}

void generate_data_set (DataSet &sData, int iFrom, int iTo)
{
    for (int iC=iFrom; iC <= iTo; ++iC) {
        sData.samples_.push_back(encode_binary(iC));
    }
}

int answer_from_outputs (vector<double> const&outputs)
{
    double max_term=-DBL_MAX;
    int ans = -1;
    for (size_t i=0; i < outputs.size(); ++i) {
        if (outputs[i] > max_term) {
            max_term = outputs[i];
            ans = i;
        }
    }
    return ans;
}

bool answer_correct (vector<double> const&outputs, int right_class)
{
    return answer_from_outputs(outputs) == right_class;
}

int test_check (NeuralNetwork &network, bool print=false)
{
    vector<double> outputs(numOutputPoints);
    cout << "test... ";
    if (sDataTest.samples_.empty()) generate_data_set (sDataTest, 1, 100);
    int correct_answer = 0;
    for (size_t i=0; i < sDataTest.samples_.size(); ++i) {
        const Data &d = sDataTest.samples_[i];
        network.run(d.inputs_, outputs);
        int ans = answer_from_outputs(outputs);
        if (ans == d.class_) {
            ++correct_answer;
        }
        if (print) {
            if (ans == 0) {
                cout << d.value_ << " ";
            } else if (ans == 1) {
                cout << "fizz" << " ";
            } else if (ans == 2) {
                cout << "buzz" << " ";
            } else if (ans == 3) {
                cout << "fizzbuzz" << " ";
            } else {
                assert (0 && "what happened?");
            }
        }
    }
    cout << "precision: " << (100.0 * double(correct_answer) / sDataTest.samples_.size()) << "%" << endl;
    return correct_answer;
}

int main()
{
    int iSeed = time(NULL)%1000000;
    //iSeed = 587983;
    g_rng.seed(iSeed);
    cout << "use seed " << iSeed << endl;

    generate_data_set (sData, 101, 1000);
    sData.randomize();

    vector<int> hidden_layers_topology; // hidden layers, from input to output
    hidden_layers_topology.push_back(700);
    /*
    hidden_layers_topology.push_back(100);
    hidden_layers_topology.push_back(100);
    //*/

    NeuralNetwork network(numInputPoints, hidden_layers_topology, numOutputPoints);
    network.init_weights(0, 1.0, 1.0);
    const size_t numGenes = network.number_of_weights();
    Genome entity(numGenes);
    network.save_weights_to(entity.genes_);

    vector<double> outputs(numOutputPoints);
    network.set_weights(entity.genes_);

    const double learning_rate = LEARNING_RATE;
    const double momentum = 0.9;
    const int batch_size = BATCH_SIZE;

    while (g_generation < 2000) {
        // cout << "generation " << g_generation << " ";
        ++g_generation;
        int run_count = 0;
        double max_error = 0;
        int correct_answer = 0;
        for (size_t i=0; i < sData.samples_.size(); ++i) {
            network.train(sData.samples_[i].inputs_, outputs);
            if (answer_correct(outputs, sData.samples_[i].class_)) {
                ++correct_answer;
            }
            network.backward_pass(outputs, sData.samples_[i].class_);
            max_error = network.error() > max_error ? network.error() : max_error;
            ++run_count;
            if (run_count >= batch_size) {
                network.update_weights(learning_rate, momentum);
                run_count = 0;
            }
        }
        if (run_count != 0) {
            network.update_weights(learning_rate, momentum);
            run_count = 0;
        }
        if (g_generation % 10 == 0) {
            cout << "generation " << g_generation << " ";
            int correct = test_check(network);
            if (!network.regularization_on ()
                && double(correct_answer)/sData.samples_.size() - double(correct)/sDataTest.samples_.size() > 0.05) {
                network.set_regularization_on ();
                cout << "\t\t**** regularization on ****\n" << endl;
            }
            if (correct == sDataTest.samples_.size()) {
                cout << "done!\n";
                break;
            }
            sData.randomize();
        }
    }
    test_check(network, true);

    network.save_weights_to(entity.genes_);
    cout << "(" << entity.genes_.size() << ") weights\n";
    double weights = 0;
    for (size_t i=0; i < entity.genes_.size(); ++i) {
        weights += fabs(entity.genes_[i]);
    }
    cout << "|totalWeights| = " << weights << "\n";
    entity.report();
    cout << "\n";
    return 0;
}
The interesting thing about this example is that a single hidden layer performs better than multiple layers. We tend to assume that 'deep' learning must of course be stronger, and the author of the original blog post thought so too, but this case makes it clear that the right architecture depends on the problem being solved. The defining feature of deep learning is modularization. Image recognition is the typical example: the features the network ends up learning progress visibly from basic to complex, with the later layers handling the higher-level features. An example like fizz buzz, by contrast, is a poor fit for that; in fact the genetic algorithm introduced in the first article should suit it better. Interested readers are encouraged to try it themselves.
Some takeaways:
- What affects training the most is the setting of these hyperparameters. The values in the samples were found by trial and error. I believe there must be research on how to obtain good initial parameters; if I ever work in this field, I'll go look it up.
- Perhaps there is a problem with my implementation, but when I set the dropout rate to 0.5 the network simply failed to train... Orz. Setting it lower works, though, and it does help. My guess is that dropout only shows a clear benefit on much larger networks.
- The learning rate has to be adjusted to the size of the network: a large network needs a small learning rate, while only a small network can get away with a larger one.
- There are two common approaches to regularization: multiply the weights by a value slightly less than 1 at every update (L2), or subtract a fixed, very small value (L1). What I implemented here is L2 regularization. I found factors between 0.01 and 0.001 to be the effective range. Too large, and the changes from gradient descent are dwarfed by the regularization; too small, and it becomes mere noise to the system, and the overall weights won't necessarily shrink.
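To make the two concrete, here is a minimal sketch (the function names are mine, for illustration; lr and factor correspond to the learning rate and REGULARIZATION_FACTOR in the samples above):

double l2_decay (double w, double lr, double factor)
{
    // L2: the penalty is proportional to the weight itself, so each update
    // effectively multiplies w by (1 - lr * factor), a value slightly below 1.
    return w - lr * factor * w;
}

double l1_decay (double w, double lr, double factor)
{
    // L1: subtract a fixed small step toward zero regardless of the weight's
    // magnitude, which tends to drive small weights to exactly zero.
    return w - lr * factor * ((w > 0) ? 1.0 : -1.0);
}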
- Batch size also matters quite a bit. As before, neither too large nor too small will do.
- So, in the end, is there a more scientific way to set these parameters? Please leave a comment and tell me. Orz