This post assumes you have already read the first article, have followed my suggestion and finished (or are currently working through) the CS231n course, and now want to implement a network yourself with a reference example to compare against.
Please make sure you understand what back propagation is before reading the code below.
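As a quick refresher (my own notation, not something defined in this post), the quantity the code accumulates for each weight of an output neuron is just the chain rule applied to the squared error, and a hidden neuron uses the same form with its delta summed over the layer above:

E = \tfrac{1}{2}(a - t)^2, \quad a = f(z), \quad z = \sum_i w_i x_i + b

\frac{\partial E}{\partial w_i} = (a - t)\, f'(z)\, x_i, \qquad \frac{\partial E}{\partial b} = (a - t)\, f'(z)

\delta_j = f'(z_j) \sum_k w_{jk}\, \delta_k \quad \text{(hidden neuron } j\text{, summing over neurons } k \text{ in the next layer)}

In the listing below, (a - t) is error_drv_func, f' is activate_drv_func, and the accumulated products end up in pdrvErrorWrtWeights_.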
/*
 * This example uses gradient descent to find a solution for the XOR problem.
 */
#include <vector>
#include <deque>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cassert>
#include <ctime>
#include "MersenneTwister.h"  // MTRand random number generator

using namespace std;

#define INIT_WEIGHT_SCALE 1.0
#define LEAKY_RELU_CONSTANT 0.005
#define MAX_FITNESS 1000000.0

const double network_error_epsilon = 1e-3;
const double epsilon = 1e-8;

MTRand g_rng(time(NULL));
int g_generation = 0;

double error_func(double expect, double actual)
{
    return 0.5 * (expect - actual) * (expect - actual);
}

double error_drv_func(double target, double actual)
{
    // By the chain rule: the derivative of the loss 0.5 * (actual - target)^2 is (actual - target),
    // or of 0.5 * (target - actual)^2 it is (target - actual) * -1.
    return actual - target;
}

//--------------------------------
template <int ACTIVATE_FUNC_TYPE> double activate(double x) { return 0; }
template <int ACTIVATE_FUNC_TYPE> double activate_drv(double y) { return 0; }

enum { SIGMOID, TANH, RELU };

// note: the derivative specializations take the neuron's *output* y, not the pre-activation x
template<> double activate<SIGMOID>(double x) { return 1.0 / (1.0 + exp(-x)); }
template<> double activate_drv<SIGMOID>(double y) { return y * (1.0 - y); }
template<> double activate<TANH>(double x) { return tanh(x); }
template<> double activate_drv<TANH>(double y) { return 1.0 - y * y; }
template<> double activate<RELU>(double x) { return (x > 0) ? x : LEAKY_RELU_CONSTANT * x; }
template<> double activate_drv<RELU>(double y) { return (y > 0) ? 1.0 : LEAKY_RELU_CONSTANT; }

//--------- try using different activation functions
double activate_func(double x)
{
    return activate<RELU>(x);
    // return activate<TANH>(x);
    // return activate<SIGMOID>(x);
}

double activate_drv_func(double y)
{
    return activate_drv<RELU>(y);
    // return activate_drv<TANH>(y);
    // return activate_drv<SIGMOID>(y);
}

//-----------
class Layer;

class Neuron {
public:
    Neuron(int numInputs = 0);
    void init_weights(double mean, double variance);
    double calculate_output_from(vector<double> const &inputs);
    void calculate_pdrv_of_error_wrt_input_weights(vector<double> const &inputs, double target);
    void calculate_pdrv_of_error_wrt_input_weights(int indexToOutputLayer, vector<double> const &inputs, const Layer &output_layer);
    const vector<double> &weights() const;
    size_t set_input_weights(const double *weights);
    void clear_pdrvs();
    void update_weights(double learning_rate, double momentum);

public:
    vector<double> inputWeights_;
    double value_;
    vector<double> deltaWeight_;
    vector<double> momentum_;
    double pdrvErrorWrtInput_;
    vector<double> pdrvErrorWrtWeights_;
};

class Layer {
public:
    Layer(int numNeurons = 0, int numInputs = 0);  // defaults let vector<Layer>::resize default-construct layers
    ~Layer();
    void init_weights(double mean, double variance);
    void forward_pass(const Layer &last_layer);
    void calculate_output_layer_pdrv_of_error_wrt_input_weights(vector<double> const &inputs, vector<double> const &targets);
    void calculate_middle_layer_pdrv_of_error_wrt_input_weights(vector<double> const &inputs, const Layer &output_layer);
    void save_weights_to(vector<double> &weights) const;
    size_t set_input_weights(const double *weights);
    const vector<double> &get_outputs() const;
    void set_outputs(const vector<double> &value);
    size_t num_of_neurons() const;
    void clear_pdrvs();
    void update_weights(double learning_rate, double momentum);

public:
    vector<Neuron> nodes_;
    vector<double> outputs_;
};

Neuron::Neuron(int numInputs)
{
    inputWeights_.resize(numInputs + 1);  // plus one weight for the bias
}

void Neuron::init_weights(double mean, double variance)
{
    double sqrtn = sqrt(2.0 * inputWeights_.size());
    for (size_t i = 0; i < inputWeights_.size() - 1; ++i) {
        inputWeights_[i] = INIT_WEIGHT_SCALE * g_rng.randNorm(mean, variance) / sqrtn;
    }
    inputWeights_.back() = 0;  // bias weight starts at zero
}

double Neuron::calculate_output_from(const vector<double> &inputs)
{
    assert(inputs.size() + 1 == inputWeights_.size() && "number of inputs wrong");
    double v = 0;
    for (size_t i = 0; i < inputs.size(); ++i) {
        v += inputs[i] * inputWeights_[i];
    }
    v += inputWeights_.back();  // bias
    value_ = activate_func(v);
    return value_;
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(vector<double> const &inputs, double target)
{
    // output-layer delta: dE/dz = (actual - target) * f'(output)
    pdrvErrorWrtInput_ = error_drv_func(target, value_) * activate_drv_func(value_);
    size_t i = 0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0;  // and the bias, whose input is 1.0
}

void Neuron::calculate_pdrv_of_error_wrt_input_weights(int indexToOutputLayer, const vector<double> &inputs, const Layer &output_layer)
{
    // hidden-layer delta: sum the deltas of the next layer weighted by the connecting weights
    double d = 0;
    for (size_t i = 0; i < output_layer.num_of_neurons(); ++i) {
        d += output_layer.nodes_[i].inputWeights_[indexToOutputLayer] * output_layer.nodes_[i].pdrvErrorWrtInput_;
    }
    pdrvErrorWrtInput_ = d * activate_drv_func(value_);
    size_t i = 0;
    for (; i < pdrvErrorWrtWeights_.size() - 1; ++i) {
        pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * inputs[i];
    }
    pdrvErrorWrtWeights_[i] += pdrvErrorWrtInput_ * 1.0;  // and the bias, whose input is 1.0
}

const vector<double> &Neuron::weights() const
{
    return inputWeights_;
}

size_t Neuron::set_input_weights(const double *weights)
{
    inputWeights_.assign(weights, weights + inputWeights_.size());
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
    deltaWeight_.resize(inputWeights_.size());
    momentum_.resize(inputWeights_.size());
    return inputWeights_.size();
}

void Neuron::clear_pdrvs()
{
    pdrvErrorWrtWeights_.clear();
    pdrvErrorWrtWeights_.resize(inputWeights_.size());
}

void Neuron::update_weights(double learning_rate, double momentum)
{
    for (size_t i = 0; i < inputWeights_.size(); ++i) {
        double dx = pdrvErrorWrtWeights_[i];
        /* Nesterov momentum
        double deltaWeightKeep = deltaWeight_[i];
        deltaWeight_[i] = momentum * deltaWeight_[i] - learning_rate * dx;
        inputWeights_[i] += -momentum * deltaWeightKeep + (1 + momentum) * deltaWeight_[i];
        //*/

        //* RMSprop
        double decay_rate = 0.99;
        deltaWeight_[i] = decay_rate * deltaWeight_[i] + (1.0 - decay_rate) * dx * dx;
        inputWeights_[i] += -learning_rate * dx / (sqrt(deltaWeight_[i]) + epsilon);
        //*/
    }
}

Layer::Layer(int numNeurons, int numInputsPerNeuron)
{
    nodes_.resize(numNeurons);
    for (size_t i = 0; i < nodes_.size(); ++i) {
        nodes_[i] = Neuron(numInputsPerNeuron);
    }
    outputs_.resize(numNeurons);
}

Layer::~Layer()
{
    nodes_.clear();
}

void Layer::init_weights(double mean, double variance)
{
    for (size_t i = 0; i < nodes_.size(); ++i) {
        nodes_[i].init_weights(mean, variance);
    }
}

void Layer::forward_pass(const Layer &last_layer)
{
    const vector<double> &inputs = last_layer.get_outputs();
    for (size_t i = 0; i < nodes_.size(); ++i) {
        outputs_[i] = nodes_[i].calculate_output_from(inputs);
    }
}

void Layer::calculate_output_layer_pdrv_of_error_wrt_input_weights(vector<double> const &inputs, const vector<double> &targets)
{
    assert(targets.size() == nodes_.size());
    for (size_t i = 0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights(inputs, targets[i]);
    }
}

void Layer::calculate_middle_layer_pdrv_of_error_wrt_input_weights(const vector<double> &inputs, const Layer &output_layer)
{
    for (size_t i = 0; i < nodes_.size(); ++i) {
        nodes_[i].calculate_pdrv_of_error_wrt_input_weights(i, inputs, output_layer);
    }
}

void Layer::save_weights_to(vector<double> &weights) const
{
    for (size_t i = 0; i < nodes_.size(); ++i) {
        const vector<double> &w = nodes_[i].weights();
        weights.insert(weights.end(), w.begin(), w.end());
    }
}

size_t Layer::set_input_weights(const double *weights)
{
    size_t pos = 0;
    for (size_t i = 0; i < nodes_.size(); ++i) {
        pos += nodes_[i].set_input_weights(weights + pos);
    }
    return pos;
}

void Layer::set_outputs(const vector<double> &value)
{
    outputs_ = value;
    for (size_t i = 0; i < nodes_.size(); ++i) {
        nodes_[i].value_ = outputs_[i];
    }
}

const vector<double> &Layer::get_outputs() const
{
    return outputs_;
}

size_t Layer::num_of_neurons() const
{
    return nodes_.size();
}

void Layer::clear_pdrvs()
{
    for (size_t i = 0; i < nodes_.size(); ++i) {
        nodes_[i].clear_pdrvs();
    }
}

void Layer::update_weights(double learning_rate, double momentum)
{
    for (size_t i = 0; i < nodes_.size(); ++i) {
        nodes_[i].update_weights(learning_rate, momentum);
    }
}

class NeuralNetwork {
public:
    NeuralNetwork(int numInputs, vector<int> const &hiddenLayerTopology, int numOutputs);
    ~NeuralNetwork();
    void init_weights(double mean_value, double variance_value);
    size_t number_of_weights() const;
    void run(vector<double> const &inputs, vector<double> &outputs, vector<double> const &expects);
    void forward_pass(vector<double> const &inputs);
    void backward_pass(vector<double> &outputs, vector<double> const &expects);
    void save_weights_to(vector<double> &weights) const;
    void set_weights(vector<double> const &weights);
    void clear_pdrvs();
    void update_weights(double learning_rate, double momentum);
    double error() const { return last_error_; }

protected:
    void feed_forward();

private:
    vector<Layer> layers_;
    double last_error_;

    // copying is disabled
    NeuralNetwork(const NeuralNetwork &rhs) {}
    NeuralNetwork &operator=(const NeuralNetwork &rhs) { return *this; }
};

NeuralNetwork::NeuralNetwork(int nInputs, const vector<int> &hiddenLayerTopology, int nOutputs)
{
    layers_.resize(hiddenLayerTopology.size() + 2);  // hidden layers + input layer + output layer
    int nNeuron = nInputs;
    layers_[0] = Layer(nNeuron, 0);  // layer zero holds the inputs
    for (size_t i = 0; i < hiddenLayerTopology.size(); ++i) {
        layers_[i + 1] = Layer(hiddenLayerTopology[i], layers_[i].num_of_neurons());
    }
    nNeuron = nOutputs;
    layers_.back() = Layer(nNeuron, hiddenLayerTopology.back());
}

NeuralNetwork::~NeuralNetwork()
{
    layers_.clear();
}

void NeuralNetwork::init_weights(double mean, double variance)
{
    for (size_t i = 1; i < layers_.size(); ++i) {
        layers_[i].init_weights(mean, variance);
    }
}

size_t NeuralNetwork::number_of_weights() const
{
    vector<double> weights;
    this->save_weights_to(weights);
    return weights.size();
}

void NeuralNetwork::run(const vector<double> &inputs, vector<double> &outputs, vector<double> const &expects)
{
    forward_pass(inputs);
    outputs = layers_.back().get_outputs();
    backward_pass(outputs, expects);
}

void NeuralNetwork::forward_pass(const vector<double> &inputs)
{
    // feed forward
    layers_[0].set_outputs(inputs);
    feed_forward();
}

void NeuralNetwork::backward_pass(vector<double> &outputs, const vector<double> &expects)
{
    // calculate errors
    last_error_ = 0;
    for (size_t i = 0; i < outputs.size(); ++i) {
        last_error_ += error_func(expects[i], outputs[i]);
    }

    // calculate partial derivatives, starting with the last layer (output layer)
    vector<double> layer_inputs = layers_[layers_.size() - 2].get_outputs();
    layers_.back().calculate_output_layer_pdrv_of_error_wrt_input_weights(layer_inputs, expects);

    // back propagate through the hidden layers
    for (int i = (int)layers_.size() - 2; i > 0; --i) {
        vector<double> layer_inputs = layers_[i - 1].get_outputs();
        layers_[i].calculate_middle_layer_pdrv_of_error_wrt_input_weights(layer_inputs, layers_[i + 1]);
    }
}

void NeuralNetwork::feed_forward()
{
    for (size_t i = 1; i < layers_.size(); ++i) {
        layers_[i].forward_pass(layers_[i - 1]);
    }
}

void NeuralNetwork::clear_pdrvs()
{
    for (size_t i = 1; i < layers_.size(); ++i) {
        layers_[i].clear_pdrvs();
    }
}

void NeuralNetwork::update_weights(double learning_rate, double momentum)
{
    // update weights
    for (size_t i = 1; i < layers_.size(); ++i) {
        layers_[i].update_weights(learning_rate, momentum);
    }
}

void NeuralNetwork::save_weights_to(vector<double> &weights) const
{
    weights.clear();
    for (size_t i = 1; i < layers_.size(); ++i) {
        layers_[i].save_weights_to(weights);
    }
}

void NeuralNetwork::set_weights(const vector<double> &weights)
{
    int pos = 0;
    for (size_t i = 1; i < layers_.size(); ++i) {
        pos += layers_[i].set_input_weights(&weights[pos]);
    }
}

struct Genome {
    double fitness_;
    vector<double> genes_;

    Genome(size_t numGenes = 0);
    void copy_genes_from(Genome const &source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate);
    void report() const;
    bool operator<(const Genome &rhs) const { return fitness_ > rhs.fitness_; }
};

Genome::Genome(size_t numGenes)
{
    assert(numGenes && "number of genes can't be zero");
    genes_.resize(numGenes);
}

void Genome::copy_genes_from(const Genome &source, size_t from_index, size_t to_index, size_t replace_start_index, double mutate_rate)
{
    for (size_t i = from_index, cgene = 0; i < to_index; ++i, ++cgene) {
        genes_[replace_start_index + cgene] = source.genes_[i];
    }
}

void Genome::report() const
{
    cout << "weights ";
    for (size_t i = 0; i < genes_.size(); ++i) {
        cout << genes_[i] << " ";
    }
}

int main()
{
    // XOR test: two inputs, two outputs
    const int numSamples = 4;
    const int numInputPoints = 2;
    const int numOutputPoints = 2;

    int iSeed = time(NULL) % 1000000;
    g_rng.seed(iSeed);
    cout << "use seed " << iSeed << endl;

    vector<int> hidden_layers_topology;  // hidden layers, from input to output
    hidden_layers_topology.push_back(2);

    NeuralNetwork network(numInputPoints, hidden_layers_topology, numOutputPoints);
    network.init_weights(0, 1.0);

    const size_t numGenes = network.number_of_weights();
    Genome entity(numGenes);
    network.save_weights_to(entity.genes_);

    vector<double> inputs(numInputPoints);
    vector<double> outputs(numOutputPoints);
    vector<double> expects(2);
    vector<double> returns(numOutputPoints * numSamples);

    network.set_weights(entity.genes_);

    const double learning_rate = 0.001;
    const double momentum = 0.9;  // not used when updating weights with RMSprop

    while (g_generation < 100000) {
        cout << "generation " << g_generation << " ";
        ++g_generation;
        int rno = 0;

        /*
        inputs[0] = 0.05; inputs[1] = 0.1;
        expects[0] = 0.01; expects[1] = 0.99;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];
        */

        inputs[0] = 0; inputs[1] = 0;
        expects[0] = 1; expects[1] = 0;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];

        inputs[0] = 1; inputs[1] = 1;
        expects[0] = 1; expects[1] = 0;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];

        inputs[0] = 0; inputs[1] = 1;
        expects[0] = 0; expects[1] = 1;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];

        inputs[0] = 1; inputs[1] = 0;
        expects[0] = 0; expects[1] = 1;
        network.run(inputs, outputs, expects);
        returns[rno++] = outputs[0];
        returns[rno++] = outputs[1];

        cout << "network error: " << network.error() << " ";
        network.save_weights_to(entity.genes_);
        entity.report();
        cout << "|| results ";
        for (size_t i = 0; i < returns.size(); ++i) {
            cout << returns[i] << ", ";
        }
        cout << "\n";

        network.update_weights(learning_rate, momentum);
        network.clear_pdrvs();

        if (network.error() < network_error_epsilon) {
            cout << "solution found!" << endl;
            return 0;
        }
    }

    cout << "solution not found! adjust learning rate, etc. and try again!" << endl;
    return 0;
}
The only implementation detail worth noting is that after running each sample we accumulate the partial derivatives propagated back, and only call update_weights at the end to update the weights. We can get away with this because XOR has only four possible inputs; normally you would process the training data in mini-batches instead (a sketch of that variant follows below). Remember to clear the accumulated partial derivatives after updating the weights, before running the next sample.
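For reference, here is a minimal sketch of what a mini-batch loop could look like on top of the same NeuralNetwork API; the Sample struct, train_one_epoch helper, and batch size are my own hypothetical additions, not part of the original program:

// Hypothetical mini-batch training helper built on the classes above.
// run() accumulates partial derivatives; update_weights()/clear_pdrvs() apply and reset them.
struct Sample {
    vector<double> inputs;
    vector<double> expects;
};

void train_one_epoch(NeuralNetwork &network, const vector<Sample> &samples,
                     size_t batch_size, double learning_rate, double momentum)
{
    vector<double> outputs;
    for (size_t start = 0; start < samples.size(); start += batch_size) {
        size_t end = std::min(start + batch_size, samples.size());
        // accumulate gradients over one mini-batch
        for (size_t s = start; s < end; ++s) {
            network.run(samples[s].inputs, outputs, samples[s].expects);
        }
        network.update_weights(learning_rate, momentum);  // apply the accumulated gradients
        network.clear_pdrvs();                            // clear before the next mini-batch
    }
}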
You can try changing the activation function used. One observation: with gradient descent, depending on how the weights happen to be initialized, the network can get stuck in a local minimum and sometimes never reaches a solution at all... In comparison, the GA version rarely fails XD. The sigmoid function can be considered the default activation function from the days when neural networks were born, but once we had that foundation, the technique (neural networks) gradually evolved down a path of its own... How did nature ever evolve this solution? It really makes you marvel!
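Switching to tanh, for example, only requires pointing the two wrapper functions at a different specialization, mirroring the commented-out lines already in the listing:

// Use tanh instead of leaky ReLU: edit the two wrappers near the top of the file.
double activate_func(double x)     { return activate<TANH>(x); }
double activate_drv_func(double y) { return activate_drv<TANH>(y); }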