// convolutional neural network c header library
// inspired by euske's nn1
// meant to be synthesized into RTL through Vitis HLS for an FPGA implementation

#include <stdlib.h>
#include <math.h>
#include <string.h>

typedef enum {
    input,
    conv,
    max_pool,
    fully_connected
} ltype;

typedef enum {
    fc_input,
    fc_hidden,
    fc_output,
} fcpos;

typedef enum {
    a_sigmoid,
    a_softmax,
} activation;

typedef struct {
    ltype type;

    int height;
    int width;
    int channels; // in this case, "channels" are the number of filters that are coming in

    union {
        struct {
            int num_filters;
            int filter_size; // single integer b/c filter will usually be square shaped
            int stride;
            int zero_padding; // single integer for how many layers of zero padding
            int input_height;
            int input_width;
            int input_channels;
            float* weights;
            float* biases;
        } conv_params;

        struct {
            int pool_size; // single integer again
            int stride;
            int input_height;
            int input_width;
        } pool_params;

        struct {
            int input_size;
            int output_size;
            float* weights;
            float* biases;
            activation type;
        } fc_params;
    } params;

    float* output;
    float* delta;
    float* pre_activation;
    float (*activation_g)(float);
} Layer;

typedef struct {
    Layer** layers;
    int num_layers;
} Network;

Network* create_network(int capacity) {
    Network* network = (Network*)malloc(sizeof(Network));
    network->layers = (Layer**)malloc(capacity * sizeof(Layer*));
    network->num_layers = capacity;
    return network;
}

float he_init(int fan_in) {
    float scale = sqrt(2.0f / fan_in);
    float random = (float)rand() / RAND_MAX * 2 - 1;
    return random * scale;
}

float glorot_init(int fan_in, int fan_out) {
    float limit = sqrt(6.0f / (fan_in + fan_out));
    float random = (float)rand() / RAND_MAX;
    return random * 2 * limit - limit;
}

float relu(float x) {
    return x > 0 ? x : 0;
}

float sigmoid(float x) {
    return 1 / (1 + exp(-x));
}

float relu_g(float x) {
    return x > 0 ? 1 : 0;
}
float sigmoid_g(float x) {
    float sig = sigmoid(x);
    return sig * (1 - sig);
}

void softmax(float* input, float* output, int size) {
    float max = input[0];
    for(int i = 1; i < size; i++) {
        if(input[i] > max) {
            max = input[i];
        }
    }

    float sum = 0;
    for(int i = 0; i < size; i++) {
        output[i] = exp(input[i] - max);
        sum += output[i];
    }

    for(int i = 0; i < size; i++) {
        output[i] /= sum;
    }
}

Layer* create_input(int height, int width, int channels) {
    Layer* layer = (Layer*)malloc(sizeof(Layer));
    layer->type = input;
    layer->height = height;
    layer->width = width;
    layer->channels = channels;
    layer->output = (float*)calloc(height * width * channels, sizeof(float));
    return layer;
}

Layer* create_conv(int input_height, int input_width, int input_channels, int num_filters, int filter_size, int stride, int padding) {
    Layer* layer = (Layer*)malloc(sizeof(Layer));
    layer->type = conv;
    layer->params.conv_params.num_filters = num_filters;
    layer->params.conv_params.filter_size = filter_size;
    layer->params.conv_params.stride = stride;
    layer->params.conv_params.zero_padding = padding;
    layer->params.conv_params.input_height = input_height;
    layer->params.conv_params.input_width = input_width;
    layer->params.conv_params.input_channels = input_channels;

    // output dimensions
    // https://cs231n.github.io/convolutional-networks/
    int output_h = (input_height + 2 * padding - filter_size) / stride + 1;
    int output_w = (input_width + 2 * padding - filter_size) / stride + 1;
    layer->height = output_h;
    layer->width = output_w;
    layer->channels = num_filters;

    layer->activation_g = relu_g; // conv layer uses relu, use HE init
    int weights_size = num_filters * input_channels * filter_size * filter_size;
    int fan_in = input_channels * filter_size * filter_size;
    layer->params.conv_params.weights = (float*)calloc(weights_size, sizeof(float));
    for (int i = 0; i < weights_size; i++) {
        layer->params.conv_params.weights[i] = he_init(fan_in);
    }
    layer->params.conv_params.biases = (float*)calloc(num_filters, sizeof(float));

    layer->output = (float*)calloc(output_h * output_w * num_filters, sizeof(float));
    // delta holds the gradient with respect to this layer's input, so size it for the input
    layer->delta = (float*)calloc(input_height * input_width * input_channels, sizeof(float));
    layer->pre_activation = (float*)calloc(output_h * output_w * num_filters, sizeof(float));
    return layer;
}

Layer* create_maxpool(int input_height, int input_width, int input_channels, int pool_size, int stride) {
    Layer* layer = (Layer*)malloc(sizeof(Layer));
    layer->type = max_pool;
    layer->params.pool_params.pool_size = pool_size;
    layer->params.pool_params.stride = stride;
    layer->params.pool_params.input_height = input_height;
    layer->params.pool_params.input_width = input_width;

    // output dimensions
    // https://cs231n.github.io/convolutional-networks/
    int output_h = (input_height - pool_size) / stride + 1;
    int output_w = (input_width - pool_size) / stride + 1;
    layer->height = output_h;
    layer->width = output_w;
    layer->channels = input_channels;

    layer->output = (float*)calloc(output_h * output_w * input_channels, sizeof(float));
    // delta holds the gradient with respect to this layer's input, so size it for the input
    layer->delta = (float*)calloc(input_height * input_width * input_channels, sizeof(float));
    return layer;
}

Layer* create_fc(int output_size, int input_size, activation type) {
    Layer* layer = (Layer*)malloc(sizeof(Layer));
    layer->type = fully_connected;
    layer->params.fc_params.input_size = input_size;
    layer->params.fc_params.output_size = output_size;
    layer->params.fc_params.type = type;

    // activation type can either be sigmoid or softmax (output layer)
    layer->activation_g = (type == a_sigmoid) ? sigmoid_g : NULL; // null is softmax (doesn't have a gradient)

    // use glorot initialization
    layer->params.fc_params.weights = (float*)calloc(output_size * input_size, sizeof(float));
    for (int i = 0; i < (output_size * input_size); i++) {
        layer->params.fc_params.weights[i] = glorot_init(input_size, output_size);
    }
    layer->params.fc_params.biases = (float*)calloc(output_size, sizeof(float));

    layer->height = 1;
    layer->width = 1;
    layer->channels = output_size;

    layer->output = (float*)calloc(output_size, sizeof(float));
    // delta is written with input-sized gradients during backprop and with output-sized
    // gradients at the network output, so size it for the larger of the two
    int delta_size = (input_size > output_size) ? input_size : output_size;
    layer->delta = (float*)calloc(delta_size, sizeof(float));
    layer->pre_activation = (float*)calloc(output_size, sizeof(float));
    return layer;
}
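/*
 * Worked example of the sizing above (illustrative only; the MNIST-like
 * 28x28x1 input and the layer sizes are assumptions, not part of the library):
 *
 *   create_conv(28, 28, 1, 8, 3, 1, 1)
 *     output_h = (28 + 2*1 - 3) / 1 + 1 = 28       -> 28 x 28 x 8 output
 *     weights  = 8 * 1 * 3 * 3 = 72, biases = 8
 *
 *   create_maxpool(28, 28, 8, 2, 2)
 *     output_h = (28 - 2) / 2 + 1 = 14              -> 14 x 14 x 8 output
 *
 *   create_fc(10, 14 * 14 * 8, a_softmax)
 *     weights  = 10 * 1568 = 15680, biases = 10     -> 10 class scores
 */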
void free_layer(Layer* layer) {
    switch (layer->type) {
        case input:
            free(layer->output);
            free(layer);
            break;
        case conv:
            free(layer->params.conv_params.weights);
            free(layer->params.conv_params.biases);
            free(layer->output);
            free(layer->delta);
            free(layer->pre_activation);
            free(layer);
            break;
        case max_pool:
            free(layer->output);
            free(layer->delta);
            free(layer);
            break;
        case fully_connected:
            free(layer->params.fc_params.weights);
            free(layer->params.fc_params.biases);
            free(layer->output);
            free(layer->delta);
            free(layer->pre_activation);
            free(layer);
            break;
    }
}

void destroy_network(Network* network) {
    if (!network) return;
    for (int i = 0; i < network->num_layers; i++) {
        if (network->layers[i]) {
            free_layer(network->layers[i]);
        }
    }
    free(network->layers);
    free(network);
}

void conv_forward(Layer* layer, float* input) {
    int padding = layer->params.conv_params.zero_padding;
    int stride = layer->params.conv_params.stride;
    int filter_size = layer->params.conv_params.filter_size;
    int num_filters = layer->params.conv_params.num_filters;
    int input_height = layer->params.conv_params.input_height;
    int input_width = layer->params.conv_params.input_width;
    int input_channels = layer->params.conv_params.input_channels;

    int padded_height = input_height + 2 * padding;
    int padded_width = input_width + 2 * padding;

    float* padded_input = (float*)calloc(padded_height * padded_width * input_channels, sizeof(float));
    for (int c = 0; c < input_channels; c++) {
        for (int h = 0; h < input_height; h++) {
            for (int w = 0; w < input_width; w++) {
                padded_input[c * padded_height * padded_width + (h + padding) * padded_width + (w + padding)] =
                    input[c * input_height * input_width + h * input_width + w];
            }
        }
    }

    int output_height = (padded_height - filter_size) / stride + 1;
    int output_width = (padded_width - filter_size) / stride + 1;
    int output_size = output_height * output_width * num_filters;

    // for every filter
    for(int f = 0; f < num_filters; f++) {
        for(int oh = 0; oh < output_height; oh++) {
            for(int ow = 0; ow < output_width; ow++) {
                float sum = 0;
                for(int c = 0; c < input_channels; c++) {
                    for(int fh = 0; fh < filter_size; fh++) {
                        for(int fw = 0; fw < filter_size; fw++) {
                            int ih = oh * stride + fh;
                            int iw = ow * stride + fw;
                            if (ih >= 0 && ih < padded_height && iw >= 0 && iw < padded_width) {
                                int input_idx = c * padded_height * padded_width + ih * padded_width + iw;
                                int weight_idx = f * input_channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw;
                                sum += padded_input[input_idx] * layer->params.conv_params.weights[weight_idx];
                            }
                        }
                    }
                }
                sum += layer->params.conv_params.biases[f];
                int output_idx = f * output_height * output_width + oh * output_width + ow;
                layer->pre_activation[output_idx] = sum;
                layer->output[output_idx] = relu(sum);
            }
        }
    }
    free(padded_input);
}

void maxpool_forward(Layer* layer, float* input) {
    int pool_size = layer->params.pool_params.pool_size;
    int stride = layer->params.pool_params.stride;
    // input dimensions come from the pool parameters; layer->height/width already hold the output size
    int input_height = layer->params.pool_params.input_height;
    int input_width = layer->params.pool_params.input_width;
    int input_channels = layer->channels;

    int output_height = layer->height;
    int output_width = layer->width;
    int output_size = output_height * output_width * input_channels;

    for(int c = 0; c < input_channels; c++) {
        for(int oh = 0; oh < output_height; oh++) {
            for(int ow = 0; ow < output_width; ow++) {
                float max_val = -INFINITY;
                for(int ph = 0; ph < pool_size; ph++) {
                    for(int pw = 0; pw < pool_size; pw++) {
                        int ih = oh * stride + ph;
                        int iw = ow * stride + pw;
                        float val = input[c * input_height * input_width + ih * input_width + iw];
                        if(val > max_val) {
                            max_val = val;
                        }
                    }
                }
                layer->output[c * output_height * output_width + oh * output_width + ow] = max_val;
            }
        }
    }
}
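/*
 * Buffer layout used by the forward passes above and the backward passes below:
 * activations are stored channel-major,
 *   index = c * height * width + h * width + w,
 * and conv filters are stored as
 *   index = f * (channels * filter_size * filter_size) + c * (filter_size * filter_size) + fh * filter_size + fw.
 */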
void fc_forward(Layer* layer, float* input) {
    int output_size = layer->params.fc_params.output_size;
    int input_size = layer->params.fc_params.input_size;

    // flatten
    float* flattened_input = (float*)calloc(input_size, sizeof(float));
    for(int i = 0; i < input_size; i++) {
        flattened_input[i] = input[i];
    }

    // matmul (output = bias + (input * weight))
    float* temp_output = (float*)calloc(output_size, sizeof(float));
    for(int o = 0; o < output_size; o++) {
        float sum = 0;
        for(int i = 0; i < input_size; i++) {
            sum += flattened_input[i] * layer->params.fc_params.weights[o * input_size + i];
        }
        sum += layer->params.fc_params.biases[o];
        temp_output[o] = sum;
    }

    // apply the correct activation (sigmoid for non-output layers, softmax for the output)
    if(layer->params.fc_params.type == a_sigmoid) {
        for(int o = 0; o < output_size; o++) {
            layer->pre_activation[o] = temp_output[o];
            layer->output[o] = sigmoid(temp_output[o]);
        }
    } else if(layer->params.fc_params.type == a_softmax) {
        softmax(temp_output, layer->output, output_size);
    }

    free(temp_output);
    free(flattened_input);
}

void forward_propagation(Layer* layer, float* input_fc) {
    int input_size;
    switch(layer->type) {
        case input:
            // input to layer->output
            input_size = (layer->height * layer->width * layer->channels);
            for(int i = 0; i < input_size; i++) {
                layer->output[i] = input_fc[i];
            }
            break;
        case conv:
            conv_forward(layer, input_fc);
            break;
        case max_pool:
            maxpool_forward(layer, input_fc);
            break;
        case fully_connected:
            fc_forward(layer, input_fc);
            break;
    }
}

void network_forward(Network* network, float* input) {
    float* current_input = input;
    for (int i = 0; i < network->num_layers; i++) {
        forward_propagation(network->layers[i], current_input);
        current_input = network->layers[i]->output;
    }
}

void fc_backward(Layer* layer, float* prev_delta, float* input, float learning_rate) {
    int output_size = layer->params.fc_params.output_size;
    int input_size = layer->params.fc_params.input_size;

    float* gradient;
    if(layer->params.fc_params.type == a_softmax) {
        gradient = (float*)malloc(output_size * sizeof(float));
        for(int i = 0; i < output_size; i++) {
            gradient[i] = layer->output[i];
            if(prev_delta[i] > 0.5) { // one hot encoded
                gradient[i] -= 1.0;
            }
        }
    } else {
        gradient = prev_delta;
    }

    // update weights and biases
    for(int o = 0; o < output_size; o++) {
        for(int i = 0; i < input_size; i++) {
            layer->params.fc_params.weights[o * input_size + i] -= learning_rate * gradient[o] * input[i];
        }
        layer->params.fc_params.biases[o] -= learning_rate * gradient[o];
    }

    // gradient with respect to the input (chain through this layer's activation at each output)
    if(layer->activation_g) {
        for(int i = 0; i < input_size; i++) {
            float sum = 0;
            for(int o = 0; o < output_size; o++) {
                sum += layer->params.fc_params.weights[o * input_size + i]
                     * gradient[o]
                     * layer->activation_g(layer->pre_activation[o]);
            }
            layer->delta[i] = sum;
        }
    }

    if(layer->params.fc_params.type == a_softmax) {
        free(gradient);
    }
}
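/*
 * Backward-pass convention used below: each layer's delta buffer receives the
 * gradient with respect to that layer's input, and network_backward hands it
 * to the previous layer as prev_delta. The output layer's delta is initialized
 * directly to (output - label). Weight updates are plain SGD
 * (w -= learning_rate * gradient).
 */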
void conv_backward(Layer* layer, float* prev_delta, float* input, float learning_rate) {
    int num_filters = layer->params.conv_params.num_filters;
    int channels = layer->params.conv_params.input_channels;
    int filter_size = layer->params.conv_params.filter_size;
    int input_height = layer->params.conv_params.input_height;
    int input_width = layer->params.conv_params.input_width;
    int padding = layer->params.conv_params.zero_padding;
    int stride = layer->params.conv_params.stride;

    int output_height = layer->height;
    int output_width = layer->width;

    // gradient w/respect to filters
    for(int f = 0; f < num_filters; f++) {
        for(int c = 0; c < channels; c++) {
            for(int fh = 0; fh < filter_size; fh++) {
                for(int fw = 0; fw < filter_size; fw++) {
                    float grad = 0;
                    for(int oh = 0; oh < output_height; oh++) {
                        for(int ow = 0; ow < output_width; ow++) {
                            int ih = oh * stride + fh - padding;
                            int iw = ow * stride + fw - padding;
                            if(ih >= 0 && ih < input_height && iw >= 0 && iw < input_width) {
                                grad += input[c * input_height * input_width + ih * input_width + iw]
                                      * prev_delta[f * output_height * output_width + oh * output_width + ow];
                            }
                        }
                    }
                    int index = f * channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw;
                    layer->params.conv_params.weights[index] -= learning_rate * grad;
                }
            }
        }
    }

    // gradient w/respect to biases
    for(int f = 0; f < num_filters; f++) {
        float grad = 0;
        for(int oh = 0; oh < output_height; oh++) {
            for(int ow = 0; ow < output_width; ow++) {
                grad += prev_delta[f * output_height * output_width + oh * output_width + ow];
            }
        }
        layer->params.conv_params.biases[f] -= learning_rate * grad;
    }

    // gradient with respect to inputs (chain through the relu at each output position)
    for(int c = 0; c < channels; c++) {
        for(int ih = 0; ih < input_height; ih++) {
            for(int iw = 0; iw < input_width; iw++) {
                float grad = 0;
                for(int f = 0; f < num_filters; f++) {
                    for(int fh = 0; fh < filter_size; fh++) {
                        for(int fw = 0; fw < filter_size; fw++) {
                            int oh = (ih - fh + padding) / stride;
                            int ow = (iw - fw + padding) / stride;
                            if((ih - fh + padding) % stride == 0 && (iw - fw + padding) % stride == 0 &&
                               oh >= 0 && oh < output_height && ow >= 0 && ow < output_width) {
                                int w_index = f * channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw;
                                int out_index = f * output_height * output_width + oh * output_width + ow;
                                grad += layer->params.conv_params.weights[w_index]
                                      * prev_delta[out_index]
                                      * layer->activation_g(layer->pre_activation[out_index]);
                            }
                        }
                    }
                }
                layer->delta[c * input_height * input_width + ih * input_width + iw] = grad;
            }
        }
    }
}

void maxpool_backward(Layer* layer, float* prev_delta, float* input, float learning_rate) {
    int pool_size = layer->params.pool_params.pool_size;
    int stride = layer->params.pool_params.stride;
    int input_height = layer->params.pool_params.input_height;
    int input_width = layer->params.pool_params.input_width;
    int channels = layer->channels;

    // Zero initialize deltas
    memset(layer->delta, 0, input_height * input_width * channels * sizeof(float));

    int output_height = layer->height;
    int output_width = layer->width;

    for(int c = 0; c < channels; c++) {
        for(int oh = 0; oh < output_height; oh++) {
            for(int ow = 0; ow < output_width; ow++) {
                // finds max value
                int maxI = -1, maxJ = -1;
                float maxVal = -INFINITY;
                for(int ph = 0; ph < pool_size; ph++) {
                    for(int pw = 0; pw < pool_size; pw++) {
                        int ih = oh * stride + ph;
                        int iw = ow * stride + pw;
                        // checks bounds
                        if (ih < input_height && iw < input_width) {
                            float val = input[c * input_height * input_width + ih * input_width + iw];
                            if(val > maxVal) {
                                maxVal = val;
                                maxI = ih;
                                maxJ = iw;
                            }
                        }
                    }
                }
                // only propagate gradient if a valid max position is found
                if(maxI != -1 && maxJ != -1) {
                    int delta_idx = c * output_height * output_width + oh * output_width + ow;
                    layer->delta[c * input_height * input_width + maxI * input_width + maxJ] = prev_delta[delta_idx];
                }
            }
        }
    }
}
void backward_propagation(Layer* layer, float* prev_delta, float* input_fc, float learning_rate) {
    switch(layer->type) {
        case fully_connected:
            fc_backward(layer, prev_delta, input_fc, learning_rate);
            break;
        case conv:
            conv_backward(layer, prev_delta, input_fc, learning_rate);
            break;
        case max_pool:
            maxpool_backward(layer, prev_delta, input_fc, learning_rate);
            break;
        case input:
            // No backpropagation for input layer
            break;
    }
}

void network_backward(Network* network, float* label, float learning_rate) {
    // output
    Layer* output_layer = network->layers[network->num_layers - 1];

    // output gradient
    for(int o = 0; o < output_layer->channels; o++) {
        output_layer->delta[o] = output_layer->output[o] - label[o];
    }

    // backprop
    for(int i = network->num_layers - 2; i >= 0; i--) {
        Layer* current_layer = network->layers[i];
        Layer* next_layer = network->layers[i + 1];
        // each layer updates its weights from its own input, i.e. the previous layer's output
        float* layer_input = (i > 0) ? network->layers[i - 1]->output : current_layer->output;
        backward_propagation(current_layer, next_layer->delta, layer_input, learning_rate);
    }
}
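/*
 * Minimal usage sketch (not part of the library): builds a small MNIST-like
 * network and runs one forward/backward pass on random data. The 28x28x1
 * input, layer sizes, learning rate, and the CNN_DEMO guard are assumptions
 * for illustration only. Compile with -DCNN_DEMO to build it.
 */
#ifdef CNN_DEMO
#include <stdio.h>

int main(void) {
    Network* net = create_network(4);
    net->layers[0] = create_input(28, 28, 1);
    net->layers[1] = create_conv(28, 28, 1, 8, 3, 1, 1);    // -> 28 x 28 x 8
    net->layers[2] = create_maxpool(28, 28, 8, 2, 2);       // -> 14 x 14 x 8
    net->layers[3] = create_fc(10, 14 * 14 * 8, a_softmax); // -> 10 class scores

    // dummy image and one-hot label
    float image[28 * 28];
    float label[10] = {0};
    for (int i = 0; i < 28 * 28; i++) {
        image[i] = (float)rand() / RAND_MAX;
    }
    label[3] = 1.0f;

    network_forward(net, image);
    network_backward(net, label, 0.01f);

    // print the softmax outputs after one update
    for (int i = 0; i < 10; i++) {
        printf("%f ", net->layers[3]->output[i]);
    }
    printf("\n");

    destroy_network(net);
    return 0;
}
#endif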