From 90207ab8347d911eb5f08deb05c5660e67589b0e Mon Sep 17 00:00:00 2001
From: vikshar
Date: Fri, 17 Jan 2025 00:26:17 -0600
Subject: [PATCH] edit readme

---
 README.md |  40 +++++--
 cnn-hls.c | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 339 insertions(+), 11 deletions(-)
 create mode 100644 cnn-hls.c

diff --git a/README.md b/README.md
index 59f3c16..5246f9d 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,33 @@
-# nn - implementation of neural networks in c
+# nn - Neural Networks in C
 
-implements neural networks in c, targets embedded systems (microcontrollers, fpgas)
+This repository implements various neural networks in C, with a focus on targeting embedded systems and building hardware accelerators (FPGA-based, ASIC, etc.). \
+This project was created as part of my independent study course, in which I am researching the design of hardware accelerators for high-performance workloads.
 
-#### current implementations
-`snn.c` - a simple feedforward neural network written in ~150loc. \
-`cnn.c` - TODO, implements a convolutional neural network \
-`cnn-hls.c` - TODO, has fpga hls specific types/pragmas in order to synthesize to verilog; run on an fpga \
+### current implementations (project index)
+`snn.c` - A simple feedforward neural network written in ~150 LOC. Depends on the C standard library and [GSL](https://www.gnu.org/software/gsl/) \
+`cnn.c` - A fully featured CNN library in ~600 LOC. Depends solely on the C standard library \
+`cnn-hls.c` - The version of `cnn.c` with HLS-specific optimizations (pragmas, systolic-array multiplication, etc.); meant to be synthesized with Vitis HLS into an FPGA-based CNN accelerator \
+`mnist.c` - Driver code for `cnn.c` that trains on the [MNIST](https://yann.lecun.com/exdb/mnist/) dataset
 
-depends on native c libraries and [gsl](https://www.gnu.org/software/gsl/)
+### usage
+`mnist.c` is a complete example of how the library is used, but basic usage boils down to a few simple steps:
 
-### future goals
-cnn w/ pragmas -> successfully compiled to verilog using vivado/vitus \
-self-made matrix multiplication library, relying only on native c ones \
-code cleanup and optimization
+1) Import `cnn.c` into your code
+2) Create a network and its layers:
+```c
+// an example of a LeNet-5-inspired 8-layer network
+Network* network = create_network(8);
+network->layers[0] = create_input(IMG_HEIGHT, IMG_WIDTH, 1);
+network->layers[1] = create_conv(IMG_HEIGHT, IMG_WIDTH, 1, 6, 5, 1, 2);
+network->layers[2] = create_maxpool(network->layers[1]->height, network->layers[1]->width, network->layers[1]->channels, 2, 2);
+network->layers[3] = create_conv(network->layers[2]->height, network->layers[2]->width, network->layers[2]->channels, 16, 5, 1, 0);
+network->layers[4] = create_maxpool(network->layers[3]->height, network->layers[3]->width, network->layers[3]->channels, 2, 2);
+network->layers[5] = create_fc(120, network->layers[4]->height * network->layers[4]->width * network->layers[4]->channels, a_sigmoid);
+network->layers[6] = create_fc(84, 120, a_sigmoid);
+network->layers[7] = create_fc(NUM_CLASSES, 84, a_softmax);
+```
+3) Run forward and backpropagation through the network (see the sketch below)
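+
+A minimal sketch of a training step; the `forward`, `backward`, and `update_weights` names here are illustrative assumptions, and `mnist.c` remains the authoritative driver code:
+```c
+// hypothetical entry-point names; the actual API lives in cnn.c
+for (int epoch = 0; epoch < num_epochs; epoch++) {
+    for (int i = 0; i < num_train_images; i++) {
+        forward(network, images[i]);            // propagate one image through every layer
+        backward(network, labels[i]);           // compute per-layer deltas from the output loss
+        update_weights(network, learning_rate); // apply the gradient step
+    }
+}
+```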
+
+## Project Overview and Explanation
+### Abstract
+For my project, I propose an implementation of a Convolutional Neural Network (CNN) based handwritten digit classifier for the MNIST dataset on a Field Programmable Gate Array (FPGA). I use the High-Level Synthesis (HLS) tool Vitis HLS, developed by [AMD/Xilinx](https://www.xilinx.com/products/boards-and-kits.html), to implement the accelerator in C, eliminating the need to write any code in an HDL such as Verilog or VHDL. To reduce performance losses, I implement a systolic-array-based architecture and apply techniques such as pipelining, loop unrolling, and memory partitioning. Through this project, I aim to highlight the potential of FPGAs to offer reduced power consumption and latency for machine learning tasks, creating a more sustainable computing environment.
diff --git a/cnn-hls.c b/cnn-hls.c
new file mode 100644
index 0000000..f0a46ba
--- /dev/null
+++ b/cnn-hls.c
@@ -0,0 +1,310 @@
+#include "ap_fixed.h"
+#include "hls_stream.h"
+#include "hls_math.h"
+#include <string.h> // for memcpy
+
+// Fixed-point definitions for better hardware efficiency
+typedef ap_fixed<16,8> data_t;   // 16 bits total, 8 integer bits
+typedef ap_fixed<16,8> weight_t;
+typedef ap_fixed<32,16> acc_t;   // wider accumulator to prevent overflow
+
+// Enums remain the same
+typedef enum {
+    input,
+    conv,
+    max_pool,
+    fully_connected
+} ltype;
+
+typedef enum {
+    fc_input,
+    fc_hidden,
+    fc_output,
+} fcpos;
+
+typedef enum {
+    a_sigmoid,
+    a_softmax,
+} activation;
+
+// Maximum size definitions for static arrays
+#define MAX_LAYER_SIZE 1024
+#define MAX_FILTER_SIZE 11
+#define MAX_CHANNELS 256
+#define MAX_FILTERS 256
+
+// Layer struct optimized for HLS
+struct Layer {
+    ltype type;
+    int height;
+    int width;
+    int channels;
+
+    union {
+        struct {
+            int num_filters;
+            int filter_size;
+            int stride;
+            int zero_padding;
+            int input_height;
+            int input_width;
+            int input_channels;
+            weight_t weights[MAX_FILTERS][MAX_CHANNELS][MAX_FILTER_SIZE][MAX_FILTER_SIZE];
+            data_t biases[MAX_FILTERS];
+        } conv_params;
+
+        struct {
+            int pool_size;
+            int stride;
+            int input_height;
+            int input_width;
+        } pool_params;
+
+        struct {
+            int output_size;
+            weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE];
+            data_t biases[MAX_LAYER_SIZE];
+            activation type;
+        } fc_params;
+    } params;
+
+    data_t output[MAX_LAYER_SIZE];
+    data_t delta[MAX_LAYER_SIZE];
+    data_t pre_activation[MAX_LAYER_SIZE];
+};
+
+// Helper functions
+data_t sigmoid(data_t x) {
+    #pragma HLS INLINE
+    return 1.0 / (1.0 + hls::exp(-x));
+}
+
+data_t relu(data_t x) {
+    #pragma HLS INLINE
+    return (x > 0) ? x : 0;
+}
+
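+// The next routine computes a matrix-vector product with a 1-D array of
+// accumulating processing elements (PEs): output[i] = sum over k of
+// weights[i][k] * input[k]. On each step k, the shared input[k] is broadcast
+// to the (partially unrolled) PE column, and every PE adds its own weight
+// term to its running sum in pe_array.
+// Tiny worked example with assumed values: W = [[1, 2], [3, 4]], x = [5, 6]
+// gives W*x = [1*5 + 2*6, 3*5 + 4*6] = [17, 39].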
+// Systolic array matrix multiplication for fully connected layers
+void systolic_matrix_multiply(
+    const weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE],
+    const data_t input[MAX_LAYER_SIZE],
+    acc_t output[MAX_LAYER_SIZE],
+    int M, int N) {
+
+    #pragma HLS ARRAY_PARTITION variable=weights cyclic factor=16 dim=2
+    #pragma HLS ARRAY_PARTITION variable=input cyclic factor=16
+
+    static acc_t pe_array[MAX_LAYER_SIZE];
+    #pragma HLS ARRAY_PARTITION variable=pe_array cyclic factor=16
+
+    // Initialize processing elements
+    for (int i = 0; i < M; i++) {
+        #pragma HLS UNROLL factor=16
+        pe_array[i] = 0;
+    }
+
+    // Systolic computation
+    for (int k = 0; k < N; k++) {
+        for (int i = 0; i < M; i++) {
+            #pragma HLS PIPELINE II=1
+            #pragma HLS UNROLL factor=16
+            pe_array[i] += weights[i][k] * input[k];
+        }
+    }
+
+    // Write results
+    for (int i = 0; i < M; i++) {
+        #pragma HLS UNROLL factor=16
+        output[i] = pe_array[i];
+    }
+}
+
+// Optimized convolution forward pass
+void conv_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
+    #pragma HLS INLINE off
+
+    const int padding = layer.params.conv_params.zero_padding;
+    const int stride = layer.params.conv_params.stride;
+    const int filter_size = layer.params.conv_params.filter_size;
+    const int num_filters = layer.params.conv_params.num_filters;
+    const int input_height = layer.params.conv_params.input_height;
+    const int input_width = layer.params.conv_params.input_width;
+    const int input_channels = layer.params.conv_params.input_channels;
+
+    const int padded_height = input_height + 2 * padding;
+    const int padded_width = input_width + 2 * padding;
+    const int output_height = (padded_height - filter_size) / stride + 1;
+    const int output_width = (padded_width - filter_size) / stride + 1;
+
+    // Main convolution loops
+    CONV_FILTERS: for (int f = 0; f < num_filters; f++) {
+        CONV_OUTPUT_H: for (int oh = 0; oh < output_height; oh++) {
+            CONV_OUTPUT_W: for (int ow = 0; ow < output_width; ow++) {
+                #pragma HLS PIPELINE II=1
+
+                acc_t sum = 0;
+
+                CONV_CHANNELS: for (int c = 0; c < input_channels; c++) {
+                    CONV_KERNEL_H: for (int fh = 0; fh < filter_size; fh++) {
+                        CONV_KERNEL_W: for (int fw = 0; fw < filter_size; fw++) {
+                            #pragma HLS UNROLL factor=3
+
+                            // ih/iw are coordinates in the zero-padded frame
+                            int ih = oh * stride + fh;
+                            int iw = ow * stride + fw;
+
+                            // Only accumulate taps that land on real pixels;
+                            // taps in the padding border contribute zero and
+                            // are skipped.
+                            if (ih >= padding && ih < input_height + padding &&
+                                iw >= padding && iw < input_width + padding) {
+                                sum += input[c * input_height * input_width + (ih - padding) * input_width + (iw - padding)] *
+                                       layer.params.conv_params.weights[f][c][fh][fw];
+                            }
+                        }
+                    }
+                }
+
+                sum += layer.params.conv_params.biases[f];
+                int output_idx = f * output_height * output_width + oh * output_width + ow;
+                layer.pre_activation[output_idx] = sum;
+                layer.output[output_idx] = relu(sum);
+            }
+        }
+    }
+}
+
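+// Spatial output size for the convolution above follows
+// out = (in + 2*padding - filter_size) / stride + 1.
+// Worked example (values taken from the LeNet-5-style network in the README):
+// a 28x28 MNIST input with a 5x5 filter, stride 1, and padding 2 gives
+// (28 + 2*2 - 5) / 1 + 1 = 28, so the first conv layer preserves spatial size.
+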
+// Optimized max pooling forward pass
+void maxpool_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
+    #pragma HLS INLINE off
+
+    const int pool_size = layer.params.pool_params.pool_size;
+    const int stride = layer.params.pool_params.stride;
+    const int input_height = layer.height;
+    const int input_width = layer.width;
+    const int input_channels = layer.channels;
+
+    const int output_height = (input_height - pool_size) / stride + 1;
+    const int output_width = (input_width - pool_size) / stride + 1;
+
+    POOL_CHANNELS: for (int c = 0; c < input_channels; c++) {
+        POOL_OUTPUT_H: for (int oh = 0; oh < output_height; oh++) {
+            POOL_OUTPUT_W: for (int ow = 0; ow < output_width; ow++) {
+                #pragma HLS PIPELINE II=1
+
+                // Seed the running maximum with the window's first element;
+                // -INFINITY is not representable in ap_fixed and would wrap.
+                data_t max_val = input[c * input_height * input_width + (oh * stride) * input_width + (ow * stride)];
+
+                POOL_WINDOW_H: for (int ph = 0; ph < pool_size; ph++) {
+                    POOL_WINDOW_W: for (int pw = 0; pw < pool_size; pw++) {
+                        #pragma HLS UNROLL
+
+                        int ih = oh * stride + ph;
+                        int iw = ow * stride + pw;
+                        data_t val = input[c * input_height * input_width + ih * input_width + iw];
+                        max_val = (val > max_val) ? val : max_val;
+                    }
+                }
+
+                layer.output[c * output_height * output_width + oh * output_width + ow] = max_val;
+            }
+        }
+    }
+}
+
+// Optimized fully connected forward pass using systolic array
+void fc_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
+    #pragma HLS INLINE off
+
+    const int output_size = layer.params.fc_params.output_size;
+    const int input_size = layer.height * layer.width * layer.channels;
+
+    // Use systolic array for matrix multiplication
+    acc_t temp_output[MAX_LAYER_SIZE];
+    systolic_matrix_multiply(layer.params.fc_params.weights, input, temp_output, output_size, input_size);
+
+    // Add biases and apply activation
+    FC_OUTPUT: for (int o = 0; o < output_size; o++) {
+        #pragma HLS PIPELINE II=1
+
+        acc_t sum = temp_output[o] + layer.params.fc_params.biases[o];
+
+        if (layer.params.fc_params.type == a_sigmoid) {
+            layer.pre_activation[o] = sum;
+            layer.output[o] = sigmoid(sum);
+        } else {
+            layer.output[o] = sum; // for softmax, store raw values
+        }
+    }
+
+    // Apply softmax if needed
+    if (layer.params.fc_params.type == a_softmax) {
+        acc_t max_val = layer.output[0];
+        acc_t sum = 0;
+
+        // Find max value for numerical stability
+        SOFTMAX_MAX: for (int i = 1; i < output_size; i++) {
+            #pragma HLS PIPELINE II=1
+            max_val = (layer.output[i] > max_val) ? layer.output[i] : max_val;
+        }
+
+        // Compute exponentials and sum
+        SOFTMAX_EXP: for (int i = 0; i < output_size; i++) {
+            #pragma HLS PIPELINE II=1
+            layer.output[i] = hls::exp(layer.output[i] - max_val);
+            sum += layer.output[i];
+        }
+
+        // Normalize
+        SOFTMAX_NORM: for (int i = 0; i < output_size; i++) {
+            #pragma HLS PIPELINE II=1
+            layer.output[i] /= sum;
+        }
+    }
+}
+
+// Top-level function for HLS synthesis
+void cnn_forward(
+    data_t input[MAX_LAYER_SIZE],
+    data_t output[MAX_LAYER_SIZE],
+    Layer layers[],
+    int num_layers) {
+
+    #pragma HLS INTERFACE m_axi port=input offset=slave bundle=gmem0
+    #pragma HLS INTERFACE m_axi port=output offset=slave bundle=gmem1
+    #pragma HLS INTERFACE m_axi port=layers offset=slave bundle=gmem2
+    #pragma HLS INTERFACE s_axilite port=num_layers bundle=control
+    #pragma HLS INTERFACE s_axilite port=return bundle=control
+
+    data_t layer_input[MAX_LAYER_SIZE];
+
+    // Copy input to local buffer
+    memcpy(layer_input, input, MAX_LAYER_SIZE * sizeof(data_t));
+
+    // Process each layer
+    LAYER_LOOP: for (int i = 0; i < num_layers; i++) {
+        #pragma HLS LOOP_TRIPCOUNT min=1 max=20
+
+        Layer& current_layer = layers[i];
+
+        switch (current_layer.type) {
+            case conv:
+                conv_forward(current_layer, layer_input);
+                break;
+            case max_pool:
+                maxpool_forward(current_layer, layer_input);
+                break;
+            case fully_connected:
+                fc_forward(current_layer, layer_input);
+                break;
+            default:
+                break;
+        }
+
+        // Copy this layer's output into the input buffer for the next layer
+        memcpy(layer_input, current_layer.output, MAX_LAYER_SIZE * sizeof(data_t));
+    }
+
+    // Copy final output
+    memcpy(output, layer_input, MAX_LAYER_SIZE * sizeof(data_t));
+}
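+
+// Illustrative host/testbench usage (an assumed sketch, not part of the
+// synthesized design): the Layer array is expected to be populated with
+// weights and shape metadata by the host before invoking the kernel.
+// All names and shapes below are hypothetical.
+//
+//   static Layer layers[8];          // configured to mirror the LeNet-5-style
+//                                    // network from the README
+//   data_t image[MAX_LAYER_SIZE];    // flattened 28x28 input, rest zeroed
+//   data_t probs[MAX_LAYER_SIZE];    // probs[0..NUM_CLASSES-1] holds the scores
+//   cnn_forward(image, probs, layers, 8);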