From 90207ab8347d911eb5f08deb05c5660e67589b0e Mon Sep 17 00:00:00 2001
From: vikshar
Date: Fri, 17 Jan 2025 00:26:17 -0600
Subject: [PATCH] edit readme

---
 README.md |  40 +++++--
 cnn-hls.c | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 339 insertions(+), 11 deletions(-)
 create mode 100644 cnn-hls.c

diff --git a/README.md b/README.md
index 59f3c16..5246f9d 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,33 @@
-# nn - implementation of neural networks in c
+# nn - Neural Networks in C
 
-implements neural networks in c, targets embedded systems (microcontrollers, fpgas)
+This repository implements various neural networks in C, with a focus on targeting embedded systems and building hardware accelerators (FPGA-based, ASIC, etc.). \
+This project was created as part of my independent study course, in which I am researching the design of hardware accelerators for high-performance workloads.
 
-#### current implementations
-`snn.c` - a simple feedforward neural network written in ~150loc. \
-`cnn.c` - TODO, implements a convolutional neural network \
-`cnn-hls.c` - TODO, has fpga hls specific types/pragmas in order to synthesize to verilog; run on an fpga \
+### current implementations (project index)
+`snn.c` - A simple feedforward neural network written in ~150 LOC. Depends on the C standard library and [GSL](https://www.gnu.org/software/gsl/) \
+`cnn.c` - A fully featured CNN library in ~600 LOC. Depends solely on the C standard library \
+`cnn-hls.c` - The version of `cnn.c` with HLS-specific optimizations (pragmas, systolic-array multiplication, etc.); meant to be synthesized with Vitis HLS into an FPGA-based CNN accelerator \
+`mnist.c` - Driver code for `cnn.c` that trains on the [MNIST](https://yann.lecun.com/exdb/mnist/) dataset
 
-depends on native c libraries and [gsl](https://www.gnu.org/software/gsl/)
+### usage
+`mnist.c` is a complete example of how the library is used, but basic usage boils down to a few simple steps:
 
-### future goals
-cnn w/ pragmas -> successfully compiled to verilog using vivado/vitus \
-self-made matrix multiplication library, relying only on native c ones \
-code cleanup and optimization
+1) Import `cnn.c` into your code
+2) Create a network and its layers:
+```c
+// an example of a LeNet-5-inspired 8-layer network
+Network* network = create_network(8);
+network->layers[0] = create_input(IMG_HEIGHT, IMG_WIDTH, 1);
+network->layers[1] = create_conv(IMG_HEIGHT, IMG_WIDTH, 1, 6, 5, 1, 2);
+network->layers[2] = create_maxpool(network->layers[1]->height, network->layers[1]->width, network->layers[1]->channels, 2, 2);
+network->layers[3] = create_conv(network->layers[2]->height, network->layers[2]->width, network->layers[2]->channels, 16, 5, 1, 0);
+network->layers[4] = create_maxpool(network->layers[3]->height, network->layers[3]->width, network->layers[3]->channels, 2, 2);
+network->layers[5] = create_fc(120, network->layers[4]->height * network->layers[4]->width * network->layers[4]->channels, a_sigmoid);
+network->layers[6] = create_fc(84, 120, a_sigmoid);
+network->layers[7] = create_fc(NUM_CLASSES, 84, a_softmax);
+```
+3) Run forward and backpropagation through the network (see the sketch below)
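+
+A minimal sketch of a training step; the `forward`, `backward`, and `update_weights` names here are illustrative assumptions, and `mnist.c` remains the authoritative driver code:
+```c
+// hypothetical entry-point names; the actual API lives in cnn.c
+for (int epoch = 0; epoch < num_epochs; epoch++) {
+    for (int i = 0; i < num_train_images; i++) {
+        forward(network, images[i]);            // propagate one image through every layer
+        backward(network, labels[i]);           // compute per-layer deltas from the output loss
+        update_weights(network, learning_rate); // apply the gradient step
+    }
+}
+```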
+
+## Project Overview and Explanation
+### Abstract
+For my project, I propose an implementation of a Convolutional Neural Network (CNN) based handwritten digit classifier for the MNIST dataset on a Field Programmable Gate Array (FPGA). I use the High-Level Synthesis (HLS) tool Vitis HLS, developed by [AMD/Xilinx](https://www.xilinx.com/products/boards-and-kits.html), to implement the accelerator in C, eliminating the need to write any code in an HDL such as Verilog or VHDL. To reduce performance losses, I implement a systolic-array-based architecture and apply techniques such as pipelining, loop unrolling, and memory partitioning. Through this project, I aim to highlight the potential of FPGAs to offer reduced power consumption and latency for machine learning tasks, creating a more sustainable computing environment.
diff --git a/cnn-hls.c b/cnn-hls.c
new file mode 100644
index 0000000..f0a46ba
--- /dev/null
+++ b/cnn-hls.c
@@ -0,0 +1,310 @@
+#include "ap_fixed.h"
+#include "hls_stream.h"
+#include "hls_math.h"
+#include <string.h> // for memcpy
+
+// Fixed-point definitions for better hardware efficiency
+typedef ap_fixed<16,8> data_t;   // 16 bits total, 8 integer bits
+typedef ap_fixed<16,8> weight_t;
+typedef ap_fixed<32,16> acc_t;   // wider accumulator to prevent overflow
+
+// Enums remain the same
+typedef enum {
+    input,
+    conv,
+    max_pool,
+    fully_connected
+} ltype;
+
+typedef enum {
+    fc_input,
+    fc_hidden,
+    fc_output,
+} fcpos;
+
+typedef enum {
+    a_sigmoid,
+    a_softmax,
+} activation;
+
+// Maximum size definitions for static arrays
+#define MAX_LAYER_SIZE 1024
+#define MAX_FILTER_SIZE 11
+#define MAX_CHANNELS 256
+#define MAX_FILTERS 256
+
+// Layer struct optimized for HLS
+struct Layer {
+    ltype type;
+    int height;
+    int width;
+    int channels;
+
+    union {
+        struct {
+            int num_filters;
+            int filter_size;
+            int stride;
+            int zero_padding;
+            int input_height;
+            int input_width;
+            int input_channels;
+            weight_t weights[MAX_FILTERS][MAX_CHANNELS][MAX_FILTER_SIZE][MAX_FILTER_SIZE];
+            data_t biases[MAX_FILTERS];
+        } conv_params;
+
+        struct {
+            int pool_size;
+            int stride;
+            int input_height;
+            int input_width;
+        } pool_params;
+
+        struct {
+            int output_size;
+            weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE];
+            data_t biases[MAX_LAYER_SIZE];
+            activation type;
+        } fc_params;
+    } params;
+
+    data_t output[MAX_LAYER_SIZE];
+    data_t delta[MAX_LAYER_SIZE];
+    data_t pre_activation[MAX_LAYER_SIZE];
+};
+
+// Helper functions
+data_t sigmoid(data_t x) {
+    #pragma HLS INLINE
+    return 1.0 / (1.0 + hls::exp(-x));
+}
+
+data_t relu(data_t x) {
+    #pragma HLS INLINE
+    return (x > 0) ? x : 0;
+}
+
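+// The next routine computes a matrix-vector product with a 1-D array of
+// accumulating processing elements (PEs): output[i] = sum over k of
+// weights[i][k] * input[k]. On each step k, the shared input[k] is broadcast
+// to the (partially unrolled) PE column, and every PE adds its own weight
+// term to its running sum in pe_array.
+// Tiny worked example with assumed values: W = [[1, 2], [3, 4]], x = [5, 6]
+// gives W*x = [1*5 + 2*6, 3*5 + 4*6] = [17, 39].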
+// Systolic array matrix multiplication for fully connected layers
+void systolic_matrix_multiply(
+    const weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE],
+    const data_t input[MAX_LAYER_SIZE],
+    acc_t output[MAX_LAYER_SIZE],
+    int M, int N) {
+
+    #pragma HLS ARRAY_PARTITION variable=weights cyclic factor=16 dim=2
+    #pragma HLS ARRAY_PARTITION variable=input cyclic factor=16
+
+    static acc_t pe_array[MAX_LAYER_SIZE];
+    #pragma HLS ARRAY_PARTITION variable=pe_array cyclic factor=16
+
+    // Initialize processing elements
+    for (int i = 0; i < M; i++) {
+        #pragma HLS UNROLL factor=16
+        pe_array[i] = 0;
+    }
+
+    // Systolic computation
+    for (int k = 0; k < N; k++) {
+        for (int i = 0; i < M; i++) {
+            #pragma HLS PIPELINE II=1
+            #pragma HLS UNROLL factor=16
+            pe_array[i] += weights[i][k] * input[k];
+        }
+    }
+
+    // Write results
+    for (int i = 0; i < M; i++) {
+        #pragma HLS UNROLL factor=16
+        output[i] = pe_array[i];
+    }
+}
+
+// Optimized convolution forward pass
+void conv_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
+    #pragma HLS INLINE off
+
+    const int padding = layer.params.conv_params.zero_padding;
+    const int stride = layer.params.conv_params.stride;
+    const int filter_size = layer.params.conv_params.filter_size;
+    const int num_filters = layer.params.conv_params.num_filters;
+    const int input_height = layer.params.conv_params.input_height;
+    const int input_width = layer.params.conv_params.input_width;
+    const int input_channels = layer.params.conv_params.input_channels;
+
+    const int padded_height = input_height + 2 * padding;
+    const int padded_width = input_width + 2 * padding;
+    const int output_height = (padded_height - filter_size) / stride + 1;
+    const int output_width = (padded_width - filter_size) / stride + 1;
+
+    // Main convolution loops
+    CONV_FILTERS: for (int f = 0; f < num_filters; f++) {
+        CONV_OUTPUT_H: for (int oh = 0; oh < output_height; oh++) {
+            CONV_OUTPUT_W: for (int ow = 0; ow < output_width; ow++) {
+                #pragma HLS PIPELINE II=1
+
+                acc_t sum = 0;
+
+                CONV_CHANNELS: for (int c = 0; c < input_channels; c++) {
+                    CONV_KERNEL_H: for (int fh = 0; fh < filter_size; fh++) {
+                        CONV_KERNEL_W: for (int fw = 0; fw < filter_size; fw++) {
+                            #pragma HLS UNROLL factor=3
+
+                            // ih/iw are coordinates in the zero-padded frame
+                            int ih = oh * stride + fh;
+                            int iw = ow * stride + fw;
+
+                            // Only accumulate taps that land on real pixels;
+                            // taps in the padding border contribute zero and
+                            // are skipped.
+                            if (ih >= padding && ih < input_height + padding &&
+                                iw >= padding && iw < input_width + padding) {
+                                sum += input[c * input_height * input_width + (ih - padding) * input_width + (iw - padding)] *
+                                       layer.params.conv_params.weights[f][c][fh][fw];
+                            }
+                        }
+                    }
+                }
+
+                sum += layer.params.conv_params.biases[f];
+                int output_idx = f * output_height * output_width + oh * output_width + ow;
+                layer.pre_activation[output_idx] = sum;
+                layer.output[output_idx] = relu(sum);
+            }
+        }
+    }
+}
+
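+// Spatial output size for the convolution above follows
+// out = (in + 2*padding - filter_size) / stride + 1.
+// Worked example (values taken from the LeNet-5-style network in the README):
+// a 28x28 MNIST input with a 5x5 filter, stride 1, and padding 2 gives
+// (28 + 2*2 - 5) / 1 + 1 = 28, so the first conv layer preserves spatial size.
+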
+// Optimized max pooling forward pass
+void maxpool_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
+    #pragma HLS INLINE off
+
+    const int pool_size = layer.params.pool_params.pool_size;
+    const int stride = layer.params.pool_params.stride;
+    const int input_height = layer.height;
+    const int input_width = layer.width;
+    const int input_channels = layer.channels;
+
+    const int output_height = (input_height - pool_size) / stride + 1;
+    const int output_width = (input_width - pool_size) / stride + 1;
+
+    POOL_CHANNELS: for (int c = 0; c < input_channels; c++) {
+        POOL_OUTPUT_H: for (int oh = 0; oh < output_height; oh++) {
+            POOL_OUTPUT_W: for (int ow = 0; ow < output_width; ow++) {
+                #pragma HLS PIPELINE II=1
+
+                // Seed the running maximum with the window's first element;
+                // -INFINITY is not representable in ap_fixed and would wrap.
+                data_t max_val = input[c * input_height * input_width + (oh * stride) * input_width + (ow * stride)];
+
+                POOL_WINDOW_H: for (int ph = 0; ph < pool_size; ph++) {
+                    POOL_WINDOW_W: for (int pw = 0; pw < pool_size; pw++) {
+                        #pragma HLS UNROLL
+
+                        int ih = oh * stride + ph;
+                        int iw = ow * stride + pw;
+                        data_t val = input[c * input_height * input_width + ih * input_width + iw];
+                        max_val = (val > max_val) ? val : max_val;
+                    }
+                }
+
+                layer.output[c * output_height * output_width + oh * output_width + ow] = max_val;
+            }
+        }
+    }
+}
+
+// Optimized fully connected forward pass using systolic array
+void fc_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
+    #pragma HLS INLINE off
+
+    const int output_size = layer.params.fc_params.output_size;
+    const int input_size = layer.height * layer.width * layer.channels;
+
+    // Use systolic array for matrix multiplication
+    acc_t temp_output[MAX_LAYER_SIZE];
+    systolic_matrix_multiply(layer.params.fc_params.weights, input, temp_output, output_size, input_size);
+
+    // Add biases and apply activation
+    FC_OUTPUT: for (int o = 0; o < output_size; o++) {
+        #pragma HLS PIPELINE II=1
+
+        acc_t sum = temp_output[o] + layer.params.fc_params.biases[o];
+
+        if (layer.params.fc_params.type == a_sigmoid) {
+            layer.pre_activation[o] = sum;
+            layer.output[o] = sigmoid(sum);
+        } else {
+            layer.output[o] = sum; // for softmax, store raw values
+        }
+    }
+
+    // Apply softmax if needed
+    if (layer.params.fc_params.type == a_softmax) {
+        acc_t max_val = layer.output[0];
+        acc_t sum = 0;
+
+        // Find max value for numerical stability
+        SOFTMAX_MAX: for (int i = 1; i < output_size; i++) {
+            #pragma HLS PIPELINE II=1
+            max_val = (layer.output[i] > max_val) ? layer.output[i] : max_val;
+        }
+
+        // Compute exponentials and sum
+        SOFTMAX_EXP: for (int i = 0; i < output_size; i++) {
+            #pragma HLS PIPELINE II=1
+            layer.output[i] = hls::exp(layer.output[i] - max_val);
+            sum += layer.output[i];
+        }
+
+        // Normalize
+        SOFTMAX_NORM: for (int i = 0; i < output_size; i++) {
+            #pragma HLS PIPELINE II=1
+            layer.output[i] /= sum;
+        }
+    }
+}
+
+// Top-level function for HLS synthesis
+void cnn_forward(
+    data_t input[MAX_LAYER_SIZE],
+    data_t output[MAX_LAYER_SIZE],
+    Layer layers[],
+    int num_layers) {
+
+    #pragma HLS INTERFACE m_axi port=input offset=slave bundle=gmem0
+    #pragma HLS INTERFACE m_axi port=output offset=slave bundle=gmem1
+    #pragma HLS INTERFACE m_axi port=layers offset=slave bundle=gmem2
+    #pragma HLS INTERFACE s_axilite port=num_layers bundle=control
+    #pragma HLS INTERFACE s_axilite port=return bundle=control
+
+    data_t layer_input[MAX_LAYER_SIZE];
+
+    // Copy input to local buffer
+    memcpy(layer_input, input, MAX_LAYER_SIZE * sizeof(data_t));
+
+    // Process each layer
+    LAYER_LOOP: for (int i = 0; i < num_layers; i++) {
+        #pragma HLS LOOP_TRIPCOUNT min=1 max=20
+
+        Layer& current_layer = layers[i];
+
+        switch (current_layer.type) {
+            case conv:
+                conv_forward(current_layer, layer_input);
+                break;
+            case max_pool:
+                maxpool_forward(current_layer, layer_input);
+                break;
+            case fully_connected:
+                fc_forward(current_layer, layer_input);
+                break;
+            default:
+                break;
+        }
+
+        // Copy this layer's output into the input buffer for the next layer
+        memcpy(layer_input, current_layer.output, MAX_LAYER_SIZE * sizeof(data_t));
+    }
+
+    // Copy final output
+    memcpy(output, layer_input, MAX_LAYER_SIZE * sizeof(data_t));
+}
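+
+// Illustrative host/testbench usage (an assumed sketch, not part of the
+// synthesized design): the Layer array is expected to be populated with
+// weights and shape metadata by the host before invoking the kernel.
+// All names and shapes below are hypothetical.
+//
+//   static Layer layers[8];          // configured to mirror the LeNet-5-style
+//                                    // network from the README
+//   data_t image[MAX_LAYER_SIZE];    // flattened 28x28 input, rest zeroed
+//   data_t probs[MAX_LAYER_SIZE];    // probs[0..NUM_CLASSES-1] holds the scores
+//   cnn_forward(image, probs, layers, 8);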