edit readme (commit 90207ab834, parent cc944d8cc3)

Changed files: README.md (40 lines changed), cnn-hls.c (new file, 310 lines)

README.md:
# nn - Neural Networks in C

This repository implements various neural networks in C, focusing mainly on targeting embedded systems and on creating hardware accelerators (FPGA-based, ASIC, etc.).

This project was created as part of my independent study course, in which I am researching the design of hardware accelerators for high-performance workloads.
### current implementations (project index)

`snn.c` - A simple feedforward neural network written in ~150 LOC. Depends on the C standard library and [GSL](https://www.gnu.org/software/gsl/).

`cnn.c` - A fully featured CNN library in ~600 LOC. Depends solely on the C standard library.

`cnn-hls.c` - The version of `cnn.c` with HLS-specific optimizations (pragmas, systolic array multiplication, etc.); aims to be synthesized through Vitis HLS to create an FPGA-based CNN accelerator.

`mnist.c` - Driver code for `cnn.c` which trains on the [MNIST](https://yann.lecun.com/exdb/mnist/) dataset.
### usage

`mnist.c` is a great example of how the library is used, but basic usage boils down to a few simple steps:

1) Import `cnn.c` into your code.

2) Create a network and its layers:
```c
// an example of a LeNet-5 inspired 8-layer network
// (inferred signatures: create_conv(h, w, in_ch, num_filters, filter_size,
//  stride, padding); create_maxpool(h, w, ch, pool_size, stride);
//  create_fc(out_size, in_size, activation))
Network* network = create_network(8);
network->layers[0] = create_input(IMG_HEIGHT, IMG_WIDTH, 1);
network->layers[1] = create_conv(IMG_HEIGHT, IMG_WIDTH, 1, 6, 5, 1, 2);
network->layers[2] = create_maxpool(network->layers[1]->height, network->layers[1]->width, network->layers[1]->channels, 2, 2);
network->layers[3] = create_conv(network->layers[2]->height, network->layers[2]->width, network->layers[2]->channels, 16, 5, 1, 0);
network->layers[4] = create_maxpool(network->layers[3]->height, network->layers[3]->width, network->layers[3]->channels, 2, 2);
network->layers[5] = create_fc(120, network->layers[4]->height * network->layers[4]->width * network->layers[4]->channels, a_sigmoid);
network->layers[6] = create_fc(84, 120, a_sigmoid);
network->layers[7] = create_fc(NUM_CLASSES, 84, a_softmax);
```
3) Run forward and backpropagation through the network!
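A minimal sketch of that training loop follows; `forward`, `backward`, and the learning-rate argument are hypothetical stand-ins rather than the actual `cnn.c` API, so treat `mnist.c` as the authoritative example:

```c
// Hypothetical sketch: forward()/backward() are placeholder names, not
// necessarily the functions cnn.c exports; see mnist.c for real usage.
for (int epoch = 0; epoch < NUM_EPOCHS; epoch++) {
    for (int i = 0; i < num_train_images; i++) {
        forward(network, train_images[i]);        // forward pass through all 8 layers
        backward(network, train_labels[i], 0.01); // backprop with learning rate 0.01
    }
}
```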
### future goals

- CNN w/ pragmas successfully compiled to Verilog using Vivado/Vitis
- A self-made matrix multiplication library, relying only on the C standard library
- Code cleanup and optimization
## Project Overview and Explanation
### Abstract

For my project, I propose an implementation of a Convolutional Neural Network based handwritten digit classifier, trained on the MNIST dataset and deployed on a Field Programmable Gate Array (FPGA). I utilize a High-Level Synthesis (HLS) tool called Vitis HLS, developed by [AMD/Xilinx](https://www.xilinx.com/products/boards-and-kits.html), to implement the accelerator in C, eliminating the need to write any code in HDLs such as Verilog/VHDL. To reduce performance losses, I implement a systolic-array-based architecture and utilize techniques such as pipelining, loop unrolling, and memory partitioning. Through this project, I aim to highlight the potential of FPGAs to offer reduced power consumption and latency for machine learning tasks, creating a more sustainable computing environment.
cnn-hls.c (new file, 310 lines):
```cpp
#include "ap_fixed.h"
#include "hls_stream.h"
#include "hls_math.h"
#include <string.h>

// Fixed-point definitions for better hardware efficiency
typedef ap_fixed<16,8> data_t;    // 16 bits total, 8 integer bits
typedef ap_fixed<16,8> weight_t;
typedef ap_fixed<32,16> acc_t;    // Wider accumulator to prevent overflow
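// (ap_fixed<16,8> covers roughly [-128, 128) in steps of 1/256: 8 integer
// bits including the sign plus 8 fractional bits. acc_t widens both fields
// to 16 bits each so that long dot products cannot overflow mid-sum.)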
// Enums remain the same as in cnn.c
typedef enum {
    input,
    conv,
    max_pool,
    fully_connected
} ltype;

typedef enum {
    fc_input,
    fc_hidden,
    fc_output,
} fcpos;

typedef enum {
    a_sigmoid,
    a_softmax,
} activation;

// Maximum size definitions for static arrays
#define MAX_LAYER_SIZE 1024
#define MAX_FILTER_SIZE 11
#define MAX_CHANNELS 256
#define MAX_FILTERS 256
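// (These static bounds exist because HLS cannot synthesize malloc/free:
// every array needs a compile-time size, so buffers are sized for the
// worst case rather than allocated per layer.)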
// Layer struct optimized for HLS
struct Layer {
    ltype type;
    int height;
    int width;
    int channels;

    union {
        struct {
            int num_filters;
            int filter_size;
            int stride;
            int zero_padding;
            int input_height;
            int input_width;
            int input_channels;
            weight_t weights[MAX_FILTERS][MAX_CHANNELS][MAX_FILTER_SIZE][MAX_FILTER_SIZE];
            data_t biases[MAX_FILTERS];
        } conv_params;

        struct {
            int pool_size;
            int stride;
            int input_height;
            int input_width;
        } pool_params;

        struct {
            int output_size;
            weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE];
            data_t biases[MAX_LAYER_SIZE];
            activation type;
        } fc_params;
    } params;

    data_t output[MAX_LAYER_SIZE];
    data_t delta[MAX_LAYER_SIZE];
    data_t pre_activation[MAX_LAYER_SIZE];
};
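// (Sizing note: at these maxima the fc weight array alone is 1024*1024
// 16-bit words (~2 MB) and the conv weight array ~15 MB, far beyond the
// on-chip BRAM of most mid-range FPGAs; a synthesizable build would shrink
// these bounds to the actual network dimensions or stream weights from DRAM.)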

// Helper functions
data_t sigmoid(data_t x) {
#pragma HLS INLINE
    return 1.0 / (1.0 + hls::exp(-x));
}

data_t relu(data_t x) {
#pragma HLS INLINE
    return (x > 0) ? x : 0;
}
// Systolic array matrix multiplication for fully connected layers
void systolic_matrix_multiply(
    const weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE],
    const data_t input[MAX_LAYER_SIZE],
    acc_t output[MAX_LAYER_SIZE],
    int M, int N) {

// (A function-level PIPELINE pragma was removed here: it would force full
// unrolling of the variable-bound loops below, which cannot be unrolled.)
#pragma HLS ARRAY_PARTITION variable=weights cyclic factor=16 dim=2
#pragma HLS ARRAY_PARTITION variable=input cyclic factor=16

    static acc_t pe_array[MAX_LAYER_SIZE];
#pragma HLS ARRAY_PARTITION variable=pe_array cyclic factor=16

    // Initialize processing elements
    for (int i = 0; i < M; i++) {
#pragma HLS UNROLL factor=16
        pe_array[i] = 0;
    }

    // Systolic computation
    for (int k = 0; k < N; k++) {
        for (int i = 0; i < M; i++) {
#pragma HLS PIPELINE II=1
#pragma HLS UNROLL factor=16
            pe_array[i] += weights[i][k] * input[k];
        }
    }

    // Write results
    for (int i = 0; i < M; i++) {
#pragma HLS UNROLL factor=16
        output[i] = pe_array[i];
    }
}
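// (How this maps to hardware: cyclic partitioning by 16 plus UNROLL
// factor=16 yields 16 parallel MAC units, each owning every 16th output
// row; each k-step broadcasts input[k] to all of them, the broadcast and
// accumulate half of a classic systolic dataflow.)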

// Optimized convolution forward pass
void conv_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
#pragma HLS INLINE off

    const int padding = layer.params.conv_params.zero_padding;
    const int stride = layer.params.conv_params.stride;
    const int filter_size = layer.params.conv_params.filter_size;
    const int num_filters = layer.params.conv_params.num_filters;
    const int input_height = layer.params.conv_params.input_height;
    const int input_width = layer.params.conv_params.input_width;
    const int input_channels = layer.params.conv_params.input_channels;

    const int padded_height = input_height + 2 * padding;
    const int padded_width = input_width + 2 * padding;
    const int output_height = (padded_height - filter_size) / stride + 1;
    const int output_width = (padded_width - filter_size) / stride + 1;

    // Main convolution loops
    CONV_FILTERS: for(int f = 0; f < num_filters; f++) {
        CONV_OUTPUT_H: for(int oh = 0; oh < output_height; oh++) {
            CONV_OUTPUT_W: for(int ow = 0; ow < output_width; ow++) {
#pragma HLS PIPELINE II=1

                acc_t sum = 0;

                CONV_CHANNELS: for(int c = 0; c < input_channels; c++) {
                    CONV_KERNEL_H: for(int fh = 0; fh < filter_size; fh++) {
                        CONV_KERNEL_W: for(int fw = 0; fw < filter_size; fw++) {
#pragma HLS UNROLL factor=3

                            // Coordinates in the (conceptually) padded input
                            int ih = oh * stride + fh;
                            int iw = ow * stride + fw;

                            // Zero padding is handled implicitly: taps that
                            // fall outside the real input contribute nothing
                            if (ih >= padding && ih < input_height + padding &&
                                iw >= padding && iw < input_width + padding) {
                                sum += input[c * input_height * input_width +
                                             (ih - padding) * input_width + (iw - padding)] *
                                       layer.params.conv_params.weights[f][c][fh][fw];
                            }
                        }
                    }
                }

                sum += layer.params.conv_params.biases[f];
                int output_idx = f * output_height * output_width + oh * output_width + ow;
                layer.pre_activation[output_idx] = sum;
                layer.output[output_idx] = relu(sum);
            }
        }
    }
}
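// (Dimension check against the README's LeNet-style network: a 28x28 MNIST
// input with a 5x5 filter, stride 1, padding 2 gives (28 + 2*2 - 5)/1 + 1 = 28,
// so the first conv layer preserves the spatial size.)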

// Optimized max pooling forward pass
void maxpool_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
#pragma HLS INLINE off

    const int pool_size = layer.params.pool_params.pool_size;
    const int stride = layer.params.pool_params.stride;
    const int input_height = layer.height;
    const int input_width = layer.width;
    const int input_channels = layer.channels;

    const int output_height = (input_height - pool_size) / stride + 1;
    const int output_width = (input_width - pool_size) / stride + 1;

    POOL_CHANNELS: for(int c = 0; c < input_channels; c++) {
        POOL_OUTPUT_H: for(int oh = 0; oh < output_height; oh++) {
            POOL_OUTPUT_W: for(int ow = 0; ow < output_width; ow++) {
#pragma HLS PIPELINE II=1

                // Most negative value representable by ap_fixed<16,8>;
                // -INFINITY is not meaningful for a fixed-point type
                data_t max_val = -128;

                POOL_WINDOW_H: for(int ph = 0; ph < pool_size; ph++) {
                    POOL_WINDOW_W: for(int pw = 0; pw < pool_size; pw++) {
#pragma HLS UNROLL

                        int ih = oh * stride + ph;
                        int iw = ow * stride + pw;
                        data_t val = input[c * input_height * input_width + ih * input_width + iw];
                        max_val = (val > max_val) ? val : max_val;
                    }
                }

                layer.output[c * output_height * output_width + oh * output_width + ow] = max_val;
            }
        }
    }
}
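// (Dimension check: 2x2 pooling with stride 2 over a 28x28 map gives
// (28 - 2)/2 + 1 = 14, halving each spatial dimension as expected.)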

// Optimized fully connected forward pass using systolic array
void fc_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
#pragma HLS INLINE off

    const int output_size = layer.params.fc_params.output_size;
    const int input_size = layer.height * layer.width * layer.channels;

    // Use systolic array for matrix multiplication
    acc_t temp_output[MAX_LAYER_SIZE];
    systolic_matrix_multiply(layer.params.fc_params.weights, input, temp_output, output_size, input_size);

    // Add biases and apply activation
    FC_OUTPUT: for(int o = 0; o < output_size; o++) {
#pragma HLS PIPELINE II=1

        acc_t sum = temp_output[o] + layer.params.fc_params.biases[o];

        if(layer.params.fc_params.type == a_sigmoid) {
            layer.pre_activation[o] = sum;
            layer.output[o] = sigmoid(sum);
        } else {
            layer.output[o] = sum; // For softmax, store raw values
        }
    }

    // Apply softmax if needed
    if(layer.params.fc_params.type == a_softmax) {
        acc_t max_val = layer.output[0];
        acc_t sum = 0;

        // Find max value for numerical stability
        SOFTMAX_MAX: for(int i = 1; i < output_size; i++) {
#pragma HLS PIPELINE II=1
            max_val = (layer.output[i] > max_val) ? layer.output[i] : max_val;
        }

        // Compute exponentials and sum
        SOFTMAX_EXP: for(int i = 0; i < output_size; i++) {
#pragma HLS PIPELINE II=1
            layer.output[i] = hls::exp(layer.output[i] - max_val);
            sum += layer.output[i];
        }

        // Normalize
        SOFTMAX_NORM: for(int i = 0; i < output_size; i++) {
#pragma HLS PIPELINE II=1
            layer.output[i] /= sum;
        }
    }
}
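// (Subtracting max_val relies on the identity softmax(x) = softmax(x - c)
// for any constant c; it keeps every exp() argument <= 0, which matters for
// fixed-point accumulators with only 16 integer bits of headroom.)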

// Top-level function for HLS synthesis
void cnn_forward(
    data_t input[MAX_LAYER_SIZE],
    data_t output[MAX_LAYER_SIZE],
    Layer layers[],
    int num_layers) {

#pragma HLS INTERFACE m_axi port=input offset=slave bundle=gmem0
#pragma HLS INTERFACE m_axi port=output offset=slave bundle=gmem1
#pragma HLS INTERFACE m_axi port=layers offset=slave bundle=gmem2
#pragma HLS INTERFACE s_axilite port=num_layers bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control

    data_t layer_input[MAX_LAYER_SIZE];

    // Copy input to local buffer
    memcpy(layer_input, input, MAX_LAYER_SIZE * sizeof(data_t));

    // Process each layer
    LAYER_LOOP: for(int i = 0; i < num_layers; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=20

        Layer& current_layer = layers[i];

        switch(current_layer.type) {
            case conv:
                conv_forward(current_layer, layer_input);
                break;
            case max_pool:
                maxpool_forward(current_layer, layer_input);
                break;
            case fully_connected:
                fc_forward(current_layer, layer_input);
                break;
            default:
                break;
        }

        // Copy output to input buffer for next layer
        memcpy(layer_input, current_layer.output, MAX_LAYER_SIZE * sizeof(data_t));
    }

    // Copy final output
    memcpy(output, layer_input, MAX_LAYER_SIZE * sizeof(data_t));
}
```