// nn/cnn-hls.c
#include "ap_fixed.h"
#include "hls_stream.h"
#include "hls_math.h"
#include <string.h>
// Fixed point definitions for better hardware efficiency
typedef ap_fixed<16,8> data_t; // 16 bits total, 8 integer bits
typedef ap_fixed<16,8> weight_t;
typedef ap_fixed<32,16> acc_t; // Wider accumulator to prevent overflow
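// With ap_fixed<16,8>, values span [-128, 127.99609375] in steps of
// 2^-8 = 0.00390625; e.g. 0.1 is stored as 25/256 = 0.09765625 under the
// default AP_TRN truncation. The 32-bit accumulator doubles both the
// integer and fractional widths so long MAC chains neither wrap nor lose
// low-order bits mid-sum.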
// Layer and activation type enums
typedef enum {
    input,
    conv,
    max_pool,
    fully_connected
} ltype;

typedef enum {
    fc_input,
    fc_hidden,
    fc_output
} fcpos;

typedef enum {
    a_sigmoid,
    a_softmax
} activation;
// Maximum size definitions for static arrays
#define MAX_LAYER_SIZE 1024
#define MAX_FILTER_SIZE 11
#define MAX_CHANNELS 256
#define MAX_FILTERS 256
// Layer struct optimized for HLS
struct Layer {
    ltype type;
    int height;
    int width;
    int channels;
    union {
        struct {
            int num_filters;
            int filter_size;
            int stride;
            int zero_padding;
            int input_height;
            int input_width;
            int input_channels;
            weight_t weights[MAX_FILTERS][MAX_CHANNELS][MAX_FILTER_SIZE][MAX_FILTER_SIZE];
            data_t biases[MAX_FILTERS];
        } conv_params;
        struct {
            int pool_size;
            int stride;
            int input_height;
            int input_width;
        } pool_params;
        struct {
            int output_size;
            weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE];
            data_t biases[MAX_LAYER_SIZE];
            activation type;
        } fc_params;
    } params;
    data_t output[MAX_LAYER_SIZE];
    data_t delta[MAX_LAYER_SIZE];
    data_t pre_activation[MAX_LAYER_SIZE];
};
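// Note: the union keeps each Layer at the size of its largest member, but
// that member is enormous (conv weights: 256*256*11*11*2 B ~ 15 MiB; fc
// weights: 1024*1024*2 B = 2 MiB), far beyond on-chip BRAM. The layers[]
// array therefore has to stay in external memory, which is why the top-level
// function maps it to an m_axi interface below.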
// Helper functions
data_t sigmoid(data_t x) {
    #pragma HLS INLINE
    return 1.0 / (1.0 + hls::exp(-x));
}

data_t relu(data_t x) {
    #pragma HLS INLINE
    return (x > 0) ? x : 0;
}
// Systolic-style matrix-vector multiply for the fully connected layers.
// This is a bank of 16 parallel MAC processing elements (PEs) rather than a
// full 2-D systolic array: each PE owns one output row, and input[k] is
// broadcast to all PEs each step. No function-level PIPELINE here: that
// would require fully unrolling the variable-bound loops below.
void systolic_matrix_multiply(
    const weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE],
    const data_t input[MAX_LAYER_SIZE],
    acc_t output[MAX_LAYER_SIZE],
    int M, int N) {
    // Partition along dim 1 so the 16 unrolled PEs can read
    // weights[i..i+15][k] in the same cycle.
    #pragma HLS ARRAY_PARTITION variable=weights cyclic factor=16 dim=1
    #pragma HLS ARRAY_PARTITION variable=input cyclic factor=16
    static acc_t pe_array[MAX_LAYER_SIZE];
    #pragma HLS ARRAY_PARTITION variable=pe_array cyclic factor=16

    // Initialize processing elements
    INIT_PE: for (int i = 0; i < M; i++) {
        #pragma HLS UNROLL factor=16
        pe_array[i] = 0;
    }
    // Systolic computation: each outer step feeds one input element to all PEs
    MAC_K: for (int k = 0; k < N; k++) {
        MAC_PE: for (int i = 0; i < M; i++) {
            #pragma HLS PIPELINE II=1
            #pragma HLS UNROLL factor=16
            pe_array[i] += weights[i][k] * input[k];
        }
    }
    // Write results
    WRITE_OUT: for (int i = 0; i < M; i++) {
        #pragma HLS UNROLL factor=16
        output[i] = pe_array[i];
    }
}
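// Rough throughput, assuming the 16-wide unroll achieves II=1: the MAC loop
// nest takes about N * ceil(M/16) cycles plus pipeline fill, e.g. ~256
// cycles for M=10 outputs over N=256 inputs, versus M*N = 2560 sequential
// multiply-accumulates.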
// Optimized convolution forward pass
void conv_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
    #pragma HLS INLINE off
    const int padding = layer.params.conv_params.zero_padding;
    const int stride = layer.params.conv_params.stride;
    const int filter_size = layer.params.conv_params.filter_size;
    const int num_filters = layer.params.conv_params.num_filters;
    const int input_height = layer.params.conv_params.input_height;
    const int input_width = layer.params.conv_params.input_width;
    const int input_channels = layer.params.conv_params.input_channels;

    const int padded_height = input_height + 2 * padding;
    const int padded_width = input_width + 2 * padding;
    const int output_height = (padded_height - filter_size) / stride + 1;
    const int output_width = (padded_width - filter_size) / stride + 1;

    // Main convolution loops. Zero padding is handled implicitly by the
    // bounds check (out-of-range taps contribute nothing to the sum), so no
    // padded copy of the input needs to be materialized.
    CONV_FILTERS: for (int f = 0; f < num_filters; f++) {
        CONV_OUTPUT_H: for (int oh = 0; oh < output_height; oh++) {
            CONV_OUTPUT_W: for (int ow = 0; ow < output_width; ow++) {
                #pragma HLS PIPELINE II=1
                acc_t sum = 0;
                CONV_CHANNELS: for (int c = 0; c < input_channels; c++) {
                    CONV_KERNEL_H: for (int fh = 0; fh < filter_size; fh++) {
                        CONV_KERNEL_W: for (int fw = 0; fw < filter_size; fw++) {
                            #pragma HLS UNROLL factor=3
                            // Tap position in the unpadded input
                            int ih = oh * stride + fh - padding;
                            int iw = ow * stride + fw - padding;
                            if (ih >= 0 && ih < input_height && iw >= 0 && iw < input_width) {
                                sum += input[c * input_height * input_width + ih * input_width + iw] *
                                       layer.params.conv_params.weights[f][c][fh][fw];
                            }
                        }
                    }
                }
                sum += layer.params.conv_params.biases[f];
                int output_idx = f * output_height * output_width + oh * output_width + ow;
                layer.pre_activation[output_idx] = sum;
                layer.output[output_idx] = relu(sum);
            }
        }
    }
}
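// Dimension check: a 28x28 input with a 5x5 filter, stride 1 and no padding
// gives (28 + 2*0 - 5)/1 + 1 = 24, i.e. a 24x24 map per filter; padding 2
// preserves the spatial size, (28 + 4 - 5)/1 + 1 = 28.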
// Optimized max pooling forward pass
void maxpool_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
    #pragma HLS INLINE off
    const int pool_size = layer.params.pool_params.pool_size;
    const int stride = layer.params.pool_params.stride;
    const int input_height = layer.height;
    const int input_width = layer.width;
    const int input_channels = layer.channels;
    const int output_height = (input_height - pool_size) / stride + 1;
    const int output_width = (input_width - pool_size) / stride + 1;

    POOL_CHANNELS: for (int c = 0; c < input_channels; c++) {
        POOL_OUTPUT_H: for (int oh = 0; oh < output_height; oh++) {
            POOL_OUTPUT_W: for (int ow = 0; ow < output_width; ow++) {
                #pragma HLS PIPELINE II=1
                // Seed with the most negative representable data_t;
                // float -INFINITY has no meaning for a fixed-point type.
                data_t max_val = data_t(-128);
                POOL_WINDOW_H: for (int ph = 0; ph < pool_size; ph++) {
                    POOL_WINDOW_W: for (int pw = 0; pw < pool_size; pw++) {
                        #pragma HLS UNROLL
                        int ih = oh * stride + ph;
                        int iw = ow * stride + pw;
                        data_t val = input[c * input_height * input_width + ih * input_width + iw];
                        max_val = (val > max_val) ? val : max_val;
                    }
                }
                layer.output[c * output_height * output_width + oh * output_width + ow] = max_val;
            }
        }
    }
}
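// e.g. 2x2 pooling with stride 2 over a 24x24 map yields (24 - 2)/2 + 1 = 12,
// halving each spatial dimension.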
// Optimized fully connected forward pass using the systolic-style PE bank
void fc_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
    #pragma HLS INLINE off
    const int output_size = layer.params.fc_params.output_size;
    const int input_size = layer.height * layer.width * layer.channels;

    // Matrix-vector product over the flattened input
    acc_t temp_output[MAX_LAYER_SIZE];
    systolic_matrix_multiply(layer.params.fc_params.weights, input, temp_output,
                             output_size, input_size);

    // Add biases and apply the activation
    FC_OUTPUT: for (int o = 0; o < output_size; o++) {
        #pragma HLS PIPELINE II=1
        acc_t sum = temp_output[o] + layer.params.fc_params.biases[o];
        layer.pre_activation[o] = sum;
        if (layer.params.fc_params.type == a_sigmoid) {
            layer.output[o] = sigmoid(sum);
        } else {
            layer.output[o] = sum; // softmax: keep the raw logits for now
        }
    }

    // Apply softmax in place if requested
    if (layer.params.fc_params.type == a_softmax) {
        acc_t max_val = layer.output[0];
        acc_t sum = 0;
        // Find the max logit; subtracting it keeps hls::exp in range
        // (numerical stability)
        SOFTMAX_MAX: for (int i = 1; i < output_size; i++) {
            #pragma HLS PIPELINE II=1
            acc_t v = layer.output[i];
            max_val = (v > max_val) ? v : max_val;
        }
        // Compute exponentials and their sum
        SOFTMAX_EXP: for (int i = 0; i < output_size; i++) {
            #pragma HLS PIPELINE II=1
            layer.output[i] = hls::exp(layer.output[i] - max_val);
            sum += layer.output[i];
        }
        // Normalize to a probability distribution
        SOFTMAX_NORM: for (int i = 0; i < output_size; i++) {
            #pragma HLS PIPELINE II=1
            layer.output[i] /= sum;
        }
    }
}
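// Worked example: logits {2, 1, 0} shift to {0, -1, -2}, exp gives
// {1, 0.368, 0.135}, sum ~ 1.503, so the outputs are ~{0.665, 0.245, 0.090}.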
// Top-level function for HLS synthesis
void cnn_forward(
    data_t input[MAX_LAYER_SIZE],
    data_t output[MAX_LAYER_SIZE],
    Layer layers[],
    int num_layers) {
    #pragma HLS INTERFACE m_axi port=input offset=slave bundle=gmem0
    #pragma HLS INTERFACE m_axi port=output offset=slave bundle=gmem1
    #pragma HLS INTERFACE m_axi port=layers offset=slave bundle=gmem2
    #pragma HLS INTERFACE s_axilite port=num_layers bundle=control
    #pragma HLS INTERFACE s_axilite port=return bundle=control

    data_t layer_input[MAX_LAYER_SIZE];
    // Copy the input into a local buffer (inferred as an AXI burst)
    memcpy(layer_input, input, MAX_LAYER_SIZE * sizeof(data_t));

    // Process the layers in sequence, reusing layer_input as the carry buffer
    LAYER_LOOP: for (int i = 0; i < num_layers; i++) {
        #pragma HLS LOOP_TRIPCOUNT min=1 max=20
        Layer& current_layer = layers[i];
        switch (current_layer.type) {
            case conv:
                conv_forward(current_layer, layer_input);
                break;
            case max_pool:
                maxpool_forward(current_layer, layer_input);
                break;
            case fully_connected:
                fc_forward(current_layer, layer_input);
                break;
            default:
                // 'input' (and unrecognized) layers compute nothing; skip
                // the copy below so layer_input passes through unchanged.
                continue;
        }
        // This layer's output becomes the next layer's input
        memcpy(layer_input, current_layer.output, MAX_LAYER_SIZE * sizeof(data_t));
    }
    // Copy the final activations back out
    memcpy(output, layer_input, MAX_LAYER_SIZE * sizeof(data_t));
}
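
// --- C-simulation sketch (hypothetical; not part of the synthesized kernel).
// A minimal driver assuming a single softmax fully connected layer; the
// sizes, weight values, the CNN_HLS_CSIM guard, and the expected output are
// illustrative only.
#ifdef CNN_HLS_CSIM
#include <cstdio>

int main() {
    // static: Layer holds multi-MiB weight arrays, far too large for the stack
    static Layer layers[1];
    static data_t input[MAX_LAYER_SIZE];
    static data_t output[MAX_LAYER_SIZE];

    // One fully connected layer: 4 inputs (1x1x4), 3 outputs, softmax
    layers[0].type = fully_connected;
    layers[0].height = 1;
    layers[0].width = 1;
    layers[0].channels = 4;
    layers[0].params.fc_params.output_size = 3;
    layers[0].params.fc_params.type = a_softmax;
    for (int o = 0; o < 3; o++) {
        layers[0].params.fc_params.biases[o] = 0;
        for (int i = 0; i < 4; i++)
            layers[0].params.fc_params.weights[o][i] = weight_t(0.25);
    }
    for (int i = 0; i < 4; i++)
        input[i] = data_t(1.0);

    cnn_forward(input, output, layers, 1);

    // All logits are identical (4 * 0.25 * 1.0 = 1.0 each), so the softmax
    // should be uniform: expect roughly 0.333 per class.
    for (int o = 0; o < 3; o++)
        printf("out[%d] = %f\n", o, output[o].to_double());
    return 0;
}
#endif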