#include "ap_fixed.h"
#include "hls_stream.h"
#include "hls_math.h"
#include <string.h>

// Fixed point definitions for better hardware efficiency
typedef ap_fixed<16,8> data_t;  // 16 bits total, 8 integer bits
typedef ap_fixed<16,8> weight_t;
typedef ap_fixed<32,16> acc_t;  // Wider accumulator to prevent overflow

// Enums remain the same
typedef enum {
	input,
	conv,
	max_pool,
	fully_connected
} ltype;

typedef enum {
	fc_input,
	fc_hidden,
	fc_output,
} fcpos;

typedef enum {
	a_sigmoid,
	a_softmax,
} activation;

// Maximum size definitions for static arrays
#define MAX_LAYER_SIZE 1024
#define MAX_FILTER_SIZE 11
#define MAX_CHANNELS 256
#define MAX_FILTERS 256

// Layer struct optimized for HLS
struct Layer {
	ltype type;
	int height;
	int width;
	int channels;

	union {
		struct {
			int num_filters;
			int filter_size;
			int stride;
			int zero_padding;
			int input_height;
			int input_width;
			int input_channels;
			weight_t weights[MAX_FILTERS][MAX_CHANNELS][MAX_FILTER_SIZE][MAX_FILTER_SIZE];
			data_t biases[MAX_FILTERS];
		} conv_params;

		struct {
			int pool_size;
			int stride;
			int input_height;
			int input_width;
		} pool_params;

		struct {
			int output_size;
			weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE];
			data_t biases[MAX_LAYER_SIZE];
			activation type;
		} fc_params;
	} params;

	data_t output[MAX_LAYER_SIZE];
	data_t delta[MAX_LAYER_SIZE];
	data_t pre_activation[MAX_LAYER_SIZE];
};

// Helper functions
data_t sigmoid(data_t x) {
	#pragma HLS INLINE
	return 1.0 / (1.0 + hls::exp(-x));
}

data_t relu(data_t x) {
	#pragma HLS INLINE
	return (x > 0) ? x : 0;
}

// Systolic array matrix multiplication for fully connected layers
void systolic_matrix_multiply(
	const weight_t weights[MAX_LAYER_SIZE][MAX_LAYER_SIZE],
	const data_t input[MAX_LAYER_SIZE],
	acc_t output[MAX_LAYER_SIZE],
	int M, int N) {

	#pragma HLS PIPELINE II=1
	#pragma HLS ARRAY_PARTITION variable=weights cyclic factor=16 dim=2
	#pragma HLS ARRAY_PARTITION variable=input cyclic factor=16

	static acc_t pe_array[MAX_LAYER_SIZE];
	#pragma HLS ARRAY_PARTITION variable=pe_array cyclic factor=16

	// Initialize processing elements
	for (int i = 0; i < M; i++) {
		#pragma HLS UNROLL factor=16
		pe_array[i] = 0;
	}

	// Systolic computation
	for (int k = 0; k < N; k++) {
		for (int i = 0; i < M; i++) {
			#pragma HLS PIPELINE II=1
			#pragma HLS UNROLL factor=16
			pe_array[i] += weights[i][k] * input[k];
		}
	}

	// Write results
	for (int i = 0; i < M; i++) {
		#pragma HLS UNROLL factor=16
		output[i] = pe_array[i];
	}
}

// Optimized convolution forward pass
void conv_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
	#pragma HLS INLINE off

	const int padding = layer.params.conv_params.zero_padding;
	const int stride = layer.params.conv_params.stride;
	const int filter_size = layer.params.conv_params.filter_size;
	const int num_filters = layer.params.conv_params.num_filters;
	const int input_height = layer.params.conv_params.input_height;
	const int input_width = layer.params.conv_params.input_width;
	const int input_channels = layer.params.conv_params.input_channels;

	// Create padded input buffer
	data_t padded_input[MAX_CHANNELS][MAX_FILTER_SIZE][MAX_FILTER_SIZE];
	#pragma HLS ARRAY_PARTITION variable=padded_input complete dim=1

	const int padded_height = input_height + 2 * padding;
	const int padded_width = input_width + 2 * padding;
	const int output_height = (padded_height - filter_size) / stride + 1;
	const int output_width = (padded_width - filter_size) / stride + 1;

	// Main convolution loops
CONV_FILTERS: for(int f = 0; f < num_filters; f++) {
	CONV_OUTPUT_H: for(int oh = 0; oh < output_height; oh++) {
		CONV_OUTPUT_W: for(int ow = 0; ow < output_width; ow++) {
				#pragma HLS PIPELINE II=1

				acc_t sum = 0;

			CONV_CHANNELS: for(int c = 0; c < input_channels; c++) {
				CONV_KERNEL_H: for(int fh = 0; fh < filter_size; fh++) {
					CONV_KERNEL_W: for(int fw = 0; fw < filter_size; fw++) {
							#pragma HLS UNROLL factor=3

							int ih = oh * stride + fh;
							int iw = ow * stride + fw;

							if (ih >= 0 && ih < padded_height && iw >= 0 && iw < padded_width) {
								sum += input[c * input_height * input_width + (ih-padding) * input_width + (iw-padding)] * 
									layer.params.conv_params.weights[f][c][fh][fw];
							}
						}
					}
				}

				sum += layer.params.conv_params.biases[f];
				int output_idx = f * output_height * output_width + oh * output_width + ow;
				layer.pre_activation[output_idx] = sum;
				layer.output[output_idx] = relu(sum);
			}
		}
	}
}

// Optimized max pooling forward pass
void maxpool_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
	#pragma HLS INLINE off

	const int pool_size = layer.params.pool_params.pool_size;
	const int stride = layer.params.pool_params.stride;
	const int input_height = layer.height;
	const int input_width = layer.width;
	const int input_channels = layer.channels;

	const int output_height = (input_height - pool_size) / stride + 1;
	const int output_width = (input_width - pool_size) / stride + 1;

POOL_CHANNELS: for(int c = 0; c < input_channels; c++) {
	POOL_OUTPUT_H: for(int oh = 0; oh < output_height; oh++) {
		POOL_OUTPUT_W: for(int ow = 0; ow < output_width; ow++) {
				#pragma HLS PIPELINE II=1

				data_t max_val = -INFINITY;

			POOL_WINDOW_H: for(int ph = 0; ph < pool_size; ph++) {
				POOL_WINDOW_W: for(int pw = 0; pw < pool_size; pw++) {
						#pragma HLS UNROLL

						int ih = oh * stride + ph;
						int iw = ow * stride + pw;
						data_t val = input[c * input_height * input_width + ih * input_width + iw];
						max_val = (val > max_val) ? val : max_val;
					}
				}

				layer.output[c * output_height * output_width + oh * output_width + ow] = max_val;
			}
		}
	}
}

// Optimized fully connected forward pass using systolic array
void fc_forward(Layer& layer, const data_t input[MAX_LAYER_SIZE]) {
	#pragma HLS INLINE off

	const int output_size = layer.params.fc_params.output_size;
	const int input_size = layer.height * layer.width * layer.channels;

	// Use systolic array for matrix multiplication
	acc_t temp_output[MAX_LAYER_SIZE];
	systolic_matrix_multiply(layer.params.fc_params.weights, input, temp_output, output_size, input_size);

	// Add biases and apply activation
FC_OUTPUT: for(int o = 0; o < output_size; o++) {
		#pragma HLS PIPELINE II=1

		acc_t sum = temp_output[o] + layer.params.fc_params.biases[o];

		if(layer.params.fc_params.type == a_sigmoid) {
			layer.pre_activation[o] = sum;
			layer.output[o] = sigmoid(sum);
		} else {
			layer.output[o] = sum; // For softmax, store raw values
		}
	}

	// Apply softmax if needed
	if(layer.params.fc_params.type == a_softmax) {
		acc_t max_val = layer.output[0];
		acc_t sum = 0;

		// Find max value for numerical stability
	SOFTMAX_MAX: for(int i = 1; i < output_size; i++) {
			#pragma HLS PIPELINE II=1
			max_val = (layer.output[i] > max_val) ? layer.output[i] : max_val;
		}

		// Compute exponentials and sum
	SOFTMAX_EXP: for(int i = 0; i < output_size; i++) {
			#pragma HLS PIPELINE II=1
			layer.output[i] = hls::exp(layer.output[i] - max_val);
			sum += layer.output[i];
		}

		// Normalize
	SOFTMAX_NORM: for(int i = 0; i < output_size; i++) {
			#pragma HLS PIPELINE II=1
			layer.output[i] /= sum;
		}
	}
}

// Top-level function for HLS synthesis
void cnn_forward(
	data_t input[MAX_LAYER_SIZE],
	data_t output[MAX_LAYER_SIZE],
	Layer layers[],
	int num_layers) {

	#pragma HLS INTERFACE m_axi port=input offset=slave bundle=gmem0
	#pragma HLS INTERFACE m_axi port=output offset=slave bundle=gmem1
	#pragma HLS INTERFACE m_axi port=layers offset=slave bundle=gmem2
	#pragma HLS INTERFACE s_axilite port=num_layers bundle=control
	#pragma HLS INTERFACE s_axilite port=return bundle=control

	data_t layer_input[MAX_LAYER_SIZE];
	data_t layer_output[MAX_LAYER_SIZE];

	// Copy input to local buffer
	memcpy(layer_input, input, MAX_LAYER_SIZE * sizeof(data_t));

	// Process each layer
LAYER_LOOP: for(int i = 0; i < num_layers; i++) {
		#pragma HLS LOOP_TRIPCOUNT min=1 max=20

		Layer& current_layer = layers[i];

		switch(current_layer.type) {
			case conv:
				conv_forward(current_layer, layer_input);
				break;
			case max_pool:
				maxpool_forward(current_layer, layer_input);
				break;
			case fully_connected:
				fc_forward(current_layer, layer_input);
				break;
			default:
				break;
		}

		// Copy output to input buffer for next layer
		memcpy(layer_input, current_layer.output, MAX_LAYER_SIZE * sizeof(data_t));
	}

	// Copy final output
	memcpy(output, layer_input, MAX_LAYER_SIZE * sizeof(data_t));
}