diff --git a/cnn.c b/cnn.c
index 57f3930..8e1b643 100644
--- a/cnn.c
+++ b/cnn.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <math.h>
+#include <string.h>
 
 typedef enum {
     input,
@@ -28,13 +29,16 @@ typedef struct {
     int height;
     int width;
     int channels; // in this case, "channels" are the number of filters that are coming in
-
+
     union {
         struct {
             int num_filters;
             int filter_size; // single integer b/c filter will usually be square shaped
             int stride;
             int zero_padding; // single integer for how many layers of zero padding
+            int input_height;
+            int input_width;
+            int input_channels;
             float (*weights);
             float (*biases);
         } conv_params;
@@ -42,6 +46,8 @@ typedef struct {
         struct {
             int pool_size; // single integer again
             int stride;
+            int input_height;
+            int input_width;
         } pool_params;
 
         struct {
@@ -62,6 +68,13 @@ typedef struct {
     int num_layers;
 } Network;
 
+Network* create_network(int capacity) {
+    Network* network = (Network*)malloc(sizeof(Network));
+    network->layers = (Layer**)calloc(capacity, sizeof(Layer*)); // calloc, so unfilled slots are NULL for destroy_network
+    network->num_layers = capacity;
+    return network;
+}
+
 float he_init(int fan_in) {
     float scale = sqrt(2.0f / fan_in);
     float random = (float)rand() / RAND_MAX * 2 - 1;
@@ -92,20 +105,20 @@ float sigmoid_g(float x) {
 }
 
 void softmax(float* input, float* output, int size) {
-    float max = input[0];
-    for(int i = 1; i < size; i++) {
-        if(input[i] > max) {
-            max = input[i];
-        }
-    }
-    float sum = 0;
-    for(int i = 0; i < size; i++) {
-        output[i] = exp(input[i] - max);
-        sum += output[i];
-    }
-    for(int i = 0; i < size; i++) {
-        output[i] /= sum;
-    }
+    float max = input[0];
+    for(int i = 1; i < size; i++) {
+        if(input[i] > max) {
+            max = input[i];
+        }
+    }
+    float sum = 0;
+    for(int i = 0; i < size; i++) {
+        output[i] = exp(input[i] - max);
+        sum += output[i];
+    }
+    for(int i = 0; i < size; i++) {
+        output[i] /= sum;
+    }
 }
 
 Layer* create_input(int height, int width, int channels) {
@@ -125,6 +138,9 @@ Layer* create_conv(int input_height, int input_width, int input_channels, int nu
     layer->params.conv_params.filter_size = filter_size;
     layer->params.conv_params.stride = stride;
     layer->params.conv_params.zero_padding = padding;
+    layer->params.conv_params.input_height = input_height;
+    layer->params.conv_params.input_width = input_width;
+    layer->params.conv_params.input_channels = input_channels;
 
     // output dimensions
     // https://cs231n.github.io/convolutional-networks/
@@ -133,6 +149,7 @@ Layer* create_conv(int input_height, int input_width, int input_channels, int nu
     layer->height = output_h;
     layer->width = output_w;
     layer->channels = num_filters;
+    layer->activation_g = relu_g;
 
     // conv layer uses relu, use HE init
     int weights_size = num_filters * input_channels * filter_size * filter_size;
@@ -145,7 +162,8 @@ Layer* create_conv(int input_height, int input_width, int input_channels, int nu
     layer->params.conv_params.biases = (float*)calloc(num_filters, sizeof(float));
 
     layer->output = (float*) calloc(output_h * output_w * num_filters, sizeof(float));
-    layer->delta = (float*) calloc(output_h * output_w * num_filters, sizeof(float));
+    layer->delta = (float*) calloc(output_h * output_w * num_filters, sizeof(float));
+    layer->pre_activation = (float*)calloc(output_h * output_w * num_filters, sizeof(float));
 
     return layer;
 }
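For reference, both constructors size their output with the standard formula from the cs231n notes, out = (in + 2 * padding - filter_size) / stride + 1. A standalone sketch (conv_out is illustrative, not part of cnn.c), traced through the LeNet-5-style network that mnist.c builds below:

    // conv1: (28 + 2*2 - 5) / 1 + 1 = 28    pool1: (28 - 2) / 2 + 1 = 14
    // conv2: (14 + 2*0 - 5) / 1 + 1 = 10    pool2: (10 - 2) / 2 + 1 = 5
    // so the first fully connected layer sees 5 * 5 * 16 = 400 inputs
    int conv_out(int in, int filter_size, int stride, int padding) {
        return (in + 2 * padding - filter_size) / stride + 1;
    }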
@@ -155,6 +173,9 @@ Layer* create_maxpool(int input_height, int input_width, int input_channels, int
     layer->type = max_pool;
     layer->params.pool_params.pool_size = pool_size;
     layer->params.pool_params.stride = stride;
+    layer->params.pool_params.input_height = input_height;
+    layer->params.pool_params.input_width = input_width;
+
     // output dimensions
     // https://cs231n.github.io/convolutional-networks/
@@ -165,7 +186,7 @@ Layer* create_maxpool(int input_height, int input_width, int input_channels, int
     layer->channels = input_channels;
 
     layer->output = (float*) calloc(output_h * output_w * input_channels, sizeof(float));
-    layer->delta = (float*) calloc(output_h * output_w * input_channels, sizeof(float));
+    layer->delta = (float*) calloc(input_height * input_width * input_channels, sizeof(float)); // sized for the *input* gradient that maxpool_backward writes
 
     return layer;
 }
@@ -175,7 +196,8 @@ Layer* create_fc(int output_size, int input_size, activation type) {
     layer->type = fully_connected;
     layer->params.fc_params.output_size = output_size;
     layer->params.fc_params.type = type; // activation type can either be sigmoid or softmax (output layer)
-
+    layer->activation_g = (type == a_sigmoid) ? sigmoid_g : NULL; // NULL for softmax: its gradient is folded into the loss in fc_backward
+
     // use glorot initialization
     layer->params.fc_params.weights = (float*)calloc(output_size * input_size, sizeof(float));
     for (int i = 0; i < (output_size * input_size); i++) {
@@ -189,6 +211,7 @@ Layer* create_fc(int output_size, int input_size, activation type) {
     layer->channels = output_size;
     layer->output = (float*) calloc(output_size, sizeof(float));
-    layer->delta = (float*) calloc(output_size, sizeof(float));
+    layer->delta = (float*) calloc(input_size, sizeof(float)); // sized for the gradient w.r.t. this layer's input (see fc_backward)
+    layer->pre_activation = (float*) calloc(output_size, sizeof(float));
 
     return layer;
 }
@@ -197,110 +220,132 @@ void free_layer(Layer* layer) {
     switch (layer->type) {
         case input:
             free(layer->output);
-            free(layer);
+            free(layer);
             break;
         case conv:
             free(layer->params.conv_params.weights);
-            free(layer->params.conv_params.biases);
-            free(layer->output);
-            free(layer->delta);
-            free(layer);
+            free(layer->params.conv_params.biases);
+            free(layer->output);
+            free(layer->delta);
+            free(layer->pre_activation);
+            free(layer);
             break;
         case max_pool:
             free(layer->output);
-            free(layer->delta);
-            free(layer);
+            free(layer->delta);
+            free(layer);
             break;
         case fully_connected:
             free(layer->params.fc_params.weights);
-            free(layer->params.fc_params.biases);
-            free(layer->output);
-            free(layer->delta);
-            free(layer);
+            free(layer->params.fc_params.biases);
+            free(layer->output);
+            free(layer->delta);
+            free(layer->pre_activation);
+            free(layer);
             break;
     }
 }
 
+void destroy_network(Network* network) {
+    if (!network) return;
+    for (int i = 0; i < network->num_layers; i++) {
+        if (network->layers[i]) {
+            free_layer(network->layers[i]);
+        }
+    }
+    free(network->layers);
+    free(network);
+}
+
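A minimal lifecycle sketch for the pair of functions this hunk completes (the layer sizes here are illustrative; mnist.c below is the real driver). Since create_network sets num_layers to the full capacity up front, every slot must be assigned before the network is used; the zeroed slots plus destroy_network's NULL check keep teardown safe even if construction stops early:

    Network* net = create_network(3);
    net->layers[0] = create_input(28, 28, 1);
    net->layers[1] = create_fc(32, 28 * 28 * 1, a_sigmoid);
    net->layers[2] = create_fc(10, 32, a_softmax);
    // ... train or run inference ...
    destroy_network(net);   // frees each layer, the layer array, and the struct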
 void conv_forward(Layer* layer, float* input) {
     int padding = layer->params.conv_params.zero_padding;
     int stride = layer->params.conv_params.stride;
     int filter_size = layer->params.conv_params.filter_size;
     int num_filters = layer->params.conv_params.num_filters;
-    int input_height = layer->height; // from previous layer
-    int input_width = layer->width;
-    int input_channels = layer->channels;
-
+    int input_height = layer->params.conv_params.input_height;
+    int input_width = layer->params.conv_params.input_width;
+    int input_channels = layer->params.conv_params.input_channels;
+
     int padded_height = input_height + 2 * padding;
-    int padded_width = input_width + 2 * padding;
-    float* padded_input = (float*) calloc(padded_height * padded_width * input_channels, sizeof(float));
-
+    int padded_width = input_width + 2 * padding;
+    float* padded_input = (float*) calloc(padded_height * padded_width * input_channels, sizeof(float));
+
     for (int c = 0; c < input_channels; c++) {
-        for (int h = 0; h < input_height; h++) {
-            for (int w = 0; w < input_width; w++) {
-                padded_input[c * padded_height * padded_width + (h + padding) * padded_width + (w + padding)] = input[c * input_height * input_width + h * input_width + w];
-            }
-        }
-    }
-
+        for (int h = 0; h < input_height; h++) {
+            for (int w = 0; w < input_width; w++) {
+                padded_input[c * padded_height * padded_width + (h + padding) * padded_width + (w + padding)] = input[c * input_height * input_width + h * input_width + w];
+            }
+        }
+    }
+
     int output_height = (padded_height - filter_size) / stride + 1;
-    int output_width = (padded_width - filter_size) / stride + 1;
-    int output_size = output_height * output_width * num_filters;
+    int output_width = (padded_width - filter_size) / stride + 1;
+    int output_size = output_height * output_width * num_filters;
 
     // for every filter
-    for(int f = 0; f < num_filters; f++) {
-        // for height and width
-        for(int oh = 0; oh < output_height; oh++) {
-            for(int ow = 0; ow < output_width; ow++) {
-                float sum = 0;
-                // for each "channel (feature maps coming in)", and filter size.
-                for(int c = 0; c < input_channels; c++) {
-                    for(int fh = 0; fh < filter_size; fh++) {
-                        for(int fw = 0; fw < filter_size; fw++) {
-                            int ph = oh * stride + fh;
-                            int pw = ow * stride + fw;
-                            sum += padded_input[c * padded_height * padded_width + ph * padded_width + pw] * layer->params.conv_params.weights[f * input_channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw];
-                        }
-                    }
-                }
-                sum += layer->params.conv_params.biases[f];
-                layer->output[f * output_height * output_width + oh * output_width + ow] = relu(sum);
-            }
-        }
-    }
+    for(int f = 0; f < num_filters; f++) {
+        for(int oh = 0; oh < output_height; oh++) {
+            for(int ow = 0; ow < output_width; ow++) {
+                float sum = 0;
+                for(int c = 0; c < input_channels; c++) {
+                    for(int fh = 0; fh < filter_size; fh++) {
+                        for(int fw = 0; fw < filter_size; fw++) {
+                            int ih = oh * stride + fh;
+                            int iw = ow * stride + fw;
 
-    free(padded_input);
+
+                            if (ih >= 0 && ih < padded_height && iw >= 0 && iw < padded_width) {
+                                int input_idx = c * padded_height * padded_width + ih * padded_width + iw;
+                                int weight_idx = f * input_channels * filter_size * filter_size +
+                                                 c * filter_size * filter_size +
+                                                 fh * filter_size + fw;
+
+                                sum += padded_input[input_idx] * layer->params.conv_params.weights[weight_idx];
+                            }
+                        }
+                    }
+                }
+                sum += layer->params.conv_params.biases[f];
+                int output_idx = f * output_height * output_width + oh * output_width + ow;
+                layer->pre_activation[output_idx] = sum;
+                layer->output[output_idx] = relu(sum);
+            }
+        }
+    }
+
+    free(padded_input);
 }
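To see what the rewritten loop computes, here is one output element worked by hand in a standalone sketch (not part of cnn.c): one channel, a 3x3 input, a single 2x2 filter, stride 1, no padding, bias 0:

    #include <stdio.h>

    int main(void) {
        float in[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
        float w[2][2]  = {{1, 0}, {0, -1}};
        // output is (3 - 2) / 1 + 1 = 2 wide; element (oh=0, ow=0) gathers the top-left patch
        float sum = 0;
        for (int fh = 0; fh < 2; fh++)
            for (int fw = 0; fw < 2; fw++)
                sum += in[0 + fh][0 + fw] * w[fh][fw];
        printf("%.1f\n", sum);   // 1*1 + 2*0 + 4*0 + 5*(-1) = -4.0
        return 0;
    }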
 
 void maxpool_forward(Layer* layer, float* input) {
     int pool_size = layer->params.pool_params.pool_size;
-    int stride = layer->params.pool_params.stride;
-    // prev layer
+    int stride = layer->params.pool_params.stride;
+    // prev layer
-    int input_height = layer->height;
-    int input_width = layer->width;
+    int input_height = layer->params.pool_params.input_height; // layer->height/width hold this layer's *output* size
+    int input_width = layer->params.pool_params.input_width;
     int input_channels = layer->channels;
-
+
     int output_height = (input_height - pool_size) / stride + 1;
-    int output_width = (input_width - pool_size) / stride + 1;
-    int output_size = output_height * output_width * input_channels;
+    int output_width = (input_width - pool_size) / stride + 1;
+    int output_size = output_height * output_width * input_channels;
 
     for(int c = 0; c < input_channels; c++) {
-        for(int oh = 0; oh < output_height; oh++) {
-            for(int ow = 0; ow < output_width; ow++) {
-                float max_val = -INFINITY;
-                for(int ph = 0; ph < pool_size; ph++) {
-                    for(int pw = 0; pw < pool_size; pw++) {
-                        int ih = oh * stride + ph;
-                        int iw = ow * stride + pw;
-                        float val = input[c * input_height * input_width + ih * input_width + iw];
-                        if(val > max_val) {
-                            max_val = val;
-                        }
-                    }
-                }
-                layer->output[c * output_height * output_width + oh * output_width + ow] = max_val;
-            }
-        }
-    }
+        for(int oh = 0; oh < output_height; oh++) {
+            for(int ow = 0; ow < output_width; ow++) {
+                float max_val = -INFINITY;
+                for(int ph = 0; ph < pool_size; ph++) {
+                    for(int pw = 0; pw < pool_size; pw++) {
+                        int ih = oh * stride + ph;
+                        int iw = ow * stride + pw;
+                        float val = input[c * input_height * input_width + ih * input_width + iw];
+                        if(val > max_val) {
+                            max_val = val;
+                        }
+                    }
+                }
+                layer->output[c * output_height * output_width + oh * output_width + ow] = max_val;
+            }
+        }
+    }
 }
 
 void fc_forward(Layer* layer, float* input) {
@@ -309,53 +354,55 @@ void fc_forward(Layer* layer, float* input) {
 
     // flatten
     float* flattened_input = (float*) calloc(input_size, sizeof(float));
-    for(int i = 0; i < input_size; i++) {
-        flattened_input[i] = input[i];
-    }
-
+    for(int i = 0; i < input_size; i++) {
+        flattened_input[i] = input[i];
+    }
+
     // matmul (output = bias + (input * weight))
     float* temp_output = (float*) calloc(output_size, sizeof(float));
-    for(int o = 0; o < output_size; o++) {
-        float sum = 0;
-        for(int i = 0; i < input_size; i++) {
-            sum += flattened_input[i] * layer->params.fc_params.weights[o * input_size + i];
-        }
-        sum += layer->params.fc_params.biases[o];
-        temp_output[o] = sum;
-    }
+    for(int o = 0; o < output_size; o++) {
+        float sum = 0;
+        for(int i = 0; i < input_size; i++) {
+            sum += flattened_input[i] * layer->params.fc_params.weights[o * input_size + i];
+        }
+        sum += layer->params.fc_params.biases[o];
+        temp_output[o] = sum;
+    }
 
     // apply the correct activation (sigmoid for non output layers, softmax for output)
     if(layer->params.fc_params.type == a_sigmoid) {
-        for(int o = 0; o < output_size; o++) {
-            layer->output[o] = sigmoid(temp_output[o]);
-        }
-    } else if(layer->params.fc_params.type == a_softmax) {
-        softmax(temp_output, layer->output, output_size);
-    }
+        for(int o = 0; o < output_size; o++) {
+            layer->pre_activation[o] = temp_output[o];
+            layer->output[o] = sigmoid(temp_output[o]);
+        }
+    } else if(layer->params.fc_params.type == a_softmax) {
+        softmax(temp_output, layer->output, output_size);
+    }
 
-    free(temp_output);
-    free(flattened_input);
+    free(temp_output);
+    free(flattened_input);
 }
 
 void forward_propagation(Layer* layer, float* input_fc) {
+    int input_size;
     switch(layer->type) {
         case input:
-            // input to layer->output
-            int input_size = layer->height * layer->width * layer->channels;
-            for(int i = 0; i < input_size; i++) {
-                layer->output[i] = input_fc[i];
-            }
-            break;
-        case conv:
-            conv_forward(layer, input);
-            break;
-        case max_pool:
-            maxpool_forward(layer, input);
-            break;
-        case fully_connected:
-            fc_forward(layer, input);
-            break;
-    }
+            // input to layer->output
+            input_size = (layer->height * layer->width * layer->channels);
+            for(int i = 0; i < input_size; i++) {
+                layer->output[i] = input_fc[i];
+            }
+            break;
+        case conv:
+            conv_forward(layer, input_fc);
+            break;
+        case max_pool:
+            maxpool_forward(layer, input_fc);
+            break;
+        case fully_connected:
+            fc_forward(layer, input_fc);
+            break;
+    }
 }
 
 void network_forward(Network* network, float* input) {
@@ -368,91 +415,191 @@ void network_forward(Network* network, float* input) {
 }
 
 void fc_backward(Layer* layer, float* prev_delta, float* input, float learning_rate) {
     int output_size = layer->params.fc_params.output_size;
-    int input_size = layer->height * layer->width * layer->channels;
+    int input_size = layer->params.fc_params.input_size; // assumption: fc_params records its fan-in, as fc_forward uses; layer->height/width/channels describe this layer's own output
 
-    // gradient of weights
+    // gradient w/respect to this layer's pre-activation
+    float* gradient = (float*)malloc(output_size * sizeof(float));
+    if(layer->params.fc_params.type == a_softmax) {
+        // softmax + cross-entropy: gradient is p - y, with prev_delta holding the one-hot label
+        for(int i = 0; i < output_size; i++) {
+            gradient[i] = layer->output[i];
+            if(prev_delta[i] > 0.5) { // one hot encoded
+                gradient[i] -= 1.0f;
+            }
+        }
+    } else {
+        // sigmoid: scale the incoming delta by this layer's activation derivative
+        for(int i = 0; i < output_size; i++) {
+            gradient[i] = prev_delta[i] * layer->activation_g(layer->pre_activation[i]);
+        }
+    }
+
+    // gradient w/respect to inputs (computed before the weights are updated)
+    for(int i = 0; i < input_size; i++) {
+        float sum = 0;
+        for(int o = 0; o < output_size; o++) {
+            sum += layer->params.fc_params.weights[o * input_size + i] * gradient[o];
+        }
+        layer->delta[i] = sum;
+    }
+
+    // update weights and biases
     for(int o = 0; o < output_size; o++) {
-        for(int i = 0; i < input_size; i++) {
-            layer->params.fc_params.weights[o * input_size + i] -= learning_rate * prev_delta[o] * input[i];
-        }
-        layer->params.fc_params.biases[o] -= learning_rate * prev_delta[o];
-    }
+        for(int i = 0; i < input_size; i++) {
+            layer->params.fc_params.weights[o * input_size + i] -=
+                learning_rate * gradient[o] * input[i];
+        }
+        layer->params.fc_params.biases[o] -= learning_rate * gradient[o];
+    }
 
-    // gradient w/respect to inputs
-    for(int i = 0; i < input_size; i++) {
-        float sum = 0;
-        for(int o = 0; o < output_size; o++) {
-            sum += layer->params.fc_params.weights[o * input_size + i] * prev_delta[o];
-        }
-        layer->delta[i] = sum * layer->activation_g(layer->pre_activation[i]);
-    }
+    free(gradient);
 }
+
+
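The softmax branch above leans on the standard identity: for cross-entropy loss L = -sum_j y_j * log(p_j) over p = softmax(z), the gradient collapses to dL/dz_i = p_i - y_i, which is exactly the gradient[i] = output[i] - 1 computed from the one-hot prev_delta. A self-contained finite-difference check of that identity (illustrative only, not part of cnn.c):

    #include <stdio.h>
    #include <math.h>

    static void softmax3(const double* z, double* p) {
        double m = fmax(z[0], fmax(z[1], z[2])), s = 0;
        for (int i = 0; i < 3; i++) { p[i] = exp(z[i] - m); s += p[i]; }
        for (int i = 0; i < 3; i++) p[i] /= s;
    }

    static double ce_loss(const double* z, const double* y) {
        double p[3], l = 0;
        softmax3(z, p);
        for (int i = 0; i < 3; i++) l -= y[i] * log(p[i]);
        return l;
    }

    int main(void) {
        double z[3] = {0.2, -1.0, 0.7}, y[3] = {0, 0, 1}, p[3];
        softmax3(z, p);
        for (int i = 0; i < 3; i++) {
            double zp[3] = {z[0], z[1], z[2]}, zm[3] = {z[0], z[1], z[2]};
            zp[i] += 1e-6; zm[i] -= 1e-6;
            double num = (ce_loss(zp, y) - ce_loss(zm, y)) / 2e-6;
            printf("analytic %+.6f  numeric %+.6f\n", p[i] - y[i], num);
        }
        return 0;
    }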
 void conv_backward(Layer* layer, float* prev_delta, float* input, float learning_rate) {
     int num_filters = layer->params.conv_params.num_filters;
-    int channels = layer->channels;
-    int filter_size = layer->params.conv_params.filter_size;
-    int input_height = layer->height;
-    int input_width = layer->width;
-    int padding = layer->params.conv_params.zero_padding;
-    int stride = layer->params.conv_params.stride;
-    int output_height = (input_height + 2 * padding - filter_size) / stride + 1;
-    int output_width = (input_width + 2 * padding - filter_size) / stride + 1;
+    int channels = layer->params.conv_params.input_channels; // input dims stored at construction; layer->height/width/channels are the output shape
+    int filter_size = layer->params.conv_params.filter_size;
+    int input_height = layer->params.conv_params.input_height;
+    int input_width = layer->params.conv_params.input_width;
+    int padding = layer->params.conv_params.zero_padding;
+    int stride = layer->params.conv_params.stride;
+    int output_height = (input_height + 2 * padding - filter_size) / stride + 1;
+    int output_width = (input_width + 2 * padding - filter_size) / stride + 1;
+
+    // gradient w/respect to this layer's pre-activation: apply the relu
+    // derivative to the incoming delta once, then reuse it below
+    int output_size = output_height * output_width * num_filters;
+    float* gradient = (float*)malloc(output_size * sizeof(float));
+    for(int idx = 0; idx < output_size; idx++) {
+        gradient[idx] = prev_delta[idx] * layer->activation_g(layer->pre_activation[idx]);
+    }
 
-    // gradient w/respect to filters
-    for(int f = 0; f < num_filters; f++) {
-        for(int c = 0; c < channels; c++) {
-            for(int fh = 0; fh < filter_size; fh++) {
-                for(int fw = 0; fw < filter_size; fw++) {
-                    float grad = 0;
-                    for(int oh = 0; oh < output_height; oh++) {
-                        for(int ow = 0; ow < output_width; ow++) {
-                            int ih = oh * stride + fh - padding;
-                            int iw = ow * stride + fw - padding;
-                            if(ih >= 0 && ih < input_height && iw >= 0 && iw < input_width) {
-                                grad += input[c * input_height * input_width + ih * input_width + iw] * prev_delta[f * output_height * output_width + oh * output_width + ow];
-                            }
-                        }
-                    }
-                    int index = f * channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw;
-                    layer->params.conv_params.weights[index] -= learning_rate * grad;
-                }
-            }
-        }
-    }
-
-    // gradient w/respect to biases
-    for(int f = 0; f < num_filters; f++) {
-        float grad = 0;
-        for(int oh = 0; oh < output_height; oh++) {
-            for(int ow = 0; ow < output_width; ow++) {
-                grad += prev_delta[f * output_height * output_width + oh * output_width + ow];
-            }
-        }
-        layer->params.conv_params.biases[f] -= learning_rate * grad;
-    }
-
-    // gradient with respect to inputs
-    for(int c = 0; c < channels; c++) {
-        for(int ih = 0; ih < input_height; ih++) {
-            for(int iw = 0; iw < input_width; iw++) {
-                float grad = 0;
-                for(int f = 0; f < num_filters; f++) {
-                    for(int fh = 0; fh < filter_size; fh++) {
-                        for(int fw = 0; fw < filter_size; fw++) {
-                            int oh = (ih - fh + padding) / stride;
-                            int ow = (iw - fw + padding) / stride;
-                            if((ih - fh + padding) % stride == 0 && (iw - fw + padding) % stride == 0 && oh < output_height && ow < output_width) {
-                                int w_index = f * channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw;
-                                grad += layer->params.conv_params.weights[w_index] * prev_delta[f * output_height * output_width + oh * output_width + ow];
-                            }
-                        }
-                    }
-                }
-                layer->delta[c * input_height * input_width + ih * input_width + iw] = grad * layer->activation_g(layer->pre_activation[c * input_height * input_width + ih * input_width + iw]);
-            }
-        }
-    }
+    // gradient w/respect to filters
+    for(int f = 0; f < num_filters; f++) {
+        for(int c = 0; c < channels; c++) {
+            for(int fh = 0; fh < filter_size; fh++) {
+                for(int fw = 0; fw < filter_size; fw++) {
+                    float grad = 0;
+                    for(int oh = 0; oh < output_height; oh++) {
+                        for(int ow = 0; ow < output_width; ow++) {
+                            int ih = oh * stride + fh - padding;
+                            int iw = ow * stride + fw - padding;
+                            if(ih >= 0 && ih < input_height && iw >= 0 && iw < input_width) {
+                                grad += input[c * input_height * input_width + ih * input_width + iw] * gradient[f * output_height * output_width + oh * output_width + ow];
+                            }
+                        }
+                    }
+                    int index = f * channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw;
+                    layer->params.conv_params.weights[index] -= learning_rate * grad;
+                }
+            }
+        }
+    }
 
+    // gradient w/respect to biases
+    for(int f = 0; f < num_filters; f++) {
+        float grad = 0;
+        for(int oh = 0; oh < output_height; oh++) {
+            for(int ow = 0; ow < output_width; ow++) {
+                grad += gradient[f * output_height * output_width + oh * output_width + ow];
+            }
+        }
+        layer->params.conv_params.biases[f] -= learning_rate * grad;
+    }
+
+    // gradient with respect to inputs
+    for(int c = 0; c < channels; c++) {
+        for(int ih = 0; ih < input_height; ih++) {
+            for(int iw = 0; iw < input_width; iw++) {
+                float grad = 0;
+                for(int f = 0; f < num_filters; f++) {
+                    for(int fh = 0; fh < filter_size; fh++) {
+                        for(int fw = 0; fw < filter_size; fw++) {
+                            int oh = (ih - fh + padding) / stride;
+                            int ow = (iw - fw + padding) / stride;
+                            if((ih - fh + padding) % stride == 0 && (iw - fw + padding) % stride == 0 && oh >= 0 && ow >= 0 && oh < output_height && ow < output_width) {
+                                int w_index = f * channels * filter_size * filter_size + c * filter_size * filter_size + fh * filter_size + fw;
+                                grad += layer->params.conv_params.weights[w_index] * gradient[f * output_height * output_width + oh * output_width + ow];
+                            }
+                        }
+                    }
+                }
+                layer->delta[c * input_height * input_width + ih * input_width + iw] = grad;
+            }
+        }
+    }
+
+    free(gradient);
 }
 
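The input-gradient loop above is a gather: input cell (ih, iw) receives a contribution from filter tap (fh, fw) only when oh = (ih - fh + padding) / stride is an integer that lands inside the output, and likewise for ow. That is also why the guard needs oh >= 0 and ow >= 0 on top of the upper bounds: with positive padding the numerator can go negative. Restated as a one-axis predicate (a hypothetical helper, not in cnn.c):

    static int contributes(int i, int f, int padding, int stride, int out_dim) {
        int num = i - f + padding;   // solves i = o * stride + f - padding for o
        return num % stride == 0 && num / stride >= 0 && num / stride < out_dim;
    }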
+void maxpool_backward(Layer* layer, float* prev_delta, float* input, float learning_rate) {
+    int pool_size = layer->params.pool_params.pool_size;
+    int stride = layer->params.pool_params.stride;
+    int input_height = layer->params.pool_params.input_height;
+    int input_width = layer->params.pool_params.input_width;
+    int channels = layer->channels;
+
+    // Zero initialize deltas
+    memset(layer->delta, 0, input_height * input_width * channels * sizeof(float));
+
+    int output_height = layer->height;
+    int output_width = layer->width;
+
+    for(int c = 0; c < channels; c++) {
+        for(int oh = 0; oh < output_height; oh++) {
+            for(int ow = 0; ow < output_width; ow++) {
+                // finds max value
+                int maxI = -1, maxJ = -1;
+                float maxVal = -INFINITY;
+
+                for(int ph = 0; ph < pool_size; ph++) {
+                    for(int pw = 0; pw < pool_size; pw++) {
+                        int ih = oh * stride + ph;
+                        int iw = ow * stride + pw;
+
+                        // checks bounds
+                        if (ih < input_height && iw < input_width) {
+                            float val = input[c * input_height * input_width + ih * input_width + iw];
+                            if(val > maxVal) {
+                                maxVal = val;
+                                maxI = ih;
+                                maxJ = iw;
+                            }
+                        }
+                    }
+                }
+
+                // only propagate gradient if a valid max position is found
+                if(maxI != -1 && maxJ != -1) {
+                    int delta_idx = c * output_height * output_width + oh * output_width + ow;
+                    layer->delta[c * input_height * input_width + maxI * input_width + maxJ] =
+                        prev_delta[delta_idx];
+                }
+            }
+        }
+    }
+}
+
+void backward_propagation(Layer* layer, float* prev_delta, float* input_fc, float learning_rate) {
+    switch(layer->type) {
+        case fully_connected:
+            fc_backward(layer, prev_delta, input_fc, learning_rate);
+            break;
+        case conv:
+            conv_backward(layer, prev_delta, input_fc, learning_rate);
+            break;
+        case max_pool:
+            maxpool_backward(layer, prev_delta, input_fc, learning_rate);
+            break;
+        case input:
+            // No backpropagation for input layer
+            break;
+    }
+}
+
+void network_backward(Network* network, float* label, float learning_rate) {
+    // output layer: fc_backward's softmax branch consumes the one-hot label directly
+    int last = network->num_layers - 1;
+    backward_propagation(network->layers[last], label,
+                         network->layers[last - 1]->output, learning_rate);
+    // hidden layers: each layer reads the delta of the layer above and is fed
+    // its own input, i.e. the output of the layer below
+    for(int i = last - 1; i > 0; i--) {
+        backward_propagation(network->layers[i], network->layers[i + 1]->delta,
+                             network->layers[i - 1]->output, learning_rate);
+    }
+}
diff --git a/mnist.c b/mnist.c
new file mode 100644
index 0000000..4b2025b
--- /dev/null
+++ b/mnist.c
@@ -0,0 +1,189 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "cnn.c"
+
+#define IMG_HEIGHT 28
+#define IMG_WIDTH 28
+#define NUM_CLASSES 10
+#define BATCH_SIZE 32
+#define LEARNING_RATE 0.01
+#define NUM_EPOCHS 10
+
+float* read_mnist_images(const char* filename, int* num_images) {
+    FILE* fp = fopen(filename, "rb");
+    if (!fp) {
+        printf("Error opening file %s\n", filename);
+        return NULL;
+    }
+
+    int magic_number = 0;
+    fread(&magic_number, sizeof(int), 1, fp);
+    magic_number = ((magic_number & 0xff000000) >> 24) |
+                   ((magic_number & 0x00ff0000) >> 8) |
+                   ((magic_number & 0x0000ff00) << 8) |
+                   ((magic_number & 0x000000ff) << 24);
+
+    if (magic_number != 2051) {
+        printf("Invalid MNIST image file format\n");
+        fclose(fp);
+        return NULL;
+    }
+
+    fread(num_images, sizeof(int), 1, fp);
+    *num_images = ((*num_images & 0xff000000) >> 24) |
+                  ((*num_images & 0x00ff0000) >> 8) |
+                  ((*num_images & 0x0000ff00) << 8) |
+                  ((*num_images & 0x000000ff) << 24);
+
+    int rows, cols;
+    fread(&rows, sizeof(int), 1, fp);
+    fread(&cols, sizeof(int), 1, fp);
+    rows = ((rows & 0xff000000) >> 24) |
+           ((rows & 0x00ff0000) >> 8) |
+           ((rows & 0x0000ff00) << 8) |
+           ((rows & 0x000000ff) << 24);
+    cols = ((cols & 0xff000000) >> 24) |
+           ((cols & 0x00ff0000) >> 8) |
+           ((cols & 0x0000ff00) << 8) |
+           ((cols & 0x000000ff) << 24);
+
+    if (rows != IMG_HEIGHT || cols != IMG_WIDTH) {
printf("Invalid image dimensions\n"); + fclose(fp); + return NULL; + } + + float* images = (float*)malloc(*num_images * IMG_HEIGHT * IMG_WIDTH * sizeof(float)); + unsigned char* temp = (unsigned char*)malloc(IMG_HEIGHT * IMG_WIDTH); + + for (int i = 0; i < *num_images; i++) { + fread(temp, 1, IMG_HEIGHT * IMG_WIDTH, fp); + for (int j = 0; j < IMG_HEIGHT * IMG_WIDTH; j++) { + images[i * IMG_HEIGHT * IMG_WIDTH + j] = temp[j] / 255.0f; + } + } + + free(temp); + fclose(fp); + return images; +} + +float* read_mnist_labels(const char* filename, int* num_labels) { + FILE* fp = fopen(filename, "rb"); + if (!fp) { + printf("Error opening file %s\n", filename); + return NULL; + } + + int magic_number = 0; + fread(&magic_number, sizeof(int), 1, fp); + magic_number = ((magic_number & 0xff000000) >> 24) | + ((magic_number & 0x00ff0000) >> 8) | + ((magic_number & 0x0000ff00) << 8) | + ((magic_number & 0x000000ff) << 24); + + if (magic_number != 2049) { + printf("Invalid MNIST label file format\n"); + fclose(fp); + return NULL; + } + + fread(num_labels, sizeof(int), 1, fp); + *num_labels = ((*num_labels & 0xff000000) >> 24) | + ((*num_labels & 0x00ff0000) >> 8) | + ((*num_labels & 0x0000ff00) << 8) | + ((*num_labels & 0x000000ff) << 24); + + float* labels = (float*)calloc(*num_labels * NUM_CLASSES, sizeof(float)); + unsigned char* temp = (unsigned char*)malloc(*num_labels); + + fread(temp, 1, *num_labels, fp); + for (int i = 0; i < *num_labels; i++) { + labels[i * NUM_CLASSES + temp[i]] = 1.0f; + } + + free(temp); + fclose(fp); + return labels; +} + +int main() { + // load mnist + int num_train_images, num_train_labels; + float* train_images = read_mnist_images("train-images-idx3-ubyte", &num_train_images); + float* train_labels = read_mnist_labels("train-labels-idx1-ubyte", &num_train_labels); + + // creating a lenet-5 inspired network + Network* network = create_network(8); + network->layers[0] = create_input(IMG_HEIGHT, IMG_WIDTH, 1); + network->layers[1] = create_conv(IMG_HEIGHT, IMG_WIDTH, 1, 6, 5, 1, 2); + network->layers[2] = create_maxpool(network->layers[1]->height, network->layers[1]->width, network->layers[1]->channels, 2, 2); + network->layers[3] = create_conv(network->layers[2]->height, network->layers[2]->width, network->layers[2]->channels, 16, 5, 1, 0); + network->layers[4] = create_maxpool(network->layers[3]->height, network->layers[3]->width, network->layers[3]->channels, 2, 2); + network->layers[5] = create_fc(120, network->layers[4]->height * network->layers[4]->width * network->layers[4]->channels, a_sigmoid); + network->layers[6] = create_fc(84, 120, a_sigmoid); + network->layers[7] = create_fc(NUM_CLASSES, 84, a_softmax); + + // training loop + for (int epoch = 0; epoch < NUM_EPOCHS; epoch++) { + float total_loss = 0.0f; + int correct = 0; + + for (int i = 0; i < num_train_images; i++) { + // forward pass + network_forward(network, &train_images[i * IMG_HEIGHT * IMG_WIDTH]); + + // accuracy + float* output = network->layers[network->num_layers - 1]->output; + int predicted = 0; + float max_prob = output[0]; + for (int j = 1; j < NUM_CLASSES; j++) { + if (output[j] > max_prob) { + max_prob = output[j]; + predicted = j; + } + } + + int true_label = 0; + for (int j = 0; j < NUM_CLASSES; j++) { + if (train_labels[i * NUM_CLASSES + j] > 0.5f) { + true_label = j; + break; + } + } + + if (predicted == true_label) correct++; + + // backprop + network_backward(network, &train_labels[i * NUM_CLASSES], LEARNING_RATE); + + // cross entropy loss + float loss = 0.0f; + for (int j = 0; j < 
+int main() {
+    // load mnist
+    int num_train_images, num_train_labels;
+    float* train_images = read_mnist_images("train-images.idx3-ubyte", &num_train_images); // filenames match the IDX files committed below
+    float* train_labels = read_mnist_labels("train-labels.idx1-ubyte", &num_train_labels);
+
+    // creating a lenet-5 inspired network
+    Network* network = create_network(8);
+    network->layers[0] = create_input(IMG_HEIGHT, IMG_WIDTH, 1);
+    network->layers[1] = create_conv(IMG_HEIGHT, IMG_WIDTH, 1, 6, 5, 1, 2);
+    network->layers[2] = create_maxpool(network->layers[1]->height, network->layers[1]->width, network->layers[1]->channels, 2, 2);
+    network->layers[3] = create_conv(network->layers[2]->height, network->layers[2]->width, network->layers[2]->channels, 16, 5, 1, 0);
+    network->layers[4] = create_maxpool(network->layers[3]->height, network->layers[3]->width, network->layers[3]->channels, 2, 2);
+    network->layers[5] = create_fc(120, network->layers[4]->height * network->layers[4]->width * network->layers[4]->channels, a_sigmoid);
+    network->layers[6] = create_fc(84, 120, a_sigmoid);
+    network->layers[7] = create_fc(NUM_CLASSES, 84, a_softmax);
+
+    // training loop
+    for (int epoch = 0; epoch < NUM_EPOCHS; epoch++) {
+        float total_loss = 0.0f;
+        int correct = 0;
+
+        for (int i = 0; i < num_train_images; i++) {
+            // forward pass
+            network_forward(network, &train_images[i * IMG_HEIGHT * IMG_WIDTH]);
+
+            // accuracy
+            float* output = network->layers[network->num_layers - 1]->output;
+            int predicted = 0;
+            float max_prob = output[0];
+            for (int j = 1; j < NUM_CLASSES; j++) {
+                if (output[j] > max_prob) {
+                    max_prob = output[j];
+                    predicted = j;
+                }
+            }
+
+            int true_label = 0;
+            for (int j = 0; j < NUM_CLASSES; j++) {
+                if (train_labels[i * NUM_CLASSES + j] > 0.5f) {
+                    true_label = j;
+                    break;
+                }
+            }
+
+            if (predicted == true_label) correct++;
+
+            // backprop
+            network_backward(network, &train_labels[i * NUM_CLASSES], LEARNING_RATE);
+
+            // cross entropy loss
+            float loss = 0.0f;
+            for (int j = 0; j < NUM_CLASSES; j++) {
+                if (train_labels[i * NUM_CLASSES + j] > 0.5f) {
+                    loss -= log(output[j] + 1e-10);
+                }
+            }
+            total_loss += loss;
+
+            // progress
+            if ((i + 1) % 100 == 0) {
+                printf("Epoch %d/%d, Step %d/%d, Loss: %.4f, Accuracy: %.2f%%\n",
+                       epoch + 1, NUM_EPOCHS, i + 1, num_train_images,
+                       total_loss / (i + 1), 100.0f * correct / (i + 1));
+            }
+        }
+
+        printf("Epoch %d/%d completed, Average Loss: %.4f, Accuracy: %.2f%%\n",
+               epoch + 1, NUM_EPOCHS, total_loss / num_train_images,
+               100.0f * correct / num_train_images);
+    }
+
+    // Clean up
+    free(train_images);
+    free(train_labels);
+    destroy_network(network);
+
+    return 0;
+}
diff --git a/train-images.idx3-ubyte b/train-images.idx3-ubyte
new file mode 100644
index 0000000..bbce276
Binary files /dev/null and b/train-images.idx3-ubyte differ
diff --git a/train-labels.idx1-ubyte b/train-labels.idx1-ubyte
new file mode 100644
index 0000000..d6b4c5d
Binary files /dev/null and b/train-labels.idx1-ubyte differ
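Since mnist.c includes cnn.c directly, the program builds as a single translation unit; a build along these lines should work (the compiler and flags are an assumption; -lm is needed for exp/log/sqrt):

    cc -O2 -o mnist mnist.c -lm
    ./mnist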