diff --git a/src/cudaKernels.cu b/src/cudaKernels.cu
index cdf1496cab39d65f794b5220b56269af88c53b65..f9c81dd8f745f6fcde8d352417dec5fbdf04df21 100644
--- a/src/cudaKernels.cu
+++ b/src/cudaKernels.cu
@@ -7,7 +7,10 @@
 #include <stdio.h>
 #include "tensor.h"
 
-#define FLT_MAX 3.402823466e+38F;
+
+#define BLOCK_SIZE 32
+#define FLT_MAX 3.402823466e+38F
+// elements per BLOCK_SIZE x BLOCK_SIZE shared-memory tile
+#define SHMEM_SIZE (BLOCK_SIZE * BLOCK_SIZE)
 
 // Allocates memory on the GPU and copies the payload from the host
 cudaError_t loadToGpu(void** dstGpu, float* srcHost, int size){
@@ -80,13 +83,13 @@ __global__ void maxPoolKernel(float* dst, float* src, int input_rows, int input_
 
 
 // TODO: make add a separate function
-__global__ void matmulKernel(float* A, float* B, float* C, float* bias, int M, int N, int K)
+__global__ void matmulKernel(float* A, float* B, float* C, int M, int N, int K)
 {
     int row = blockIdx.y * blockDim.y + threadIdx.y;
     int col = blockIdx.x * blockDim.x + threadIdx.x;
     if (row < M && col < K)
     {
-        float sum = bias[col];
+        float sum = 0;
         for (int n = 0; n < N; ++n)
         {
             sum += A[row*N + n] * B[n*K + col];
@@ -97,6 +100,54 @@ __global__ void matmulKernel(float* A, float* B, float* C, float* bias, int M, i
 }
 
 
+// Tiled matmul C = A * B (A: MxN, B: NxK, C: MxK). Each block computes one
+// BLOCK_SIZE x BLOCK_SIZE tile of C, staging tiles of A and B in shared memory.
+__global__ void matmulSharedKernel(float* A, float* B, float* C, int M, int N, int K) {
+    // tile size == thread block size
+    __shared__ float AS[SHMEM_SIZE];
+    __shared__ float BS[SHMEM_SIZE];
+
+    int row = threadIdx.y;
+    int col = threadIdx.x;
+    // global coordinates of this thread's element of C
+    int globalRow = blockIdx.y * BLOCK_SIZE + row;
+    int globalCol = blockIdx.x * BLOCK_SIZE + col;
+
+    float tmp = 0.0f;
+    for (int m = 0; m < (N + BLOCK_SIZE - 1) / BLOCK_SIZE; m++) {
+        // stage one tile of A and B, zero-padding reads past the matrix edges
+        int aCol = m * BLOCK_SIZE + col;
+        int bRow = m * BLOCK_SIZE + row;
+        AS[(row * BLOCK_SIZE) + col] = (globalRow < M && aCol < N) ? A[globalRow * N + aCol] : 0.0f;
+        BS[(row * BLOCK_SIZE) + col] = (bRow < N && globalCol < K) ? B[bRow * K + globalCol] : 0.0f;
+
+        __syncthreads();
+
+        for (int e = 0; e < BLOCK_SIZE; e++) {
+            tmp += AS[(row * BLOCK_SIZE) + e] * BS[(e * BLOCK_SIZE) + col];
+        }
+
+        __syncthreads();
+    }
+    if (globalRow < M && globalCol < K) {
+        C[(globalRow * K) + globalCol] = tmp;
+    }
+}
+
+
+/*
+// Experimental FP16 matmul using packed half2 arithmetic (not yet wired up).
+// Each half2 in B and C packs two adjacent columns, so there are K/2 half2 columns.
+__global__ void matmulHalf2Kernel(__half2* A, __half2* B, __half2* C, int M, int N, int K) {
+    half* aHalf = (half*)A;
+    int col = threadIdx.x + blockDim.x * blockIdx.x;
+    int row = threadIdx.y + blockDim.y * blockIdx.y;
+    if (col < K / 2 && row < M) {
+        half2 sum = __float2half2_rn(0.0f);
+        for (int n = 0; n < N; n++) {
+            // broadcast one half of A against a packed pair of B columns
+            sum = __hfma2(__half2half2(aHalf[row * N + n]), B[n * (K / 2) + col], sum);
+        }
+        C[row * (K / 2) + col] = sum;
+    }
+}
+*/
+
 
 __global__ void conv2dKernel(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters) {
     int batch = blockIdx.x;
@@ -143,12 +194,31 @@ __global__ void reluKernel(float* input, int size) {
 
 
 // Multiplies matrix A x B -> C with one thread per element of C (memory must be allocated beforehand)
-void matMulCuda(float* A, float* B, float* C, float* Bias, int M, int N, int K){
-    matmulKernel<<<M, N>>>(
-        A, B, C, Bias, M, N, K
+void matMulCuda(float* A, float* B, float* C, int M, int N, int K){
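+    // one thread per output element; the grid is rounded up to cover all of C (MxK)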
+    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
+    dim3 gridDim((K + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);
+
+    // printf("Grid : {%d, %d, %d} blocks. Blocks : {%d, %d, %d} threads.\n",
+    // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z);
+    matmulKernel<<<gridDim, blockDim>>>(
+        A, B, C, M, N, K
     );
 }
 
+void matmulSharedCuda(float* A, float* B, float* C, int M, int N, int K) {
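+    // same launch shape as matMulCuda, but the kernel stages tiles of A and B in shared memory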
+    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
+    dim3 gridDim((K + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);
+    matmulSharedKernel<<<gridDim, blockDim>>>(
+        A, B, C, M, N, K
+    );
+    cudaError_t cudaStatus = cudaGetLastError();
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "matmulSharedKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
+        abort();
+    }
+}
+
 
 void conv2dCuda(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters, int height_new, int width_new) {
     int batch_size = 1;
@@ -158,9 +228,9 @@ void conv2dCuda(float* input, float* weights, float* bias, float* output, int in
 }
 
 void reluCuda(float* input, int size) {
-    int blockSize = 32;
-    int numBlocks = (size + blockSize - 1) / blockSize;
-    reluKernel<<<numBlocks, blockSize>>>(input, size);
+    int numBlocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    reluKernel<<<numBlocks, BLOCK_SIZE>>>(input, size);
 }
 
 void maxPoolCuda(float* dst, float* src, int input_rows, int input_cols, int output_rows, int output_cols, int channels) {
diff --git a/src/cudaKernels.cuh b/src/cudaKernels.cuh
index 9c2a9678a580067cf58e9e5c03089fd55e52aa8e..f99b0cebfe2f85a213135d3bd4d9e57ba942453f 100644
--- a/src/cudaKernels.cuh
+++ b/src/cudaKernels.cuh
@@ -7,11 +7,12 @@ cudaError_t loadFromGpu(float* dstHost, float* srcGpu, int size);
 __global__ void conv2dKernel(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters);
 __global__ void reluKernel(float* input, int size);
 __global__ void maxPoolKernel(float* dst, float* src, int input_rows, int input_cols);
-__global__ void matmulKernel(float* A, float* B, float* C, float* Bias, int M, int N, int K);
+__global__ void matmulKernel(float* A, float* B, float* C, int M, int N, int K);
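+__global__ void matmulSharedKernel(float* A, float* B, float* C, int M, int N, int K);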
 
 cudaError_t avgPoolCuda(float* dst, float* src, int sizeX, int sizeY);
 cudaError_t vecMulCuda(float* mx, float* vec, float* res, int sizeMxX, int sizeMxY);
-void matMulCuda(float* A, float* B, float* C, float* Bias, int M, int N, int K);
+void matMulCuda(float* A, float* B, float* C, int M, int N, int K);
+void matmulSharedCuda(float* A, float* B, float* C, int M, int N, int K);
 void reluCuda(float* input, int size);
 void maxPoolCuda(float* dst, float* src, int input_rows, int input_cols, int output_rows, int output_cols, int channels);
 void conv2dCuda(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters, int height_new, int width_new);
diff --git a/src/cudaOps.cc b/src/cudaOps.cc
index 6d8865377bcf22edfabfad0ce1a66bf013f74102..60b5874aeda2863fa346c5ad3144671397f63ba8 100644
--- a/src/cudaOps.cc
+++ b/src/cudaOps.cc
@@ -14,38 +14,51 @@ void CudaTensor::reshape(const std::vector<size_t>& new_shape) {
 
 void CudaTensor::permute(const shape_t& permutation) {}
 
-void CudaTensor::matmul(const CudaTensor& w, const CudaTensor& b) {
-    assert(!empty && !w.empty && !b.empty);
-    assertm(shape.size() == w.shape.size(), "Dimensions don't match");
-    assertm(shape.size() == 2, "Tensors need to be 2D");
+CudaTensor matmul(const CudaTensor& lhs, const CudaTensor& rhs) {
+    assert(!lhs.empty && !rhs.empty);
+    assertm(lhs.shape.size() == rhs.shape.size(), "Dimensions don't match");
+    assertm(lhs.shape.size() == 2, "Tensors need to be 2D");
     assertm(
-        shape[1] == w.shape[0],
+        lhs.shape[1] == rhs.shape[0],
         "Input tensor is of shape MxN but weight tensor is not of shape NxP");
 
-    size_t M = shape[0], N = w.shape[0], K = w.shape[1];
-    CudaTensor ret{{shape[0], w.shape[1]}};
-    // Bias
-    assertm(ret.size() == b.size(), "Bias has to be of the same size as result");
+    size_t M = lhs.shape[0], N = rhs.shape[0], K = rhs.shape[1];
+    CudaTensor ret{{lhs.shape[0], rhs.shape[1]}};
 
 
-    // TODO: currently default zero bias
-    //CudaTensor bias{{K, N}};
-
     matMulCuda(
-        dat, w.dat, ret.dat, b.dat, M, N, K
+        lhs.dat, rhs.dat, ret.dat, M, N, K
     );
 
-    swap(ret);
+    return ret;
 }
 
-void CudaTensor::conv2d(const CudaTensor& w, const CudaTensor& b, const std::string& padding) {
-    assert(!empty && !w.empty);
-    assertm(shape.size() == w.shape.size(),
+CudaTensor matmulShared(const CudaTensor& lhs, const CudaTensor& rhs) {
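+    // identical shape checks to matmul(); only the kernel dispatch differs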
+    assert(!lhs.empty && !rhs.empty);
+    assertm(lhs.shape.size() == rhs.shape.size(), "Dimensions don't match");
+    assertm(lhs.shape.size() == 2, "Tensors need to be 2D");
+    assertm(
+        lhs.shape[1] == rhs.shape[0],
+        "Input tensor is of shape MxN but weight tensor is not of shape NxP");
+
+    size_t M = lhs.shape[0], N = rhs.shape[0], K = rhs.shape[1];
+    CudaTensor ret{{lhs.shape[0], rhs.shape[1]}};
+
+    matmulSharedCuda(
+        lhs.dat, rhs.dat, ret.dat, M, N, K
+    );
+
+    return ret;
+}
+
+CudaTensor conv2d(const CudaTensor& lhs, const CudaTensor& w, const CudaTensor& b, const std::string& padding) {
+    assert(!lhs.empty && !w.empty);
+    assertm(lhs.shape.size() == w.shape.size(),
             "Input tensor and weight tensor have different dimensions");
-    assertm(shape.size() == 4, "Only 4D Tensors accepted, reshape if possible");
+    assertm(lhs.shape.size() == 4, "Only 4D Tensors accepted, reshape if possible");
     if (!b.empty) assertm(b.shape.size() <= 1, "if available, bias must be 1D");
 
-    const size_t N = shape[0], C = shape[1], H = shape[2], W = shape[3],
+    const size_t N = lhs.shape[0], C = lhs.shape[1], H = lhs.shape[2], W = lhs.shape[3],
                  OUT_C = w.shape[0], K_HW = w.shape[2];
     const size_t H_NEW = H - K_HW + 1, W_NEW = W - K_HW + 1;
     assertm(C == w.shape[1], "Input channels don't match with weight tensor");
@@ -56,10 +69,10 @@ void CudaTensor::conv2d(const CudaTensor& w, const CudaTensor& b, const std::str
     CudaTensor ret{{N, OUT_C, H_NEW, W_NEW}};
 
     conv2dCuda(
-        dat, w.dat, b.dat, ret.dat, H, W, K_HW, C, OUT_C , H_NEW, W_NEW
+        lhs.dat, w.dat, b.dat, ret.dat, H, W, K_HW, C, OUT_C , H_NEW, W_NEW
     );
 
-    swap(ret);
+    return ret;
 }
 
 /*
@@ -106,13 +119,13 @@ void CudaTensor::relu() {
 
 }
 
-void CudaTensor::max_pool(const shape_t& kernel_shape) {
-    assert(!empty);
+CudaTensor max_pool(const CudaTensor& lhs, const shape_t& kernel_shape) {
+    assert(!lhs.empty);
     assertm(kernel_shape.size() == 2, "Kernel not 2D");
     assertm(kernel_shape[0] == kernel_shape[1],
             "Kernel height not equal to width");
 
-    const size_t N = shape[0], C = shape[1], H = shape[2], W = shape[3];
+    const size_t N = lhs.shape[0], C = lhs.shape[1], H = lhs.shape[2], W = lhs.shape[3];
     assertm((H % kernel_shape[0]) == 0 && (W % kernel_shape[0]) == 0,
             "Kernel shape and input tensor width/height missmatch");
     const size_t H_NEW = H / kernel_shape[0], W_NEW = W / kernel_shape[0];
@@ -121,8 +134,8 @@ void CudaTensor::max_pool(const shape_t& kernel_shape) {
     CudaTensor ret{{N, C, H_NEW, W_NEW}};
 
     maxPoolCuda(
-        ret.dat, dat, H, W, H_NEW, W_NEW, C
+        ret.dat, lhs.dat, H, W, H_NEW, W_NEW, C
     );
 
-    swap(ret);
+    return ret;
 }
\ No newline at end of file
diff --git a/src/cudaTensor.cc b/src/cudaTensor.cc
index b653c8978888c4ec70cfae9e8d02e2f24e661f2c..d50115b0364fca2f7a0f27833b40299299a37326 100644
--- a/src/cudaTensor.cc
+++ b/src/cudaTensor.cc
@@ -302,6 +302,12 @@ void CudaTensor::compare(Tensor& rhs) {
     }
 }
 
+// Element-wise comparison; assumes dat is host-accessible (e.g. managed memory).
+void CudaTensor::compare(CudaTensor& rhs) {
+    assertm(size() == rhs.size(), "Tensors differ in size");
+    for (int i = 0; i < size(); i++) {
+        assertm(std::fabs(dat[i] - rhs.dat[i]) < 0.0001f, "Tensors are not equal");
+    }
+}
+
 void CudaTensor::fillTensor() {
     float* data = (float*) malloc(size()*sizeof(float));
     for (int i = 0; i < size(); i++) {
diff --git a/src/cudaTensor.h b/src/cudaTensor.h
index fb2479a1ec9b9b852be5087ee637dec0f88df111..b1096c44cdc59a125065137dba02cdb2afd3f3fa 100644
--- a/src/cudaTensor.h
+++ b/src/cudaTensor.h
@@ -13,9 +13,10 @@
 #define assertm(exp, msg) assert(((void)msg, exp))
 
 class CudaTensor : public Tensor {
-        bool empty = true;
 
     public:
+        bool empty = true;
         float* dat;
         shape_t shape;
         std::string name;
@@ -44,18 +45,20 @@ class CudaTensor : public Tensor {
         void load_data(float * data, int n);
         void print_tensor(std::ostream& os);
         void compare(Tensor& rhs);
+        void compare(CudaTensor& rhs);
         void fillTensor();
 
         // ops
         void reshape(const shape_t& new_shape);
         void permute(const shape_t& permutation);
-        void matmul(const CudaTensor& w, const CudaTensor& b);
-        void conv2d(const CudaTensor& w, const CudaTensor& b, const std::string& padding = "valid");
-        void conv2dWinograd(const CudaTensor& w, const CudaTensor& b, const std::string& padding = "valid");
-        void max_pool(const shape_t& kernel_shape);
-        void avg_pool(const shape_t& kernel_shape);
-        void add(const CudaTensor& b);
         void relu();
         void softmax();
 
-};
\ No newline at end of file
+};
+
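+// Free tensor ops that return new tensors (replacing the old in-place member ops):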
+CudaTensor matmul(const CudaTensor& lhs, const CudaTensor& rhs);
+CudaTensor matmulShared(const CudaTensor& lhs, const CudaTensor& rhs);
+CudaTensor conv2d(const CudaTensor& lhs, const CudaTensor& w, const CudaTensor& b, const std::string& padding = "valid");
+CudaTensor max_pool(const CudaTensor& lhs, const shape_t& kernel_shape);
+CudaTensor avg_pool(const CudaTensor& lhs, const shape_t& kernel_shape);
+CudaTensor add(const CudaTensor& lhs, const CudaTensor& b);
\ No newline at end of file
diff --git a/src/inference.cc b/src/inference.cc
index 4a6d767431dc5762f77eed1f5de1d79e8d94d6e6..858ab5faa94415b8f49ca76c5ec663e1ced8d3b2 100644
--- a/src/inference.cc
+++ b/src/inference.cc
@@ -1,5 +1,4 @@
 #include "inference.h"
-#include "cudaLenet5.cuh"
 
 double get_time(void);
 
@@ -346,19 +345,20 @@ void compareGpuToCpu() {
     get_input(model, collector);
     get_input(model, collectorCpu);
         
-    CudaTensor *input = collector[model.graph().input()[0].name()];
+    CudaTensor input = *collector[model.graph().input()[0].name()];
     Tensor *inputCpu = collectorCpu[model.graph().input()[0].name()];
-    load_image(*input, "mnist/testSample/img_2.jpg");
+    load_image(input, "mnist/testSample/img_2.jpg");
     load_image(*inputCpu, "mnist/testSample/img_2.jpg");
     std::cout<<"Loaded Image"<<std::endl;
     get_weights(model, collector);
     get_weights(model, collectorCpu);
     
-    input->print_tensor(std::cout);
+    input.print_tensor(std::cout);
 
     
 
-    input->conv2d(
+    input = conv2d(
+        input,
         *collector["sequential/conv2d/Conv2D/ReadVariableOp:0"],
         *collector["sequential/conv2d/BiasAdd/ReadVariableOp:0"]
     );
@@ -367,21 +367,23 @@ void compareGpuToCpu() {
         *collectorCpu["sequential/conv2d/BiasAdd/ReadVariableOp:0"]
     );
 
-    input->relu();
+    input.relu();
     inputCpu->relu();
   
     
     
-    input->max_pool(
+    input = max_pool(
+        input,
         {2, 2}
     );
     inputCpu->max_pool(
         {2, 2}
     );
 
-    input->compare(*inputCpu);
+    input.compare(*inputCpu);
 
-    input->conv2d(
+    input = conv2d(
+        input,
         *collector["sequential/conv2d_1/Conv2D/ReadVariableOp:0"],
         *collector["sequential/conv2d_1/BiasAdd/ReadVariableOp:0"]
     );
@@ -390,10 +392,13 @@ void compareGpuToCpu() {
         *collectorCpu["sequential/conv2d_1/BiasAdd/ReadVariableOp:0"]
     );
 
-    input->relu();
+    input.relu();
     inputCpu->relu();
 
-    input->max_pool(
+    input.compare(*inputCpu);
+
+    input = max_pool(
+        input,
         {2, 2}
     );
     inputCpu->max_pool(
@@ -401,7 +406,8 @@ void compareGpuToCpu() {
     );
 
 
-    input->conv2d(
+    input = conv2d(
+        input,
         *collector["sequential/conv2d_2/Conv2D/ReadVariableOp:0"],
         *collector["sequential/conv2d_2/BiasAdd/ReadVariableOp:0"]
     );
@@ -410,18 +416,25 @@ void compareGpuToCpu() {
         *collectorCpu["sequential/conv2d_2/BiasAdd/ReadVariableOp:0"]
     );
     
-    input->relu();
+    input.relu();
     inputCpu->relu();
     // Batch size is first element
-    input->reshape({1, 120});
+    input.reshape({1, 120});
     inputCpu->reshape({1, 120});
     
 
 
-    input->matmul(
-        *collector["sequential/dense/MatMul/ReadVariableOp:0"],
-        *collector["sequential/dense/BiasAdd/ReadVariableOp:0"]
+    //input->compare(*inputCpu);
+    input = matmul(
+        input,
+        *collector["sequential/dense/MatMul/ReadVariableOp:0"]
     );
     inputCpu->matmul(
         *collectorCpu["sequential/dense/MatMul/ReadVariableOp:0"]
     );
@@ -429,15 +442,19 @@ void compareGpuToCpu() {
 
         *collectorCpu["sequential/dense/BiasAdd/ReadVariableOp:0"]
     );
     
+    input.compare(*inputCpu);
     
 
-    input->relu();
+    input.relu();
     inputCpu->relu();
 
-    input->matmul(
-        *collector["sequential/dense_1/MatMul/ReadVariableOp:0"],
-        *collector["sequential/dense_1/BiasAdd/ReadVariableOp:0"]
+    input = matmul(
+        input,
+        *collector["sequential/dense_1/MatMul/ReadVariableOp:0"]
     );
     inputCpu->matmul(
         *collectorCpu["sequential/dense_1/MatMul/ReadVariableOp:0"]
@@ -445,12 +462,12 @@ void compareGpuToCpu() {
     inputCpu->add(
         *collectorCpu["sequential/dense_1/BiasAdd/ReadVariableOp:0"]
     );
-    input->relu();
+    input.relu();
     inputCpu->relu();
 
-    input->print_tensor(std::cout);
+    input.print_tensor(std::cout);
     inputCpu->print_tensor(std::cout);
-    input->print_shape(std::cout);
+    input.print_shape(std::cout);
     inputCpu->print_shape(std::cout);
 }
 
@@ -467,7 +484,46 @@ int main() {
     }
     */
 
-    compareGpuToCpu();
+    //compareGpuToCpu();
+
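+    // quick parity check: naive vs. shared-memory matmul on two random 10x10 tensors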
+    CudaTensor ct1{{10, 10}};
+    CudaTensor ct2{{10, 10}};
+    ct1.fillTensor();
+    ct2.fillTensor();
+    ct1.print_tensor(std::cout);
+    ct2.print_tensor(std::cout);
+
+    CudaTensor retShared = matmulShared(ct1, ct2);
+    CudaTensor ret = matmul(ct1, ct2);
+    ret.print_tensor(std::cout);
+    retShared.print_tensor(std::cout);
+
+    ret.compare(retShared);
     
     return 0;
 }