Skip to content
Snippets Groups Projects
Commit 2796fcb2 authored by Gabriel Falk's avatar Gabriel Falk
Browse files

Upload New File

parent 2cb2c650
Branches
No related tags found
No related merge requests found
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "matrix.h"
#include "cudaKernels.h"
//#include "cudaLenet5.h"
//#include "tensor.h"
// Benchmark repetition counts. Each test function below runs its operation
// this many times and reports the median runtime; odd counts give a true
// (single-element) median after sorting.
// NOTE(review): <runs> is not referenced anywhere in this file — presumably
// used by another translation unit or leftover; confirm before removing.
int runs = 211;
// Gives number of how often Conv is repeated
int runsConv = 201;
// Gives number of how often MatMul is repeated
int runsMatmul = 501;
// Gives number of how often AvgPool is repeated
int runsAvgPool = 1001;
//SELECT MEDIAN VALUE (Methodes to select Median out of allocated Buffer) [Frees Buffer afterwards & does not Allocate];
//--------------------------------------------------
// qsort comparator for doubles: ascending order.
// Returns 1 / -1 / 0 instead of a subtraction so large magnitudes cannot
// overflow or lose the sign when truncated to int.
int cmpfunc (const void * a, const void * b){
    double da = *(const double*)a;
    double db = *(const double*)b;
    if (da > db)
        return 1;
    if (da < db)
        return -1;
    return 0;
}

// Return the median of the first <slot> entries of <array> and free the
// buffer (caller allocates, this function releases — see header comment).
// The buffer is sorted in place first; for an even <slot> the upper-middle
// element is returned. Guards against a NULL/empty buffer (previously an
// out-of-bounds read) by returning 0.0.
double med(double* array, int slot){
    if (array == NULL || slot <= 0) {
        free(array);   // free(NULL) is a well-defined no-op
        return 0.0;
    }
    qsort(array, slot, sizeof(double), cmpfunc);
    double median = array[slot/2];
    free(array);
    return median;
}
//--------------------------------------------------
/*
Schema: Method takes inputs & optional Printboolean <0->noOutput / 1->Output> for debugging.
Method allocates a Array with size <runs> where the Runtime of each run is stored.
then the Median of this is determined and returned.
Inputs: for Conv <Matrix & Filter>
for Matmul <Matrix & Matrix>
for AvgPool <Matrix>
*/
//TESTS-CPU
// Benchmark the simple CPU convolution: runs conv2d <runsConv> times,
// records each wall-clock duration and returns the median (seconds).
// Takes ownership of <input> and <filter>; both are destroyed on return.
// prinT != 0 prints operands and the last result for debugging.
double testCPUsimpConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = conv2d(input, filter);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // BUGFIX: dst was overwritten each iteration, leaking every
        // intermediate result matrix. Free all but the last (kept for the
        // optional printout below). Freeing happens outside the timed region.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("CPU_simple-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark the simple CPU matrix multiplication: runs mulMx <runsMatmul>
// times, records each wall-clock duration and returns the median (seconds).
// Takes ownership of <a> and <b>; both are destroyed on return.
// prinT != 0 prints operands and the last result for debugging.
double testCPUsimpMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = mulMx(a, b);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // BUGFIX: free intermediate result matrices (previously leaked once
        // per iteration); the last one is kept for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("CPU_simple-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the simple CPU 2x2 average pooling: runs avg2Pooling
// <runsAvgPool> times, records each wall-clock duration and returns the
// median (seconds). Takes ownership of <a>; it is destroyed on return.
// prinT != 0 prints the operand and the last result for debugging.
double testCPUsimpAvgPool(matrix a, int prinT){
    double* tmpMeasures = (double*) malloc(runsAvgPool*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsAvgPool; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = avg2Pooling(a);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // BUGFIX: free intermediate result matrices (previously leaked once
        // per iteration); the last one is kept for the optional printout.
        if(i < runsAvgPool - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("CPU_simple-AvgPool:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(a);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsAvgPool);
}
//TESTS-GPU
// Benchmark the simple GPU convolution kernel: runs conv2dCuda <runsConv>
// times and returns the median runtime (the kernel wrapper writes its own
// timing into tmpMeasures[i]). Takes ownership of <input> and <filter>,
// matching the CPU test functions (previously these were never destroyed).
// prinT != 0 prints operands and the last result for debugging.
double testGPUsimpConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_simple-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark the FFT-based GPU convolution: runs conv2dFFTCuda <runsConv>
// times and returns the median runtime (the kernel wrapper writes its own
// timing into tmpMeasures[i]). Takes ownership of <input> and <filter>,
// matching the CPU test functions (previously these were never destroyed).
// prinT != 0 prints operands and the last result for debugging.
double testGPUfftConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dFFTCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_fft-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark convolution expressed as a GPU matrix multiplication: builds the
// convolution matrix M from <filter>, reshapes <input> into a column vector,
// and runs matMulCuda <runsMatmul> times (the kernel wrapper writes its own
// timing into tmpMeasures[i]). Returns the median runtime. Takes ownership
// of <input> and <filter>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUmatmulConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    // Get Convolution matrix & remember the original shape for the reshape later
    matrix m = getConvMx(filter, input.sizeX, input.sizeY);
    int sizeX = input.sizeX;
    int sizeY = input.sizeY;
    // Reshape input to a 1-column vector for the MatMul
    input.sizeY = input.sizeX*input.sizeY;
    input.sizeX = 1;
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        dst = createMx(input.sizeY, m.sizeX);
        matMulCuda(m.head, input.head, dst.head, m.sizeX, m.sizeY, input.sizeX, input.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the reshape/printout below.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Restore input shape and reshape dst to the 2D convolution output shape
    input.sizeX = sizeX;
    input.sizeY = sizeY;
    dst.sizeX = input.sizeX-filter.sizeX+1;
    dst.sizeY = input.sizeY-filter.sizeY+1;
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_matmul-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Matrix M (Conv):\n");
        printMx(m);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
        /*
        //Reshape for VectorOutput
        input.sizeY = input.sizeX*input.sizeY;
        input.sizeX = 1;
        dst.sizeX = dst.sizeX*input.sizeY;
        dst.sizeY = 1;
        printf("Input as Vector:\n");
        printMx(input);
        printf("Result as Vector:\n");
        printMx(dst);
        */
    }
    // Clean Up (m was previously leaked as well)
    destroyMx(m);
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the Winograd-tiled GPU convolution: runs conv2dWinoCuda
// <runsConv> times and returns the median runtime (the kernel wrapper writes
// its own timing into tmpMeasures[i]). Only square 3x3 or 5x5 filters are
// supported (enforced by the assert). Takes ownership of <input> and
// <filter>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUwinoConv(matrix input, matrix filter, int prinT){
    assert((filter.sizeX == 3 || filter.sizeX == 5) && filter.sizeX == filter.sizeY);
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dWinoCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_Wino-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark the simple GPU matrix multiplication: runs matMulCuda
// <runsMatmul> times and returns the median runtime (the kernel wrapper
// writes its own timing into tmpMeasures[i]). Takes ownership of <a> and
// <b>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUsimpMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        dst = createMx(a.sizeY, b.sizeX);
        matMulCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_simple-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the transposed GPU matrix multiplication: runs
// matMulTransposeCuda <runsMatmul> times and returns the median runtime
// (the kernel wrapper writes its own timing into tmpMeasures[i]). <b> is
// expected already transposed by the caller. Takes ownership of <a> and
// <b>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUtransposeMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        dst = createMx(a.sizeY, b.sizeY);
        matMulTransposeCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_transposed-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the tiled GPU matrix multiplication: runs matMulTileCuda
// <runsMatmul> times and returns the median runtime (the kernel wrapper
// writes its own timing into tmpMeasures[i]). Takes ownership of <a> and
// <b>, matching the CPU test functions. (Original comment wrongly said
// "transposed" — this is the tiled variant.)
// prinT != 0 prints operands and the last result for debugging.
double testGPUtileMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        // NOTE(review): result width uses b.sizeY while testGPUsimpMatMul
        // uses b.sizeX; only square matrices are tested in test(), so this
        // is unverified — confirm against matMulTileCuda's contract.
        dst = createMx(a.sizeY, b.sizeY);
        matMulTileCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_tiled-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the simple GPU 2x2 average pooling: runs avgPoolCuda
// <runsAvgPool> times and returns the median runtime (the kernel wrapper
// writes its own timing into tmpMeasures[i]). Takes ownership of <a>,
// matching the CPU test functions (previously it was never destroyed).
// prinT != 0 prints the operand and the last result for debugging.
double testGPUsimpAvgPool(matrix a, int prinT){
    double* tmpMeasures = (double*) malloc(runsAvgPool*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsAvgPool; i++){
        dst = createMx(a.sizeX/2, a.sizeY/2);
        avgPoolCuda(dst.head, a.head, a.sizeX, a.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsAvgPool - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_simple-AveragePooling:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsAvgPool);
}
//MAIN Testcalls
// Main benchmark driver: sweeps problem sizes for convolution, matrix
// multiplication and average pooling, printing the median runtime of each
// CPU/GPU variant. Every test* call receives freshly generated random
// matrices (getrdmMx) and is responsible for their cleanup.
void test(){
// Convolution-Test --------------------------------------------------------------------------------------------------------
// MaxKernel 1024 per Block, Convoltuions launching different numbers of Kernels
/* CPU->"no limit";
* GPUsimple #El resultMx -> Limit Resultmatrix has size 32x32 or so -> input.x - filter.x + 1 < 32;
* GPUfft #El inputMx -> input has max size of 32x32;
* GPUmatmul #El resultMx -> Limit Resultmatrix has size 32x32 or so -> input.x - filter.x + 1 < 32;
* GPUwino #El (sizeF+1)*(sizeF+1)*2 -> is max 5 so max 72 Threads are launched, but 1 block per Tile -> "no limit"
*/
int inputSize = 32;
int maxFilterSize = 30;
// Sanity check on the size relation between input and largest filter.
// NOTE(review): with inputSize=32 and maxFilterSize=30 this holds (32 >= 29),
// but the intent of the -1 offset is unclear — confirm the bound is meant to
// keep input.x - filter.x + 1 positive for every filter size tested.
assert(inputSize >= maxFilterSize-1);
// Loop has a Fixed Input by size and a growing Filter form [min 3 to max <maxFilterSize>] with a step size of 2;
// (the loop condition is strict, so the largest filter actually tested is maxFilterSize-1)
for(int i = 3; i < maxFilterSize; i+=2){
// CPU simpleConvolution
double CPUsimpConv = testCPUsimpConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("CPU_SimpleConvolution Time for [%dx%d] Kernel: %g\n", i, i, CPUsimpConv);
// GPU simpleConvolution
double GPUsimpConv = testGPUsimpConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_SimpleConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUsimpConv);
// GPU fftConvolution
double GPUfftConv = testGPUfftConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_FFTConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUfftConv);
// GPU matmulConvolution
double GPUmatmulConv = testGPUmatmulConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_matmulConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUmatmulConv);
//GPU winograd
// Winograd variant only supports square 3x3 / 5x5 filters (asserted in
// testGPUwinoConv), so it is skipped for all other sizes.
double GPUwinoConv = (double) 0;
if(i == 3 || i == 5){
GPUwinoConv = testGPUwinoConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_WinoConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUwinoConv);
}
printf("\n");
//TODO: (Enroll)
}
// MatrixMultiplication-Test --------------------------------------------------------------------------------------------------------
// MaxKernel 1024 per Block, MatrixMultiplications launching different numbers of Kernels
/* CPU->"no limit";
* GPUsimple #El resultMx -> Limit Resultmatrix has size 32x32 -> so intput with a.y & b.x < 33
*
*/
int maxMatrixSizeMM = 33;
// Loop multiplies 2 Matrices, with the same size growing from [min 1 to max <maxMatrixSizeMM>] with a step size of 1, with each other;
for(int i = 1; i < maxMatrixSizeMM; i++){
// CPU simpleMatrixMultiplication
double CPUsimpMatMul = testCPUsimpMatMul(getrdmMx(i, i), getrdmMx(i,i), 0);
printf("CPU_SimpleMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, CPUsimpMatMul);
// GPU simpleMatrixMultiplication
double GPUsimpMatMul = testGPUsimpMatMul(getrdmMx(i, i), getrdmMx(i,i), 0);
printf("GPU_SimpleMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUsimpMatMul);
// GPU transposedMatrixMultiplication
// The second operand is pre-transposed on the CPU; only the multiply
// itself is timed inside testGPUtransposeMatMul.
double GPUtransposeMatMul = testGPUtransposeMatMul(getrdmMx(i, i), transpose(getrdmMx(i, i)), 0);
printf("GPU_TransposedMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUtransposeMatMul);
// GPU tiledMatrixMultiplication
double GPUtileMatMul = testGPUtileMatMul(getrdmMx(i, i), getrdmMx(i, i), 0);
printf("GPU_TiledMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUtileMatMul);
printf("\n");
//TODO: Tile, (opt. Tile, Enroll)
}
// AveragePooling-Test --------------------------------------------------------------------------------------------------------
// MaxKernel 1024 per Block, AveragePooling launching different numbers of Kernels
/* CPU->"no limit";
* GPUsimple -> #El resultMx -> Limit Resultmatrix has size 32x32 -> so max Input size is 64x64;
*/
int maxMatrixSizeAP = 33;
// Loop does AveragePooling on a Matrix growing form [min 2 to max <maxMatrixSizeAP>] with a step size of 2;
// (even sizes only, since the pooling halves each dimension)
for(int i = 2; i < maxMatrixSizeAP; i+=2){
// CPU simpleAvgPooling
double CPUsimpAvgPool = testCPUsimpAvgPool(getrdmMx(i, i), 0);
printf("CPU_SimpleAveragePooling Time for [%dx%d] Matrix: %g\n", i, i, CPUsimpAvgPool);
// GPU simpleAvgPooling
double GPUsimpAvgPool = testGPUsimpAvgPool(getrdmMx(i, i), 0);
printf("GPU_SimpleAveragePooling Time for [%dx%d] Matrix: %g\n", i, i, GPUsimpAvgPool);
printf("\n");
}
// ResteRampe (leftover / scratch area: unused kernel-wrapper prototypes kept for reference) ------------------------------------------------------------------------------
/*
cudaError_t matMulCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
cudaError_t matMulTransposeCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
cudaError_t matMulTileCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
cudaError_t conv2dEn5x5Cuda(float* mx, float* f, float* res, int sizeMxX, int sizeMxY, int sizeF);
*/
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment