Skip to content
Snippets Groups Projects
Commit 2796fcb2 authored by Gabriel Falk's avatar Gabriel Falk
Browse files

Upload New File

parent 2cb2c650
Branches
No related tags found
No related merge requests found
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "matrix.h"
#include "cudaKernels.h"
//#include "cudaLenet5.h"
//#include "tensor.h"
// Benchmark repetition counts. Each test function below runs its operation
// this many times and reports the median runtime; odd counts give a true
// (single-element) median after sorting.
// NOTE(review): <runs> is not referenced anywhere in this file — presumably
// used by another translation unit or leftover; confirm before removing.
int runs = 211;
// Gives number of how often Conv is repeated
int runsConv = 201;
// Gives number of how often MatMul is repeated
int runsMatmul = 501;
// Gives number of how often AvgPool is repeated
int runsAvgPool = 1001;
//SELECT MEDIAN VALUE (Methodes to select Median out of allocated Buffer) [Frees Buffer afterwards & does not Allocate];
//--------------------------------------------------
// qsort comparator for doubles: ascending order.
// Returns 1 / -1 / 0 instead of a subtraction so large magnitudes cannot
// overflow or lose the sign when truncated to int.
int cmpfunc (const void * a, const void * b){
    double da = *(const double*)a;
    double db = *(const double*)b;
    if (da > db)
        return 1;
    if (da < db)
        return -1;
    return 0;
}

// Return the median of the first <slot> entries of <array> and free the
// buffer (caller allocates, this function releases — see header comment).
// The buffer is sorted in place first; for an even <slot> the upper-middle
// element is returned. Guards against a NULL/empty buffer (previously an
// out-of-bounds read) by returning 0.0.
double med(double* array, int slot){
    if (array == NULL || slot <= 0) {
        free(array);   // free(NULL) is a well-defined no-op
        return 0.0;
    }
    qsort(array, slot, sizeof(double), cmpfunc);
    double median = array[slot/2];
    free(array);
    return median;
}
//--------------------------------------------------
/*
Schema: Method takes inputs & optional Printboolean <0->noOutput / 1->Output> for debugging.
Method allocates a Array with size <runs> where the Runtime of each run is stored.
then the Median of this is determined and returned.
Inputs: for Conv <Matrix & Filter>
for Matmul <Matrix & Matrix>
for AvgPool <Matrix>
*/
//TESTS-CPU
// Benchmark the simple CPU convolution: runs conv2d <runsConv> times,
// records each wall-clock duration and returns the median (seconds).
// Takes ownership of <input> and <filter>; both are destroyed on return.
// prinT != 0 prints operands and the last result for debugging.
double testCPUsimpConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = conv2d(input, filter);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // BUGFIX: dst was overwritten each iteration, leaking every
        // intermediate result matrix. Free all but the last (kept for the
        // optional printout below). Freeing happens outside the timed region.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("CPU_simple-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark the simple CPU matrix multiplication: runs mulMx <runsMatmul>
// times, records each wall-clock duration and returns the median (seconds).
// Takes ownership of <a> and <b>; both are destroyed on return.
// prinT != 0 prints operands and the last result for debugging.
double testCPUsimpMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = mulMx(a, b);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // BUGFIX: free intermediate result matrices (previously leaked once
        // per iteration); the last one is kept for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("CPU_simple-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the simple CPU 2x2 average pooling: runs avg2Pooling
// <runsAvgPool> times, records each wall-clock duration and returns the
// median (seconds). Takes ownership of <a>; it is destroyed on return.
// prinT != 0 prints the operand and the last result for debugging.
double testCPUsimpAvgPool(matrix a, int prinT){
    double* tmpMeasures = (double*) malloc(runsAvgPool*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsAvgPool; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = avg2Pooling(a);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // BUGFIX: free intermediate result matrices (previously leaked once
        // per iteration); the last one is kept for the optional printout.
        if(i < runsAvgPool - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("CPU_simple-AvgPool:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(a);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsAvgPool);
}
//TESTS-GPU
// Benchmark the simple GPU convolution kernel: runs conv2dCuda <runsConv>
// times and returns the median runtime (the kernel wrapper writes its own
// timing into tmpMeasures[i]). Takes ownership of <input> and <filter>,
// matching the CPU test functions (previously these were never destroyed).
// prinT != 0 prints operands and the last result for debugging.
double testGPUsimpConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_simple-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark the FFT-based GPU convolution: runs conv2dFFTCuda <runsConv>
// times and returns the median runtime (the kernel wrapper writes its own
// timing into tmpMeasures[i]). Takes ownership of <input> and <filter>,
// matching the CPU test functions (previously these were never destroyed).
// prinT != 0 prints operands and the last result for debugging.
double testGPUfftConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dFFTCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_fft-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark convolution expressed as a GPU matrix multiplication: builds the
// convolution matrix M from <filter>, reshapes <input> into a column vector,
// and runs matMulCuda <runsMatmul> times (the kernel wrapper writes its own
// timing into tmpMeasures[i]). Returns the median runtime. Takes ownership
// of <input> and <filter>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUmatmulConv(matrix input, matrix filter, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    // Get Convolution matrix & remember the original shape for the reshape later
    matrix m = getConvMx(filter, input.sizeX, input.sizeY);
    int sizeX = input.sizeX;
    int sizeY = input.sizeY;
    // Reshape input to a 1-column vector for the MatMul
    input.sizeY = input.sizeX*input.sizeY;
    input.sizeX = 1;
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        dst = createMx(input.sizeY, m.sizeX);
        matMulCuda(m.head, input.head, dst.head, m.sizeX, m.sizeY, input.sizeX, input.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the reshape/printout below.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Restore input shape and reshape dst to the 2D convolution output shape
    input.sizeX = sizeX;
    input.sizeY = sizeY;
    dst.sizeX = input.sizeX-filter.sizeX+1;
    dst.sizeY = input.sizeY-filter.sizeY+1;
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_matmul-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Matrix M (Conv):\n");
        printMx(m);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
        /*
        //Reshape for VectorOutput
        input.sizeY = input.sizeX*input.sizeY;
        input.sizeX = 1;
        dst.sizeX = dst.sizeX*input.sizeY;
        dst.sizeY = 1;
        printf("Input as Vector:\n");
        printMx(input);
        printf("Result as Vector:\n");
        printMx(dst);
        */
    }
    // Clean Up (m was previously leaked as well)
    destroyMx(m);
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the Winograd-tiled GPU convolution: runs conv2dWinoCuda
// <runsConv> times and returns the median runtime (the kernel wrapper writes
// its own timing into tmpMeasures[i]). Only square 3x3 or 5x5 filters are
// supported (enforced by the assert). Takes ownership of <input> and
// <filter>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUwinoConv(matrix input, matrix filter, int prinT){
    assert((filter.sizeX == 3 || filter.sizeX == 5) && filter.sizeX == filter.sizeY);
    double* tmpMeasures = (double*) malloc(runsConv*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsConv; i++){
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dWinoCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsConv - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_Wino-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
// Benchmark the simple GPU matrix multiplication: runs matMulCuda
// <runsMatmul> times and returns the median runtime (the kernel wrapper
// writes its own timing into tmpMeasures[i]). Takes ownership of <a> and
// <b>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUsimpMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        dst = createMx(a.sizeY, b.sizeX);
        matMulCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_simple-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the transposed GPU matrix multiplication: runs
// matMulTransposeCuda <runsMatmul> times and returns the median runtime
// (the kernel wrapper writes its own timing into tmpMeasures[i]). <b> is
// expected already transposed by the caller. Takes ownership of <a> and
// <b>, matching the CPU test functions.
// prinT != 0 prints operands and the last result for debugging.
double testGPUtransposeMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        dst = createMx(a.sizeY, b.sizeY);
        matMulTransposeCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_transposed-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the tiled GPU matrix multiplication: runs matMulTileCuda
// <runsMatmul> times and returns the median runtime (the kernel wrapper
// writes its own timing into tmpMeasures[i]). Takes ownership of <a> and
// <b>, matching the CPU test functions. (Original comment wrongly said
// "transposed" — this is the tiled variant.)
// prinT != 0 prints operands and the last result for debugging.
double testGPUtileMatMul(matrix a, matrix b, int prinT){
    double* tmpMeasures = (double*) malloc(runsMatmul*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsMatmul; i++){
        // NOTE(review): result width uses b.sizeY while testGPUsimpMatMul
        // uses b.sizeX; only square matrices are tested in test(), so this
        // is unverified — confirm against matMulTileCuda's contract.
        dst = createMx(a.sizeY, b.sizeY);
        matMulTileCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsMatmul - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        // BUGFIX: header was mislabeled "GPU_fft-Convolution" (copy-paste)
        printf("GPU_tiled-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
// Benchmark the simple GPU 2x2 average pooling: runs avgPoolCuda
// <runsAvgPool> times and returns the median runtime (the kernel wrapper
// writes its own timing into tmpMeasures[i]). Takes ownership of <a>,
// matching the CPU test functions (previously it was never destroyed).
// prinT != 0 prints the operand and the last result for debugging.
double testGPUsimpAvgPool(matrix a, int prinT){
    double* tmpMeasures = (double*) malloc(runsAvgPool*sizeof(double));
    matrix dst;
    for(int i = 0; i < runsAvgPool; i++){
        dst = createMx(a.sizeX/2, a.sizeY/2);
        avgPoolCuda(dst.head, a.head, a.sizeX, a.sizeY, &tmpMeasures[i]);
        // BUGFIX: a fresh dst was created each iteration and never freed;
        // keep only the last one for the optional printout.
        if(i < runsAvgPool - 1)
            destroyMx(dst);
    }
    // Print Test-Case
    if(prinT){
        printf("GPU_simple-AveragePooling:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (consistent with the CPU tests)
    destroyMx(a);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsAvgPool);
}
//MAIN Testcalls
// Main benchmark driver: sweeps problem sizes for convolution, matrix
// multiplication and average pooling, printing the median runtime of each
// CPU/GPU variant. Every test* call receives freshly generated random
// matrices (getrdmMx) and is responsible for their cleanup.
void test(){
// Convolution-Test --------------------------------------------------------------------------------------------------------
// MaxKernel 1024 per Block, Convoltuions launching different numbers of Kernels
/* CPU->"no limit";
* GPUsimple #El resultMx -> Limit Resultmatrix has size 32x32 or so -> input.x - filter.x + 1 < 32;
* GPUfft #El inputMx -> input has max size of 32x32;
* GPUmatmul #El resultMx -> Limit Resultmatrix has size 32x32 or so -> input.x - filter.x + 1 < 32;
* GPUwino #El (sizeF+1)*(sizeF+1)*2 -> is max 5 so max 72 Threads are launched, but 1 block per Tile -> "no limit"
*/
int inputSize = 32;
int maxFilterSize = 30;
// Sanity check on the size relation between input and largest filter.
// NOTE(review): with inputSize=32 and maxFilterSize=30 this holds (32 >= 29),
// but the intent of the -1 offset is unclear — confirm the bound is meant to
// keep input.x - filter.x + 1 positive for every filter size tested.
assert(inputSize >= maxFilterSize-1);
// Loop has a Fixed Input by size and a growing Filter form [min 3 to max <maxFilterSize>] with a step size of 2;
// (the loop condition is strict, so the largest filter actually tested is maxFilterSize-1)
for(int i = 3; i < maxFilterSize; i+=2){
// CPU simpleConvolution
double CPUsimpConv = testCPUsimpConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("CPU_SimpleConvolution Time for [%dx%d] Kernel: %g\n", i, i, CPUsimpConv);
// GPU simpleConvolution
double GPUsimpConv = testGPUsimpConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_SimpleConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUsimpConv);
// GPU fftConvolution
double GPUfftConv = testGPUfftConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_FFTConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUfftConv);
// GPU matmulConvolution
double GPUmatmulConv = testGPUmatmulConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_matmulConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUmatmulConv);
//GPU winograd
// Winograd variant only supports square 3x3 / 5x5 filters (asserted in
// testGPUwinoConv), so it is skipped for all other sizes.
double GPUwinoConv = (double) 0;
if(i == 3 || i == 5){
GPUwinoConv = testGPUwinoConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
printf("GPU_WinoConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUwinoConv);
}
printf("\n");
//TODO: (Enroll)
}
// MatrixMultiplication-Test --------------------------------------------------------------------------------------------------------
// MaxKernel 1024 per Block, MatrixMultiplications launching different numbers of Kernels
/* CPU->"no limit";
* GPUsimple #El resultMx -> Limit Resultmatrix has size 32x32 -> so intput with a.y & b.x < 33
*
*/
int maxMatrixSizeMM = 33;
// Loop multiplies 2 Matrices, with the same size growing from [min 1 to max <maxMatrixSizeMM>] with a step size of 1, with each other;
for(int i = 1; i < maxMatrixSizeMM; i++){
// CPU simpleMatrixMultiplication
double CPUsimpMatMul = testCPUsimpMatMul(getrdmMx(i, i), getrdmMx(i,i), 0);
printf("CPU_SimpleMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, CPUsimpMatMul);
// GPU simpleMatrixMultiplication
double GPUsimpMatMul = testGPUsimpMatMul(getrdmMx(i, i), getrdmMx(i,i), 0);
printf("GPU_SimpleMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUsimpMatMul);
// GPU transposedMatrixMultiplication
// The second operand is pre-transposed on the CPU; only the multiply
// itself is timed inside testGPUtransposeMatMul.
double GPUtransposeMatMul = testGPUtransposeMatMul(getrdmMx(i, i), transpose(getrdmMx(i, i)), 0);
printf("GPU_TransposedMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUtransposeMatMul);
// GPU tiledMatrixMultiplication
double GPUtileMatMul = testGPUtileMatMul(getrdmMx(i, i), getrdmMx(i, i), 0);
printf("GPU_TiledMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUtileMatMul);
printf("\n");
//TODO: Tile, (opt. Tile, Enroll)
}
// AveragePooling-Test --------------------------------------------------------------------------------------------------------
// MaxKernel 1024 per Block, AveragePooling launching different numbers of Kernels
/* CPU->"no limit";
* GPUsimple -> #El resultMx -> Limit Resultmatrix has size 32x32 -> so max Input size is 64x64;
*/
int maxMatrixSizeAP = 33;
// Loop does AveragePooling on a Matrix growing form [min 2 to max <maxMatrixSizeAP>] with a step size of 2;
// (even sizes only, since the pooling halves each dimension)
for(int i = 2; i < maxMatrixSizeAP; i+=2){
// CPU simpleAvgPooling
double CPUsimpAvgPool = testCPUsimpAvgPool(getrdmMx(i, i), 0);
printf("CPU_SimpleAveragePooling Time for [%dx%d] Matrix: %g\n", i, i, CPUsimpAvgPool);
// GPU simpleAvgPooling
double GPUsimpAvgPool = testGPUsimpAvgPool(getrdmMx(i, i), 0);
printf("GPU_SimpleAveragePooling Time for [%dx%d] Matrix: %g\n", i, i, GPUsimpAvgPool);
printf("\n");
}
// ResteRampe (leftover / scratch area: unused kernel-wrapper prototypes kept for reference) ------------------------------------------------------------------------------
/*
cudaError_t matMulCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
cudaError_t matMulTransposeCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
cudaError_t matMulTileCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
cudaError_t conv2dEn5x5Cuda(float* mx, float* f, float* res, int sizeMxX, int sizeMxY, int sizeF);
*/
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment