Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
Deeplearning
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
RADL
Deeplearning
Commits
2796fcb2
Commit
2796fcb2
authored
Mar 18, 2023
by
Gabriel Falk
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
2cb2c650
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
Main/test.cpp
+491
-0
491 additions, 0 deletions
Main/test.cpp
with
491 additions
and
0 deletions
Main/test.cpp
0 → 100644
+
491
−
0
View file @
2796fcb2
#include
<stdio.h>
#include
<stdlib.h>
#include
<chrono>
#include
<iostream>
#include
<math.h>
#include
<assert.h>
#include
<cuda_runtime.h>
#include
"matrix.h"
#include
"cudaKernels.h"
//#include "cudaLenet5.h"
//#include "tensor.h"
int
runs
=
211
;
// Gives number of how often Conv is repeated
int
runsConv
=
201
;
// Gives number of how often MatMul is repeated
int
runsMatmul
=
501
;
// Gives number of how often AvgPool is repeated
int
runsAvgPool
=
1001
;
//SELECT MEDIAN VALUE (Methodes to select Median out of allocated Buffer) [Frees Buffer afterwards & does not Allocate];
//--------------------------------------------------
// qsort comparator for ascending double order.
// Returns 1 if *a > *b, -1 if *a < *b, 0 otherwise.
int cmpfunc(const void* a, const void* b){
    const double lhs = *(const double*)a;
    const double rhs = *(const double*)b;
    if (lhs > rhs) return 1;
    if (lhs < rhs) return -1;
    return 0;
}
// Sorts the measurement buffer ascending and returns its middle element
// (the exact median for the odd-sized buffers used by the tests).
// Takes ownership of <array>: the buffer is freed before returning.
double med(double* array, int slot){
    qsort(array, slot, sizeof(double), cmpfunc);
    const double median = array[slot / 2];
    free(array);
    return median;
}
//--------------------------------------------------
/*
Schema: Method takes inputs & optional Printboolean <0->noOutput / 1->Output> for debugging.
Method allocates a Array with size <runs> where the Runtime of each run is stored.
then the Median of this is determined and returned.
Inputs: for Conv <Matrix & Filter>
for Matmul <Matrix & Matrix>
for AvgPool <Matrix>
*/
//TESTS-CPU
/*
 * Benchmarks the simple CPU convolution (conv2d) over <runsConv> runs.
 * input:  matrix to convolve (consumed: destroyed before returning)
 * filter: convolution kernel (consumed)
 * prinT:  non-zero -> print filter, input and result for debugging
 * Returns the median runtime in seconds.
 */
double testCPUsimpConv(matrix input, matrix filter, int prinT){ // Test a simple CPU Convolution
    double* tmpMeasures = (double*)malloc(runsConv * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsConv; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = conv2d(input, filter);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // FIX: free the intermediate result — previously every run but the
        // last leaked one result matrix. Keep the final one for printing.
        if (i + 1 < runsConv) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("CPU_simple-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
/*
 * Benchmarks the simple CPU matrix multiplication (mulMx) over <runsMatmul> runs.
 * a, b:  operand matrices (consumed: destroyed before returning)
 * prinT: non-zero -> print operands and result for debugging
 * Returns the median runtime in seconds.
 */
double testCPUsimpMatMul(matrix a, matrix b, int prinT){ // Test a simple CPU MatMul
    double* tmpMeasures = (double*)malloc(runsMatmul * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsMatmul; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = mulMx(a, b);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // FIX: free the intermediate result — previously every run but the
        // last leaked one result matrix. Keep the final one for printing.
        if (i + 1 < runsMatmul) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("CPU_simple-MatrixMultiplication:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
/*
 * Benchmarks the simple CPU 2x2 average pooling (avg2Pooling) over
 * <runsAvgPool> runs.
 * a:     input matrix (consumed: destroyed before returning)
 * prinT: non-zero -> print input and result for debugging
 * Returns the median runtime in seconds.
 */
double testCPUsimpAvgPool(matrix a, int prinT){ // Test a simple CPU AvgPool
    double* tmpMeasures = (double*)malloc(runsAvgPool * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsAvgPool; i++){
        // Time Measure START
        auto start = std::chrono::high_resolution_clock::now();
        dst = avg2Pooling(a);
        // Time Measure STOP
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // Save Measurement
        tmpMeasures[i] = elapsed.count();
        // FIX: free the intermediate result — previously every run but the
        // last leaked one result matrix. Keep the final one for printing.
        if (i + 1 < runsAvgPool) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("CPU_simple-AvgPool:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up
    destroyMx(a);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsAvgPool);
}
//TESTS-GPU
/*
 * Benchmarks the simple parallel GPU convolution kernel over <runsConv> runs.
 * The kernel writes its own measured runtime into tmpMeasures[i].
 * input/filter are consumed (destroyed before returning) — FIX: they were
 * previously leaked, unlike in the CPU test variants.
 * Returns the median runtime.
 */
double testGPUsimpConv(matrix input, matrix filter, int prinT){ // Test a simple parallel GPU Convolution
    double* tmpMeasures = (double*)malloc(runsConv * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsConv; i++){
        // Valid-convolution output size: input - filter + 1 per dimension.
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsConv) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("GPU_simple-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
/*
 * Benchmarks the FFT-based parallel GPU convolution kernel over <runsConv>
 * runs. The kernel writes its own measured runtime into tmpMeasures[i].
 * input/filter are consumed (destroyed before returning) — FIX: they were
 * previously leaked, unlike in the CPU test variants.
 * Returns the median runtime.
 */
double testGPUfftConv(matrix input, matrix filter, int prinT){ // Test a fft parallel GPU Convolution
    double* tmpMeasures = (double*)malloc(runsConv * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsConv; i++){
        // Valid-convolution output size: input - filter + 1 per dimension.
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dFFTCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsConv) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("GPU_fft-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
/*
 * Benchmarks convolution-as-matrix-multiplication on the GPU over
 * <runsMatmul> runs: builds the convolution matrix M from the filter
 * (getConvMx), flattens the input to a vector, and multiplies with
 * matMulCuda, which writes its measured runtime into tmpMeasures[i].
 * input/filter are consumed (destroyed before returning) — FIX: they and
 * the helper matrix M were previously leaked.
 * Returns the median runtime.
 */
double testGPUmatmulConv(matrix input, matrix filter, int prinT){ // Test a matmul GPU Convolution
    double* tmpMeasures = (double*)malloc(runsMatmul * sizeof(double));
    // Get Convolution matrix & remember dims for the Reshape later
    matrix m = getConvMx(filter, input.sizeX, input.sizeY);
    int sizeX = input.sizeX;
    int sizeY = input.sizeY;
    // Reshape input to a column vector for MatMul
    input.sizeY = input.sizeX * input.sizeY;
    input.sizeX = 1;
    matrix dst;
    for (int i = 0; i < runsMatmul; i++){
        dst = createMx(input.sizeY, m.sizeX);
        matMulCuda(m.head, input.head, dst.head, m.sizeX, m.sizeY, input.sizeX, input.sizeY, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsMatmul) destroyMx(dst);
    }
    // Reshape input back & give dst its 2-D convolution-result shape
    input.sizeX = sizeX;
    input.sizeY = sizeY;
    dst.sizeX = input.sizeX - filter.sizeX + 1;
    dst.sizeY = input.sizeY - filter.sizeY + 1;
    // Print Test-Case
    if (prinT){
        printf("GPU_matmul-Convolution:\n"); // FIX: header previously said "GPU_fft-Convolution" (copy-paste)
        printf("Filter:\n");
        printMx(filter);
        printf("Matrix M (Conv):\n");
        printMx(m);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
        /*
        //Reshape for VectorOutput
        input.sizeY = input.sizeX*input.sizeY;
        input.sizeX = 1;
        dst.sizeX = dst.sizeX*input.sizeY;
        dst.sizeY = 1;
        printf("Input as Vector:\n");
        printMx(input);
        printf("Result as Vektor:\n");
        printMx(dst);
        */
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(m);
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
/*
 * Benchmarks the Winograd-tiled parallel GPU convolution over <runsConv>
 * runs. Only square 3x3 or 5x5 filters are supported (asserted).
 * The kernel writes its own measured runtime into tmpMeasures[i].
 * input/filter are consumed (destroyed before returning) — FIX: they were
 * previously leaked, unlike in the CPU test variants.
 * Returns the median runtime.
 */
double testGPUwinoConv(matrix input, matrix filter, int prinT){ // Test a Winograd-Tiled parallel GPU Convolution
    assert((filter.sizeX == 3 || filter.sizeX == 5) && filter.sizeX == filter.sizeY);
    double* tmpMeasures = (double*)malloc(runsConv * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsConv; i++){
        // Valid-convolution output size: input - filter + 1 per dimension.
        dst = createMx(input.sizeX - filter.sizeX + 1, input.sizeY - filter.sizeY + 1);
        conv2dWinoCuda(input.head, filter.head, dst.head, input.sizeX, input.sizeY, filter.sizeX, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsConv) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("GPU_Wino-Convolution:\n");
        printf("Filter:\n");
        printMx(filter);
        printf("Input:\n");
        printMx(input);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(input);
    destroyMx(filter);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsConv);
}
/*
 * Benchmarks the simple parallel GPU matrix multiplication over
 * <runsMatmul> runs. matMulCuda writes its measured runtime into
 * tmpMeasures[i].
 * a/b are consumed (destroyed before returning) — FIX: they were
 * previously leaked, unlike in the CPU test variants.
 * Returns the median runtime.
 */
double testGPUsimpMatMul(matrix a, matrix b, int prinT){ // Test a simple parallel GPU Matrix Multiplication
    double* tmpMeasures = (double*)malloc(runsMatmul * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsMatmul; i++){
        dst = createMx(a.sizeY, b.sizeX);
        matMulCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsMatmul) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("GPU_simple-MatrixMultiplication:\n"); // FIX: header previously said "GPU_fft-Convolution" (copy-paste)
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
/*
 * Benchmarks the transposed parallel GPU matrix multiplication over
 * <runsMatmul> runs (b is expected pre-transposed by the caller).
 * matMulTransposeCuda writes its measured runtime into tmpMeasures[i].
 * a/b are consumed (destroyed before returning) — FIX: they were
 * previously leaked, unlike in the CPU test variants.
 * Returns the median runtime.
 */
double testGPUtransposeMatMul(matrix a, matrix b, int prinT){ // Test a transposed parallel GPU Matrix Multiplication
    double* tmpMeasures = (double*)malloc(runsMatmul * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsMatmul; i++){
        dst = createMx(a.sizeY, b.sizeY);
        matMulTransposeCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsMatmul) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("GPU_transpose-MatrixMultiplication:\n"); // FIX: header previously said "GPU_fft-Convolution" (copy-paste)
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
/*
 * Benchmarks the tiled parallel GPU matrix multiplication over
 * <runsMatmul> runs. matMulTileCuda writes its measured runtime into
 * tmpMeasures[i].
 * a/b are consumed (destroyed before returning) — FIX: they were
 * previously leaked, unlike in the CPU test variants.
 * Returns the median runtime.
 */
double testGPUtileMatMul(matrix a, matrix b, int prinT){ // Test a tiled parallel GPU Matrix Multiplication (FIX: comment said "transposed")
    double* tmpMeasures = (double*)malloc(runsMatmul * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsMatmul; i++){
        dst = createMx(a.sizeY, b.sizeY);
        matMulTileCuda(a.head, b.head, dst.head, a.sizeX, a.sizeY, b.sizeX, b.sizeY, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsMatmul) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("GPU_tile-MatrixMultiplication:\n"); // FIX: header previously said "GPU_fft-Convolution" (copy-paste)
        printf("Matrix A:\n");
        printMx(a);
        printf("Matrix B:\n");
        printMx(b);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(a);
    destroyMx(b);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsMatmul);
}
/*
 * Benchmarks the simple parallel GPU 2x2 average pooling over
 * <runsAvgPool> runs. avgPoolCuda writes its measured runtime into
 * tmpMeasures[i].
 * a is consumed (destroyed before returning) — FIX: it was previously
 * leaked, unlike in the CPU test variant.
 * Returns the median runtime.
 */
double testGPUsimpAvgPool(matrix a, int prinT){ // Test a simple parallel GPU AveragePooling
    double* tmpMeasures = (double*)malloc(runsAvgPool * sizeof(double));
    matrix dst;
    for (int i = 0; i < runsAvgPool; i++){
        // 2x2 pooling halves each dimension.
        dst = createMx(a.sizeX / 2, a.sizeY / 2);
        avgPoolCuda(dst.head, a.head, a.sizeX, a.sizeY, &tmpMeasures[i]);
        // FIX: previously every iteration leaked its freshly created result.
        if (i + 1 < runsAvgPool) destroyMx(dst);
    }
    // Print Test-Case
    if (prinT){
        printf("GPU_simple-AveragePooling:\n");
        printf("Matrix A:\n");
        printMx(a);
        printf("Result:\n");
        printMx(dst);
    }
    // Clean Up (FIX: was missing entirely)
    destroyMx(a);
    destroyMx(dst);
    // Return Median (med() frees tmpMeasures)
    return med(tmpMeasures, runsAvgPool);
}
//MAIN Testcalls
// Top-level benchmark driver: sweeps convolution, matrix multiplication
// and average pooling over growing problem sizes and prints the median
// runtime of each CPU/GPU variant. All inputs are fresh random matrices
// from getrdmMx, so each test call owns its arguments.
void test(){
    // Convolution-Test --------------------------------------------------------------------------------------------------------
    // MaxKernel 1024 per Block, Convolutions launching different numbers of Kernels
    /* CPU->"no limit";
    * GPUsimple #El resultMx -> Limit Resultmatrix has size 32x32 or so -> input.x - filter.x + 1 < 32;
    * GPUfft #El inputMx -> input has max size of 32x32;
    * GPUmatmul #El resultMx -> Limit Resultmatrix has size 32x32 or so -> input.x - filter.x + 1 < 32;
    * GPUwino #El (sizeF+1)*(sizeF+1)*2 -> is max 5 so max 72 Threads are launched, but 1 block per Tile -> "no limit"
    */
    int inputSize = 32;
    int maxFilterSize = 30;
    // Guard the per-kernel thread limits described above.
    assert(inputSize >= maxFilterSize - 1);
    // Loop has a fixed input size and a growing filter from [min 3 to max <maxFilterSize>] with a step size of 2;
    for (int i = 3; i < maxFilterSize; i += 2){
        // CPU simpleConvolution
        double CPUsimpConv = testCPUsimpConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
        printf("CPU_SimpleConvolution Time for [%dx%d] Kernel: %g\n", i, i, CPUsimpConv);
        // GPU simpleConvolution
        double GPUsimpConv = testGPUsimpConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
        printf("GPU_SimpleConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUsimpConv);
        // GPU fftConvolution
        double GPUfftConv = testGPUfftConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
        printf("GPU_FFTConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUfftConv);
        // GPU matmulConvolution
        double GPUmatmulConv = testGPUmatmulConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
        printf("GPU_matmulConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUmatmulConv);
        // GPU winograd: only defined for square 3x3 / 5x5 filters
        double GPUwinoConv = (double)0;
        if (i == 3 || i == 5){
            GPUwinoConv = testGPUwinoConv(getrdmMx(inputSize, inputSize), getrdmMx(i, i), 0);
            printf("GPU_WinoConvolution Time for [%dx%d] Kernel: %g\n", i, i, GPUwinoConv);
        }
        printf("\n");
        //TODO: (Enroll)
    }
    // MatrixMultiplication-Test --------------------------------------------------------------------------------------------------------
    // MaxKernel 1024 per Block, MatrixMultiplications launching different numbers of Kernels
    /* CPU->"no limit";
    * GPUsimple #El resultMx -> Limit Resultmatrix has size 32x32 -> so intput with a.y & b.x < 33
    *
    */
    int maxMatrixSizeMM = 33;
    // Loop multiplies 2 matrices, with the same size growing from [min 1 to max <maxMatrixSizeMM>] with a step size of 1, with each other;
    for (int i = 1; i < maxMatrixSizeMM; i++){
        // CPU simpleMatrixMultiplication
        double CPUsimpMatMul = testCPUsimpMatMul(getrdmMx(i, i), getrdmMx(i, i), 0);
        printf("CPU_SimpleMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, CPUsimpMatMul);
        // GPU simpleMatrixMultiplication
        double GPUsimpMatMul = testGPUsimpMatMul(getrdmMx(i, i), getrdmMx(i, i), 0);
        printf("GPU_SimpleMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUsimpMatMul);
        // GPU transposedMatrixMultiplication (second operand pre-transposed)
        double GPUtransposeMatMul = testGPUtransposeMatMul(getrdmMx(i, i), transpose(getrdmMx(i, i)), 0);
        printf("GPU_TransposedMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUtransposeMatMul);
        // GPU tiledMatrixMultiplication
        double GPUtileMatMul = testGPUtileMatMul(getrdmMx(i, i), getrdmMx(i, i), 0);
        printf("GPU_TiledMatrixMultiplication Time for [%dx%d]*[%dx%d] Matrices: %g\n", i, i, i, i, GPUtileMatMul);
        printf("\n");
        //TODO: Tile, (opt. Tile, Enroll)
    }
    // AveragePooling-Test --------------------------------------------------------------------------------------------------------
    // MaxKernel 1024 per Block, AveragePooling launching different numbers of Kernels
    /* CPU->"no limit";
    * GPUsimple -> #El resultMx -> Limit Resultmatrix has size 32x32 -> so max Input size is 64x64;
    */
    int maxMatrixSizeAP = 33;
    // Loop does AveragePooling on a matrix growing from [min 2 to max <maxMatrixSizeAP>] with a step size of 2;
    for (int i = 2; i < maxMatrixSizeAP; i += 2){
        // CPU simpleAvgPooling
        double CPUsimpAvgPool = testCPUsimpAvgPool(getrdmMx(i, i), 0);
        printf("CPU_SimpleAveragePooling Time for [%dx%d] Matrix: %g\n", i, i, CPUsimpAvgPool);
        // GPU simpleAvgPooling
        double GPUsimpAvgPool = testGPUsimpAvgPool(getrdmMx(i, i), 0);
        printf("GPU_SimpleAveragePooling Time for [%dx%d] Matrix: %g\n", i, i, GPUsimpAvgPool);
        printf("\n");
    }
    // ResteRampe (kernel prototypes kept for reference) -----------------------------------------------------------------------------
    /*
    cudaError_t matMulCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
    cudaError_t matMulTransposeCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
    cudaError_t matMulTileCuda(float* a, float* b, float* c, int sizeAx, int sizeAy, int sizeBx, int sizeBy);
    cudaError_t conv2dEn5x5Cuda(float* mx, float* f, float* res, int sizeMxX, int sizeMxY, int sizeF);
    */
}
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment