diff --git a/src/cudaKernels.cu b/src/cudaKernels.cu
index cdf1496cab39d65f794b5220b56269af88c53b65..f9c81dd8f745f6fcde8d352417dec5fbdf04df21 100644
--- a/src/cudaKernels.cu
+++ b/src/cudaKernels.cu
@@ -7,7 +7,10 @@
 #include <stdio.h>
 #include "tensor.h"
 
-#define FLT_MAX 3.402823466e+38F;
+
+#define BLOCK_SIZE 32
+#define FLT_MAX 3.402823466e+38F
+#define SHMEM_SIZE BLOCK_SIZE * BLOCK_SIZE * sizeof(float)
 
 // Allocates Memory on Gpu and Copys Load from Host
 cudaError_t loadToGpu(void** dstGpu, float* srcHost, int size){
@@ -80,13 +83,13 @@ __global__ void maxPoolKernel(float* dst, float* src, int input_rows, int input_
 
 
 // TODO: make add a seperate function
-__global__ void matmulKernel(float* A, float* B, float* C, float* bias, int M, int N, int K)
+__global__ void matmulKernel(float* A, float* B, float* C, int M, int N, int K)
 {
     int row = blockIdx.y * blockDim.y + threadIdx.y;
     int col = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (row < M && col < K) {
-        float sum = bias[col];
+        float sum = 0;
         for (int n = 0; n < N; ++n)
         {
             sum += A[row*N + n] * B[n*K + col];
@@ -97,6 +100,54 @@
 }
 
 
+__global__ void matmulSharedKernel(float* A, float* B, float* C, int M, int N, int K) {
+    // tile size == thread block size
+    __shared__ float AS[SHMEM_SIZE];
+    __shared__ float BS[SHMEM_SIZE];
+
+    int blockRow = blockIdx.y;
+    int blockCol = blockIdx.x;
+
+    int row = threadIdx.y;
+    int col = threadIdx.x;
+
+    float tmp = 0;
+    for (int m = 0; m < (N+BLOCK_SIZE-1)/BLOCK_SIZE; m++) {
+        AS[(row*BLOCK_SIZE) + col] = A[(blockRow+row)*M + (m*BLOCK_SIZE+col)];
+        BS[(row*BLOCK_SIZE) + col] = B[((m*BLOCK_SIZE+row)*N) + blockCol+col];
+
+        __syncthreads();
+
+        for (int e = 0; e < BLOCK_SIZE; e++) {
+            tmp += AS[(row*BLOCK_SIZE) + e] * BS[(e*BLOCK_SIZE) + col];
+        }
+
+        __syncthreads();
+
+    }
+    C[(row*M) + col] = tmp;
+}
+
+
+/*
+__global__ void matrixMult (__half2* A, __half2* B, __half2* C, __half2* bias, int M, int N, int K) {
+    int k = 0;
+    half* a_half = (half*)a;
+    half2 sum = __float2half2_rn(0.0);
+    int col = threadIdx.x + blockDim.x * blockIdx.x;
+    int row = threadIdx.y + blockDim.y * blockIdx.y;
+    if(col < width/2 && row < width) {
+        for (k = 0; k < width; k++){
+            //__half2 a_temp = __half2half2(a_half[row * width + k]);
+            //~ sum += a[row * width + k] * b[k * width + col];
+            sum += __half2half2(a_half[row * width + k]) * b[k * width/2 + col];
+
+        }
+        c[row * width/2 + col] = sum;
+    }
+}
+*/
+
 __global__ void conv2dKernel(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters) {
     int batch = blockIdx.x;
 
@@ -143,12 +194,31 @@ __global__ void reluKernel(float* input, int size) {
 
 
 // Multiplies Matrix A x B -> C with one Thread for each element in C (With allocating Memory first)
-void matMulCuda(float* A, float* B, float* C, float* Bias, int M, int N, int K){
-    matmulKernel<<<M, N>>>(
-        A, B, C, Bias, M, N, K
+void matMulCuda(float* A, float* B, float* C, int M, int N, int K){
+    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
+    dim3 gridDim((BLOCK_SIZE + K - 1)/BLOCK_SIZE, (BLOCK_SIZE + M - 1)/BLOCK_SIZE, 1);
+
+    // printf("Grid : {%d, %d, %d} blocks. \nBlocks : {%d, %d, %d} threads.\n",
+    //        gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z);
+    matmulKernel<<<gridDim, blockDim>>>(
+        A, B, C, M, N, K
     );
 }
 
+void matmulSharedCuda(float* A, float* B, float* C, int M, int N, int K) {
+    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
+    dim3 gridDim((BLOCK_SIZE + K - 1)/BLOCK_SIZE, (BLOCK_SIZE + M - 1)/BLOCK_SIZE, 1);
+    matmulSharedKernel<<<gridDim, blockDim>>>(
+        A, B, C, M, N, K
+    );
+    cudaError_t cudaStatus = cudaGetLastError();
+    if (cudaStatus != cudaSuccess) {
+        std::cerr << "Error: " << cudaGetErrorString(cudaStatus) << std::endl;
+        abort();
+    }
+
+}
+
 void conv2dCuda(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters, int height_new, int width_new) {
     int batch_size = 1;
 
@@ -158,9 +228,9 @@ void conv2dCuda(float* input, float* weights, float* bias, float* output, int in
 }
 
 void reluCuda(float* input, int size) {
-    int blockSize = 32;
-    int numBlocks = (size + blockSize - 1) / blockSize;
-    reluKernel<<<numBlocks, blockSize>>>(input, size);
+    //int blockSize = 32;
+    int numBlocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    reluKernel<<<numBlocks, BLOCK_SIZE>>>(input, size);
 }
 
 void maxPoolCuda(float* dst, float* src, int input_rows, int input_cols, int output_rows, int output_cols, int channels) {
diff --git a/src/cudaKernels.cuh b/src/cudaKernels.cuh
index 9c2a9678a580067cf58e9e5c03089fd55e52aa8e..f99b0cebfe2f85a213135d3bd4d9e57ba942453f 100644
--- a/src/cudaKernels.cuh
+++ b/src/cudaKernels.cuh
@@ -7,11 +7,12 @@ cudaError_t loadFromGpu(float* dstHost, float* srcGpu, int size);
 __global__ void conv2dKernel(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters);
 __global__ void reluKernel(float* input, int size);
 __global__ void maxPoolKernel(float* dst, float* src, int input_rows, int input_cols);
-__global__ void matmulKernel(float* A, float* B, float* C, float* Bias, int M, int N, int K);
+__global__ void matmulKernel(float* A, float* B, float* C, int M, int N, int K);
 
 cudaError_t avgPoolCuda(float* dst, float* src, int sizeX, int sizeY);
 cudaError_t vecMulCuda(float* mx, float* vec, float* res, int sizeMxX, int sizeMxY);
-void matMulCuda(float* A, float* B, float* C, float* Bias, int M, int N, int K);
+void matMulCuda(float* A, float* B, float* C, int M, int N, int K);
+void matmulSharedCuda(float* A, float* B, float* C, int M, int N, int K);
 void reluCuda(float* input, int size);
 void maxPoolCuda(float* dst, float* src, int input_rows, int input_cols, int output_rows, int output_cols, int channels);
 void conv2dCuda(float* input, float* weights, float* bias, float* output, int input_rows, int input_cols, int kernel_size, int num_channels, int num_filters, int height_new, int width_new);
diff --git a/src/cudaOps.cc b/src/cudaOps.cc
index 6d8865377bcf22edfabfad0ce1a66bf013f74102..60b5874aeda2863fa346c5ad3144671397f63ba8 100644
--- a/src/cudaOps.cc
+++ b/src/cudaOps.cc
@@ -14,38 +14,51 @@ void CudaTensor::reshape(const std::vector<size_t>& new_shape) {
 
 void CudaTensor::permute(const shape_t& permutation) {}
 
-void CudaTensor::matmul(const CudaTensor& w, const CudaTensor& b) {
-  assert(!empty && !w.empty && !b.empty);
-  assertm(shape.size() == w.shape.size(), "Dimensions don't match");
-  assertm(shape.size() == 2, "Tensors need to be 2D");
+CudaTensor matmul(const CudaTensor& lhs, const CudaTensor& rhs) {
+  assert(!lhs.empty && !rhs.empty);
+  assertm(lhs.shape.size() == rhs.shape.size(), "Dimensions don't match");
+  assertm(lhs.shape.size() == 2, "Tensors need to be 2D");
   assertm(
-      shape[1] == w.shape[0],
+      lhs.shape[1] == rhs.shape[0],
       "Input tensor is of shape MxN but weight tensor is not of shape NxP");
 
-  size_t M = shape[0], N = w.shape[0], K = w.shape[1];
-  CudaTensor ret{{shape[0], w.shape[1]}};
-  // Bias
-  assertm(ret.size() == b.size(), "Bias has to be of the same size as result");
+  size_t M = lhs.shape[0], N = rhs.shape[0], K = rhs.shape[1];
+  CudaTensor ret{{lhs.shape[0], rhs.shape[1]}};
 
-  // TODO: currently default zero bias
-  //CudaTensor bias{{K, N}};
-
   matMulCuda(
-      dat, w.dat, ret.dat, b.dat, M, N, K
+      lhs.dat, rhs.dat, ret.dat, M, N, K
   );
 
-  swap(ret);
+  return ret;
 }
 
 
-void CudaTensor::conv2d(const CudaTensor& w, const CudaTensor& b, const std::string& padding) {
-  assert(!empty && !w.empty);
-  assertm(shape.size() == w.shape.size(),
+CudaTensor matmulShared(const CudaTensor& lhs, const CudaTensor& rhs) {
+  assert(!lhs.empty && !rhs.empty);
+  assertm(lhs.shape.size() == rhs.shape.size(), "Dimensions don't match");
+  assertm(lhs.shape.size() == 2, "Tensors need to be 2D");
+  assertm(
+      lhs.shape[1] == rhs.shape[0],
+      "Input tensor is of shape MxN but weight tensor is not of shape NxP");
+
+  size_t M = lhs.shape[0], N = rhs.shape[0], K = rhs.shape[1];
+  CudaTensor ret{{lhs.shape[0], rhs.shape[1]}};
+
+  matmulSharedCuda(
+      lhs.dat, rhs.dat, ret.dat, M, N, K
+  );
+
+  return ret;
+}
+
+CudaTensor conv2d(const CudaTensor& lhs, const CudaTensor& w, const CudaTensor& b, const std::string& padding) {
+  assert(!lhs.empty && !w.empty);
+  assertm(lhs.shape.size() == w.shape.size(),
          "Input tensor and weight tensor have different dimensions");
-  assertm(shape.size() == 4, "Only 4D Tensors accepted, reshape if possible");
+  assertm(lhs.shape.size() == 4, "Only 4D Tensors accepted, reshape if possible");
   if (!b.empty) assertm(b.shape.size() <= 1, "if available, bias must be 1D");
 
-  const size_t N = shape[0], C = shape[1], H = shape[2], W = shape[3],
+  const size_t N = lhs.shape[0], C = lhs.shape[1], H = lhs.shape[2], W = lhs.shape[3],
                OUT_C = w.shape[0], K_HW = w.shape[2];
   const size_t H_NEW = H - K_HW + 1, W_NEW = W - K_HW + 1;
   assertm(C == w.shape[1], "Input channels don't match with weight tensor");
@@ -56,10 +69,10 @@ void CudaTensor::conv2d(const CudaTensor& w, const CudaTensor& b, const std::str
   CudaTensor ret{{N, OUT_C, H_NEW, W_NEW}};
 
   conv2dCuda(
-      dat, w.dat, b.dat, ret.dat, H, W, K_HW, C, OUT_C , H_NEW, W_NEW
+      lhs.dat, w.dat, b.dat, ret.dat, H, W, K_HW, C, OUT_C , H_NEW, W_NEW
   );
 
-  swap(ret);
+  return ret;
 }
 
 /*
@@ -106,13 +119,13 @@ void CudaTensor::relu() {
 }
 
 
-void CudaTensor::max_pool(const shape_t& kernel_shape) {
-  assert(!empty);
+CudaTensor max_pool(const CudaTensor& lhs, const shape_t& kernel_shape) {
+  assert(!lhs.empty);
   assertm(kernel_shape.size() == 2, "Kernel not 2D");
   assertm(kernel_shape[0] == kernel_shape[1],
           "Kernel height not equal to width");
 
-  const size_t N = shape[0], C = shape[1], H = shape[2], W = shape[3];
+  const size_t N = lhs.shape[0], C = lhs.shape[1], H = lhs.shape[2], W = lhs.shape[3];
   assertm((H % kernel_shape[0]) == 0 && (W % kernel_shape[0]) == 0,
           "Kernel shape and input tensor width/height missmatch");
   const size_t H_NEW = H / kernel_shape[0], W_NEW = W / kernel_shape[0];
@@ -121,8 +134,8 @@ void CudaTensor::max_pool(const shape_t& kernel_shape) {
   CudaTensor ret{{N, C, H_NEW, W_NEW}};
 
   maxPoolCuda(
-      ret.dat, dat, H, W, H_NEW, W_NEW, C
+      ret.dat, lhs.dat, H, W, H_NEW, W_NEW, C
   );
 
-  swap(ret);
+  return ret;
 }
\ No newline at end of file
diff --git a/src/cudaTensor.cc b/src/cudaTensor.cc
index b653c8978888c4ec70cfae9e8d02e2f24e661f2c..d50115b0364fca2f7a0f27833b40299299a37326 100644
--- a/src/cudaTensor.cc
+++ b/src/cudaTensor.cc
@@ -302,6 +302,12 @@ void CudaTensor::compare(Tensor& rhs) {
   }
 }
 
+void CudaTensor::compare(CudaTensor& rhs) {
+  for (int i = 0; i < size(); i++) {
+    assertm(abs(dat[i] - rhs.dat[i]) < 0.0001, "Tensors are not equal");
+  }
+}
+
 void CudaTensor::fillTensor() {
   float* data = (float*) malloc(size()*sizeof(float));
   for (int i = 0; i < size(); i++) {
diff --git a/src/cudaTensor.h b/src/cudaTensor.h
index fb2479a1ec9b9b852be5087ee637dec0f88df111..b1096c44cdc59a125065137dba02cdb2afd3f3fa 100644
--- a/src/cudaTensor.h
+++ b/src/cudaTensor.h
@@ -13,9 +13,10 @@
 #define assertm(exp, msg) assert(((void)msg, exp))
 
 
 class CudaTensor : public Tensor {
-  bool empty = true;
+ public:
+  bool empty = true;
 
   float* dat;
   shape_t shape;
   std::string name;
@@ -44,18 +45,20 @@ class CudaTensor : public Tensor {
   void load_data(float * data, int n);
   void print_tensor(std::ostream& os);
   void compare(Tensor& rhs);
+  void compare(CudaTensor& rhs);
   void fillTensor();
 
   // ops
   void reshape(const shape_t& new_shape);
   void permute(const shape_t& permutation);
-  void matmul(const CudaTensor& w, const CudaTensor& b);
-  void conv2d(const CudaTensor& w, const CudaTensor& b, const std::string& padding = "valid");
-  void conv2dWinograd(const CudaTensor& w, const CudaTensor& b, const std::string& padding = "valid");
-  void max_pool(const shape_t& kernel_shape);
-  void avg_pool(const shape_t& kernel_shape);
-  void add(const CudaTensor& b);
 
   void relu();
   void softmax();
-};
\ No newline at end of file
+};
+
+CudaTensor matmul(const CudaTensor& lhs, const CudaTensor& rhs);
+CudaTensor matmulShared(const CudaTensor& lhs, const CudaTensor& rhs);
+CudaTensor conv2d(const CudaTensor& lhs, const CudaTensor& w, const CudaTensor& b, const std::string& padding = "valid");
+CudaTensor max_pool(const CudaTensor& lhs, const shape_t& kernel_shape);
+CudaTensor avg_pool(const CudaTensor& lhs, const shape_t& kernel_shape);
+CudaTensor add(const CudaTensor& lhs, const CudaTensor& b);
\ No newline at end of file
diff --git a/src/inference.cc b/src/inference.cc
index 4a6d767431dc5762f77eed1f5de1d79e8d94d6e6..858ab5faa94415b8f49ca76c5ec663e1ced8d3b2 100644
--- a/src/inference.cc
+++ b/src/inference.cc
@@ -1,5 +1,4 @@
 #include "inference.h"
-#include "cudaLenet5.cuh"
 
 double get_time(void);
 
@@ -346,19 +345,20 @@ void compareGpuToCpu() {
   get_input(model, collector);
   get_input(model, collectorCpu);
 
-  CudaTensor *input = collector[model.graph().input()[0].name()];
+  CudaTensor input = *collector[model.graph().input()[0].name()];
   Tensor *inputCpu = collectorCpu[model.graph().input()[0].name()];
 
-  load_image(*input, "mnist/testSample/img_2.jpg");
+  load_image(input, "mnist/testSample/img_2.jpg");
   load_image(*inputCpu, "mnist/testSample/img_2.jpg");
   std::cout<<"Loaded Image"<<std::endl;
 
   get_weights(model, collector);
   get_weights(model, collectorCpu);
 
-  input->print_tensor(std::cout);
+  input.print_tensor(std::cout);
 
-  input->conv2d(
+  input = conv2d(
+      input,
       *collector["sequential/conv2d/Conv2D/ReadVariableOp:0"],
       *collector["sequential/conv2d/BiasAdd/ReadVariableOp:0"]
   );
@@ -367,21 +367,23 @@ void compareGpuToCpu() {
       *collectorCpu["sequential/conv2d/BiasAdd/ReadVariableOp:0"]
   );
 
-  input->relu();
+  input.relu();
   inputCpu->relu();
 
-  input->max_pool(
+  input = max_pool(
+      input,
       {2, 2}
   );
   inputCpu->max_pool(
      {2, 2}
   );
 
 
-  input->compare(*inputCpu);
+  input.compare(*inputCpu);
 
 
-  input->conv2d(
+  input = conv2d(
+      input,
       *collector["sequential/conv2d_1/Conv2D/ReadVariableOp:0"],
       *collector["sequential/conv2d_1/BiasAdd/ReadVariableOp:0"]
   );
@@ -390,10 +392,13 @@
       *collectorCpu["sequential/conv2d_1/BiasAdd/ReadVariableOp:0"]
   );
 
-  input->relu();
+  input.relu();
   inputCpu->relu();
 
-  input->max_pool(
+  input.compare(*inputCpu);
+
+  input = max_pool(
+      input,
       {2, 2}
   );
   inputCpu->max_pool(
@@ -401,7 +406,8 @@
   );
 
 
-  input->conv2d(
+  input = conv2d(
+      input,
       *collector["sequential/conv2d_2/Conv2D/ReadVariableOp:0"],
       *collector["sequential/conv2d_2/BiasAdd/ReadVariableOp:0"]
   );
@@ -410,18 +416,25 @@
       *collectorCpu["sequential/conv2d_2/BiasAdd/ReadVariableOp:0"]
   );
 
-  input->relu();
+  input.relu();
   inputCpu->relu();
 
 
   // Batch size is first element
-  input->reshape({1, 120});
+
+
+  input.reshape({1, 120});
   inputCpu->reshape({1, 120});
+
 
-  input->matmul(
-      *collector["sequential/dense/MatMul/ReadVariableOp:0"],
-      *collector["sequential/dense/BiasAdd/ReadVariableOp:0"]
+  //input->compare(*inputCpu);
+  input = matmul(
+      input,
+      *collector["sequential/dense/MatMul/ReadVariableOp:0"]
   );
+
+
+
   inputCpu->matmul(
       *collectorCpu["sequential/dense/MatMul/ReadVariableOp:0"]
   );
@@ -429,15 +442,19 @@
       *collectorCpu["sequential/dense/BiasAdd/ReadVariableOp:0"]
   );
 
+
+
+
+  input.compare(*inputCpu);
 
 
-  input->relu();
+  input.relu();
   inputCpu->relu();
 
 
-  input->matmul(
-      *collector["sequential/dense_1/MatMul/ReadVariableOp:0"],
-      *collector["sequential/dense_1/BiasAdd/ReadVariableOp:0"]
+  input = matmul(
+      input,
+      *collector["sequential/dense_1/MatMul/ReadVariableOp:0"]
   );
   inputCpu->matmul(
       *collectorCpu["sequential/dense_1/MatMul/ReadVariableOp:0"]
@@ -445,12 +462,12 @@
   inputCpu->add(
       *collectorCpu["sequential/dense_1/BiasAdd/ReadVariableOp:0"]
   );
-  input->relu();
+  input.relu();
   inputCpu->relu();
 
-  input->print_tensor(std::cout);
+  input.print_tensor(std::cout);
   inputCpu->print_tensor(std::cout);
 
-  input->print_shape(std::cout);
+  input.print_shape(std::cout);
   inputCpu->print_shape(std::cout);
 }
@@ -467,7 +484,46 @@ int main() {
   }
   */
 
-  compareGpuToCpu();
+  //compareGpuToCpu();
+
+  CudaTensor ct1{{10, 10}};
+  CudaTensor ct2{{10, 10}};
+  ct1.fillTensor();
+  ct2.fillTensor();
+  ct1.print_tensor(std::cout);
+  ct2.print_tensor(std::cout);
+
+  CudaTensor retShared = matmulShared(ct1, ct2);
+  CudaTensor ret = matmul(ct1, ct2);
+  ret.print_tensor(std::cout);
+  retShared.print_tensor(std::cout);
+
+
+  ret.compare(retShared);
+
+
+  /*
+  CudaTensor ct1{{10, 10}};
+  CudaTensor ct2{{10, 10}};
+  CudaTensor b{{10, 10}};
+
+  for (int i = 0; i < ct1.shape[0]; i++) {
+    for (int j = 0; j < ct2.shape[1]; j++) {
+      ct1[{i, j}] = 10;
+      ct2[{i, j}] = 5;
+      b[{i, j}] = 1;
+    }
+  }
+
+  ct1.matmul(
+      ct2,
+      b
+  );
+
+
+  ct1.print_tensor(std::cout);
+
+  */
 
   return 0;
 }
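
Reference sketch (not part of the patch above): a bounds-checked shared-memory tiling of the same row-major A (MxN) x B (NxK) = C (MxK) product that matmulKernel and matmulSharedKernel compute. BLOCK_SIZE follows the define introduced in cudaKernels.cu; the kernel name tiledMatmulRef is a placeholder used only for this illustration. Because the tile loads and the final store are guarded, shapes that are not multiples of BLOCK_SIZE (such as the 10x10 tensors compared in main()) stay within bounds, and it can be launched with the same gridDim/blockDim that matmulSharedCuda sets up.

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 32
#endif

// Illustrative tiled matmul: C (MxK) = A (MxN) * B (NxK), all row-major.
__global__ void tiledMatmulRef(const float* A, const float* B, float* C,
                               int M, int N, int K) {
    // One BLOCK_SIZE x BLOCK_SIZE tile of A and B is staged per iteration.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;  // row of C owned by this thread
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;  // column of C owned by this thread

    float acc = 0.0f;
    for (int t = 0; t < (N + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) {
        int aCol = t * BLOCK_SIZE + threadIdx.x;      // column of A staged by this thread
        int bRow = t * BLOCK_SIZE + threadIdx.y;      // row of B staged by this thread

        // Stage the tiles, padding with zeros where they overhang the matrices.
        As[threadIdx.y][threadIdx.x] = (row < M && aCol < N) ? A[row * N + aCol] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] = (bRow < N && col < K) ? B[bRow * K + col] : 0.0f;
        __syncthreads();

        for (int e = 0; e < BLOCK_SIZE; ++e) {
            acc += As[threadIdx.y][e] * Bs[e][threadIdx.x];
        }
        __syncthreads();
    }

    if (row < M && col < K) {
        C[row * K + col] = acc;
    }
}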