diff --git a/OFCUDA/CMakeLists.txt b/OFCUDA/CMakeLists.txt index f6e46ef06b125602099b6041660500647e579e62..0a2d9120138c4d9bd1e411850295972edbf2fdec 100644 --- a/OFCUDA/CMakeLists.txt +++ b/OFCUDA/CMakeLists.txt @@ -30,7 +30,7 @@ include_directories( "header/" ) SET( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; - -O3 -g -gencode arch=compute_20,code=sm_20; -D_FORCE_INLINES + -O3 -g -gencode arch=compute_52,code=sm_52; -D_FORCE_INLINES ) SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O3" ) diff --git a/OFCUDA/header/CudaMat.h b/OFCUDA/header/CudaMat.h index 878611ea719e2bdec6dfb784fb75e6a798e0fa1e..a39eeade3f880d2e3200c5ede3c1ac3b5b645140 100644 --- a/OFCUDA/header/CudaMat.h +++ b/OFCUDA/header/CudaMat.h @@ -3,13 +3,18 @@ #include <iostream> #include <string> #include <vector> +#include <limits> + #include <cuda_runtime.h> #include <cuda.h> #include <assert.h> + #include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <thrust/device_ptr.h> #include <thrust/for_each.h> +#include <thrust/reduce.h> +#include <thrust/functional.h> #include <thrust/execution_policy.h> #include <thrust/fill.h> #include <thrust/swap.h> @@ -258,11 +263,8 @@ public: ~CudaMat() { if(dev_mat != nullptr){ - //std::cout << "free " << getRawPtr() << std::endl; cudaFree(dev_mat); - } /*else { - std::cout << "invalid free " << getRawPtr() << std::endl; - }*/ + } } void write_image(const std::string &filename) const diff --git a/OFCUDA/source/cu/HornSchunckPyramidal.cu b/OFCUDA/source/cu/HornSchunckPyramidal.cu index fcd8782473f586cfd6c39e9c6697d87ff030188c..dcff4132ff0fdd98e5e56c4ce51e8c3ef261ed5c 100644 --- a/OFCUDA/source/cu/HornSchunckPyramidal.cu +++ b/OFCUDA/source/cu/HornSchunckPyramidal.cu @@ -42,6 +42,24 @@ namespace { return x[r]; } + // compute the largest number of an array + float max_element(CudaMat &a) + { + const thrust::device_ptr<float> self_ptr(a.getRawPtr()); + return thrust::reduce(self_ptr,self_ptr+(a.width()*a.height()), + std::numeric_limits<float>::min(), + thrust::maximum<float>()); + } + + // compute the smallest number of an array + float min_element(CudaMat &a) + { + const thrust::device_ptr<float> self_ptr(a.getRawPtr()); + return thrust::reduce(self_ptr,self_ptr+(a.width()*a.height()), + std::numeric_limits<float>::max(), + thrust::minimum<float>()); + } + void normalize(const thrust::host_vector<float> &a, const thrust::host_vector<float> &b, thrust::host_vector<float> &an, thrust::host_vector<float> &bn) { @@ -76,8 +94,6 @@ namespace { } } - - void uv_point2f_cv( const thrust::host_vector<float> &u, const thrust::host_vector<float> &v, @@ -105,6 +121,23 @@ namespace { return yy*w+xx; } + __global__ void norm( + float* a, + float* b, + const float min, + const float den, + int w, int h) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if(x >= w || y >= h) return; + + if(den > 0){ + a[y*w+x] = 255.0 * (a[y*w+x] - min) / den; + b[y*w+x] = 255.0 * (b[y*w+x] - min) / den; + } + } + __global__ void mult( float* dev, const float val, @@ -315,6 +348,31 @@ namespace { + (v[idx] - vk) * (v[idx] - vk); } + void normalize(CudaMat &a, CudaMat &b) + { + int w = a.width(); + int h = a.height(); + + assert(a.width() == b.width() && a.height() == b.height()); + // find the max and min of both images + const float max1 = max_element(a); + const float max2 = max_element(b); + const float min1 = min_element(a); + const float min2 = min_element(b); + + // obtain the absolute max and min + const float max = max1 > max2 ? max1 : max2; + const float min = min1 < min2 ? min1 : min2; + const float den = max - min; + + dim3 threadsperblock(16,16); + dim3 blocks(ceil(float(w)/float(16)),ceil(float(h)/float(16))); + norm<<< blocks, threadsperblock >>>(a.getRawPtr(), + b.getRawPtr(), + min,den, + w,h); + } + } void HornSchunckPyramidal::getSizes() @@ -332,16 +390,10 @@ void HornSchunckPyramidal::getSizes() { break; } - std::cout << w << "/" << h << std::endl; sizes.emplace_back(w,h); } N_scales = sizes.size(); - /*int cnt = 0; - for(auto &p : sizes) - { - std::cout << nue << "^" << cnt++ << " : " << p.first << " " << p.second << std::endl; - }*/ } void gaussianCPU(CudaMat &m, const float sigma) @@ -385,13 +437,10 @@ HornSchunckPyramidal::HornSchunckPyramidal(const string &t0_file, const string & cv::Mat image0 = cv::imread(t0_file,CV_LOAD_IMAGE_GRAYSCALE); cv::Mat image1 = cv::imread(t1_file,CV_LOAD_IMAGE_GRAYSCALE); - int w = image0.cols; int h = image0.rows; thrust::host_vector<float> a(w*h); - thrust::host_vector<float> an(w*h); thrust::host_vector<float> b(w*h); - thrust::host_vector<float> bn(w*h); for (int y = 0; y < h; ++y){ for (int x = 0; x < w; ++x){ @@ -400,48 +449,33 @@ HornSchunckPyramidal::HornSchunckPyramidal(const string &t0_file, const string & } } - //CudaMat test0(w,h,a); - //test0.toFile("pre normal.out"); - normalize(a,b,an,bn); - CudaMat test(w,h,an); - test.toFile("I1norm.out"); + I0.emplace_back(w,h,a); + I1.emplace_back(w,h,b); - CudaMat test2(w,h,bn); - test2.toFile("I2norm.out"); - //exit(1); + std::cout <<"image loadng: " << time.stop() << " ms" << std::endl; + time.start(); - I0.emplace_back(w,h,an); - I1.emplace_back(w,h,bn); + normalize(I0[0],I1[0]); u.emplace_back( I0[0].width(), I0[0].height() ); - u.back().setZero(); v.emplace_back( I0[0].width(), I0[0].height() ); - v.back().setZero(); I0[0].gauss(5,0.8); I1[0].gauss(5,0.8); - //gaussianCPU(I0[0],0.8); - //gaussianCPU(I1[0],0.8); - - /*I0[0].toFile("I1gauss.out"); - I1[0].toFile("I2gauss.out"); - exit(1);*/ - assert((I0[0].width() == I1[0].width()) && (I0[0].height() == I1[0].height())); getSizes(); float sigma = 0.6 * sqrtf(1.0/(nue*nue)-1.0); - std::cout << "sigma " << sigma << std::endl; for(int i = 1; i < sizes.size();++i) { I0.emplace_back(); @@ -450,66 +484,20 @@ HornSchunckPyramidal::HornSchunckPyramidal(const string &t0_file, const string & I0[i-1].gauss(5,sigma,I0[i]); I1[i-1].gauss(5,sigma,I1[i]); - //Cpu test - /*I0[i] = I0[i-1]; - I1[i] = I1[i-1]; - - gaussianCPU(I0[i],sigma); - gaussianCPU(I1[i],sigma);*/ - I0[i].bicubic(sizes[i].first,sizes[i].second); I1[i].bicubic(sizes[i].first,sizes[i].second); - /*string n1 = "I1_"+boost::lexical_cast<std::string>(I0[i].width())+".out"; - string n2 = "I2_"+boost::lexical_cast<std::string>(I0[i].width())+".out"; - I0[i].toFile(n1); - I1[i].toFile(n2);*/ - u.emplace_back( sizes[i].first, sizes[i].second ); - u.back().setZero(); v.emplace_back( sizes[i].first, sizes[i].second ); - v.back().setZero(); } - - //exit(1); - //same values up to this point! - - /*for(int i = 1; i < I0.size();++i) - { - I0[i].bicubic(sizes[i].first,sizes[i].second); - I1[i].bicubic(sizes[i].first,sizes[i].second); - I0[i].show("initial cuda"); - u.emplace_back( - sizes[i].first, - sizes[i].second - ); - u.back().setZero(); - - v.emplace_back( - sizes[i].first, - sizes[i].second - ); - v.back().setZero(); - }*/ - //exit(1); - /* int cnt = 0; - for(int i = 0; i < I0.size(); ++i) - { - std::string name = "output/t"; - name+= boost::lexical_cast<std::string>(cnt++); - name+=".png"; - std::cout << name << std::endl; - I0[i].write_image(name); - }*/ - //exit(1); - std::cout <<"pyramid creation: " << time.stop() << " ms" << std::endl; + std::cout <<"pyramid creation: " << time.stop() << " ms" << std::endl; } void HornSchunckPyramidal::draw(int id) @@ -529,13 +517,10 @@ void HornSchunckPyramidal::draw(CudaMat &UA, CudaMat &VA, int id) UA.copyToHost(ua); VA.copyToHost(va); - std::cout << "after copy" << std::endl; - cv::Mat flo(h,w,CV_32FC2); uv_point2f_cv(ua,va,flo,w,h); - std::cout << "after copy" << std::endl; cv::Mat xy[2]; //X,Y cv::split(flo, xy); @@ -544,15 +529,12 @@ void HornSchunckPyramidal::draw(CudaMat &UA, CudaMat &VA, int id) cv::Mat magnitude, angle; cv::cartToPolar(xy[0], xy[1], magnitude, angle, true); - std::cout << "after copy" << std::endl; - - //translate magnitude to range [0;1] + //translate magnitude to range [0;1] double mag_max; cv::minMaxLoc(magnitude, 0, &mag_max); magnitude.convertTo(magnitude, -1, 1.0 / mag_max); - std::cout << "after copy" << std::endl; //build hsv image std::vector<cv::Mat> _hsv; @@ -564,60 +546,15 @@ void HornSchunckPyramidal::draw(CudaMat &UA, CudaMat &VA, int id) //convert to BGR and show cv::Mat bgr;//CV_32FC3 matrix - std::cout << "after copy" << std::endl; cv::cvtColor(hsv, bgr, cv::COLOR_HSV2BGR); std::string name = "CUDA/"; name+= boost::lexical_cast<std::string>(UA.width()); name+=" "; name+=boost::lexical_cast<std::string>(id); - std::cout << "after copy" << std::endl; cv::imshow(name, bgr); - std::cout << "after copy" << std::endl; cv::waitKey(0); } -float sor_iteration( - const float *Au, // constant part of the numerator of u - const float *Av, // constant part of the numerator of v - const float *Du, // denominator of u - const float *Dv, // denominator of v - const float *D, // constant part of the numerator - float *u, // x component of the flow - float *v, // y component of the flow - const float al, // alpha smoothness parameter - const int p, // current position - const int p1, // up-left neighbor - const int p2, // up-right neighbor - const int p3, // bottom-left neighbor - const int p4, // bottom-right neighbor - const int p5, // up neighbor - const int p6, // left neighbor - const int p7, // bottom neighbor - const int p8 // right neighbor -) -{ - // set the SOR extrapolation parameter - const float w = 1.9; - - // compute the divergence - const float ula = 1./12. * (u[p1] + u[p2] + u[p3] + u[p4]) + - 1./6. * (u[p5] + u[p6] + u[p7] + u[p8]); - const float vla = 1./12. * (v[p1] + v[p2] + v[p3] + v[p4]) + - 1./6. * (v[p5] + v[p6] + v[p7] + v[p8]); - - // store the previous values - const float uk = u[p]; - const float vk = v[p]; - - // update the flow - u[p] = (1.0 - w) * uk + w * (Au[p] - D[p] * v[p] + al * ula) / Du[p]; - - v[p] = (1.0 - w) * vk + w * (Av[p] - D[p] * u[p] + al * vla) / Dv[p]; - - // return the convergence error - return (u[p] - uk) * (u[p] - uk) + (v[p] - vk) * (v[p] - vk); -} - -float sorCPUOwn(CudaMat &Au,CudaMat &Av, +inline float sorCPU(CudaMat &Au,CudaMat &Av, CudaMat &Du,CudaMat &Dv, CudaMat &D, CudaMat &u, @@ -674,7 +611,7 @@ float sorCPUOwn(CudaMat &Au,CudaMat &Av, return sqrt(error / float(w*h)); } -float sorGPU(CudaMat &Au,CudaMat &Av, +inline float sorGPU(CudaMat &Au,CudaMat &Av, CudaMat &Du,CudaMat &Dv, CudaMat &D, CudaMat &u, @@ -690,7 +627,6 @@ float sorGPU(CudaMat &Au,CudaMat &Av, thrust::device_vector<float> total_error(w*h,0); const float omega = 0.5; if(rb){ - std::cout << "red" << std::endl; sorRED<<<blocks,threadsperblock>>>( Au.getRawPtr(), Av.getRawPtr(), @@ -717,7 +653,6 @@ float sorGPU(CudaMat &Au,CudaMat &Av, w,h ); } else { - std::cout << "black" << std::endl; sorBLACK<<<blocks,threadsperblock>>>( Au.getRawPtr(), Av.getRawPtr(), @@ -752,124 +687,6 @@ float sorGPU(CudaMat &Au,CudaMat &Av, return sqrt(error / float(w*h) ); } -float sorCPU(CudaMat &Au,CudaMat &Av, - CudaMat &Du,CudaMat &Dv, - CudaMat &D, - CudaMat &u, - CudaMat &v, - const float alpha) -{ - int w = u.width(); - int h = v.height(); - - std::vector<float> Auc(w*h); - std::vector<float> Avc(w*h); - std::vector<float> Duc(w*h); - std::vector<float> Dvc(w*h); - std::vector<float> Dc(w*h); - std::vector<float> uc(w*h); - std::vector<float> vc(w*h); - - Au.copyToHost(Auc); - Av.copyToHost(Avc); - Du.copyToHost(Duc); - Dv.copyToHost(Dvc); - D.copyToHost(Dc); - u.copyToHost(uc); - v.copyToHost(vc); - - float error = 0; - - for(int i = 1; i < h-1; i++) - for(int j = 1; j < w-1; j++) - { - const int k = i * w + j; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-w-1, k-w+1, k+w-1, - k+w+1, k-w, k-1, k+w, k+1 - ); - } - - // process the first and last rows - for(int j = 1; j < w-1; j++) - { - // first row - int k = j; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-1, k+1, k+w-1, k+w+1, - k, k-1, k+w, k+1 - ); - - // last row - k = (h-1) * w + j; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-w-1, k-w+1, k-1, k+1, - k-w, k-1, k, k+1 - ); - } - - // process the first and last columns - for(int i = 1; i < h-1; i++) - { - // first column - int k = i * w; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-w, k-w+1, k+w, k+w+1, - k-w, k, k+w, k+1 - ); - - // last column - k = (i+1) * w - 1; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-w-1, k-w, k+w-1, k+w, - k-w, k-1, k+w, k - ); - } - - // process the corners - // up-left corner - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - 0, 0, 1, w, w+1, - 0, 0, w, 1 - ); - - // up-right corner - int k = w - 1; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-1, k, k+w-1, k+w, - k, k-1, k+w, k - ); - - // bottom-left corner - k = (h-1) * w; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-w, k-w+1,k, k+1, - k-w, k, k, k+1 - ); - - // bottom-right corner - k = h * w - 1; - error += sor_iteration( - Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha, - k, k-1, k, k-w-1, k-w, - k-w, k-1, k, k - ); - - u = CudaMat(w,h,uc); - v = CudaMat(w,h,vc); - - return sqrtf(error / (w*h) ); -} - - void HornSchunckPyramidal::HS(CudaMat &I1s, CudaMat &I2s, CudaMat &us, @@ -900,29 +717,12 @@ void HornSchunckPyramidal::HS(CudaMat &I1s, w,h ); - /*I2s.toFile("orig.out"); - I2x.toFile("gradX.out"); - I2y.toFile("gradY.out"); - - exit(1);*/ - //Identical - std::cout << "compute: " << w <<"/" << h << std::endl; - //exit(1); - for(size_t j = 0; j < N_warps; ++j) { I2s.warp(us,vs,1.0,true,I2_warped); I2x.warp(us,vs,1.0,true,I2x_warped); I2y.warp(us,vs,1.0,true,I2y_warped); - - /* std::cout << w << "x" << h << std::endl; - - I2_warped.toFile("I2warped.out"); - I2x_warped.toFile("I2xwarped.out"); - I2y_warped.toFile("I2ywarped.out"); - exit(1);*/ - //calc constant stuff constants<<<blocks,threadsperblock>>>( I1s.getRawPtr(), @@ -940,34 +740,17 @@ void HornSchunckPyramidal::HS(CudaMat &I1s, w,h ); - /*Au.toFile("Au.out"); - Av.toFile("Av.out"); - Du.toFile("Du.out"); - Dv.toFile("Dv.out"); - D.toFile("D.out"); - exit(1);*/ - size_t iter = 0; float tol = 0.0001; float error = tol+100; bool red = true; while(iter < N_iterations && error > tol){ iter++; - - //error = sorCPU(Au,Av,Du,Dv,D,us,vs,alpha); - error = sorGPU(Au,Av,Du,Dv,D,us,vs,alpha,red); - - //error = sorCPUOwn(Au,Av,Du,Dv,D,us,vs,alpha); + //error = sorCPU(Au,Av,Du,Dv,D,us,vs,alpha); red = !red; - - - std::cout << "Iterations " << iter << " " << error << std::endl; + //std::cout << "Iterations " << iter << " " << error << std::endl; } - - //us.toFile("ucpu.out"); - //vs.toFile("vcpu.out"); - //exit(1); } } @@ -978,12 +761,8 @@ void HornSchunckPyramidal::compute(const float alpha) u[N_scales-1].setZero(); v[N_scales-1].setZero(); - std::cout << N_scales << std::endl; - for(int s = N_scales - 1; s >= 0; --s) { - - std::cout << I0[s].width() << std::endl; assert( I0[s].width() == I1[s].width() && I0[s].width() == u[s].width() @@ -997,10 +776,7 @@ void HornSchunckPyramidal::compute(const float alpha) I0[s].height() == v[s].height() ); - // std::cout << sizes[s].first << "x" << sizes[s].second << std::endl; - // draw(u[s],v[s],0); HS(I0[s],I1[s],u[s],v[s],alpha); - // draw(u[s],v[s],0); if(s > 0) { @@ -1011,12 +787,6 @@ void HornSchunckPyramidal::compute(const float alpha) v[s].upscale(v[s-1],ws,hs); u[s-1]*=(1.0/nue); v[s-1]*=(1.0/nue); - - //upscale cpu - //upscaleCPU(u[s],u[s-1],nue); - //upscaleCPU(v[s],v[s-1],nue); - - } } std::cout << "pyramid computation: " << time.stop() << " ms" << std::endl;