diff --git a/OFCUDA/CMakeLists.txt b/OFCUDA/CMakeLists.txt
index f6e46ef06b125602099b6041660500647e579e62..0a2d9120138c4d9bd1e411850295972edbf2fdec 100644
--- a/OFCUDA/CMakeLists.txt
+++ b/OFCUDA/CMakeLists.txt
@@ -30,7 +30,7 @@ include_directories( "header/" )
 SET(
     CUDA_NVCC_FLAGS
     ${CUDA_NVCC_FLAGS};
-    -O3 -g -gencode arch=compute_20,code=sm_20; -D_FORCE_INLINES
+    -O3 -g -gencode arch=compute_52,code=sm_52; -D_FORCE_INLINES
     )
 SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -std=c++11 -O3" )
 
diff --git a/OFCUDA/header/CudaMat.h b/OFCUDA/header/CudaMat.h
index 878611ea719e2bdec6dfb784fb75e6a798e0fa1e..a39eeade3f880d2e3200c5ede3c1ac3b5b645140 100644
--- a/OFCUDA/header/CudaMat.h
+++ b/OFCUDA/header/CudaMat.h
@@ -3,13 +3,18 @@
 #include <iostream>
 #include <string>
 #include <vector>
+#include <limits>
+
 #include <cuda_runtime.h>
 #include <cuda.h>
 #include <assert.h>
+
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/device_ptr.h>
 #include <thrust/for_each.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
 #include <thrust/swap.h>
@@ -258,11 +263,8 @@ public:
   ~CudaMat()
   {
     if(dev_mat != nullptr){
-        //std::cout << "free " << getRawPtr() << std::endl;
         cudaFree(dev_mat);
-    } /*else {
-        std::cout << "invalid free " << getRawPtr() << std::endl;
-    }*/
+    }
   }
 
   void write_image(const std::string &filename) const
diff --git a/OFCUDA/source/cu/HornSchunckPyramidal.cu b/OFCUDA/source/cu/HornSchunckPyramidal.cu
index fcd8782473f586cfd6c39e9c6697d87ff030188c..dcff4132ff0fdd98e5e56c4ce51e8c3ef261ed5c 100644
--- a/OFCUDA/source/cu/HornSchunckPyramidal.cu
+++ b/OFCUDA/source/cu/HornSchunckPyramidal.cu
@@ -42,6 +42,24 @@ namespace {
         return x[r];
     }
 
+    // compute the largest number of an array
+    float max_element(CudaMat &a)
+    {
+        const thrust::device_ptr<float> self_ptr(a.getRawPtr());
+        return thrust::reduce(self_ptr,self_ptr+(a.width()*a.height()),
+                                 std::numeric_limits<float>::min(),
+                                 thrust::maximum<float>());
+    }
+
+    // compute the smallest number of an array
+    float min_element(CudaMat &a)
+    {
+        const thrust::device_ptr<float> self_ptr(a.getRawPtr());
+        return thrust::reduce(self_ptr,self_ptr+(a.width()*a.height()),
+                                 std::numeric_limits<float>::max(),
+                                 thrust::minimum<float>());
+    }
+
     void normalize(const thrust::host_vector<float> &a, const thrust::host_vector<float> &b,
                    thrust::host_vector<float> &an, thrust::host_vector<float> &bn)
     {
@@ -76,8 +94,6 @@ namespace {
             }
     }
 
-
-
   void uv_point2f_cv(
     const thrust::host_vector<float> &u,
     const thrust::host_vector<float> &v,
@@ -105,6 +121,23 @@ namespace {
     return yy*w+xx;
   }
 
+  __global__ void norm(
+    float* a,
+    float* b,
+    const float min,
+    const float den,
+    int w, int h)
+  {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if(x >= w || y >= h) return;
+
+    if(den > 0){
+        a[y*w+x] = 255.0 * (a[y*w+x] - min) / den;
+        b[y*w+x] = 255.0 * (b[y*w+x] - min) / den;
+    }
+  }
+
   __global__ void mult(
     float* dev,
     const float val,
@@ -315,6 +348,31 @@ namespace {
             + (v[idx] - vk) * (v[idx] - vk);
   }
 
+   void normalize(CudaMat &a, CudaMat &b)
+   {
+       int w = a.width();
+       int h = a.height();
+
+       assert(a.width() == b.width() && a.height() == b.height());
+       // find the max and min of both images
+       const float max1 = max_element(a);
+       const float max2 = max_element(b);
+       const float min1 = min_element(a);
+       const float min2 = min_element(b);
+
+       // obtain the absolute max and min
+       const float max = max1 > max2 ? max1 : max2;
+       const float min = min1 < min2 ? min1 : min2;
+       const float den = max - min;
+
+       dim3 threadsperblock(16,16);
+       dim3 blocks(ceil(float(w)/float(16)),ceil(float(h)/float(16)));
+       norm<<< blocks, threadsperblock >>>(a.getRawPtr(),
+                                           b.getRawPtr(),
+                                           min,den,
+                                           w,h);
+   }
+
 }
 
 void HornSchunckPyramidal::getSizes()
@@ -332,16 +390,10 @@ void HornSchunckPyramidal::getSizes()
     {
       break;
     }
-    std::cout << w << "/" << h << std::endl;
     sizes.emplace_back(w,h);
   }
 
   N_scales = sizes.size();
-  /*int cnt = 0;
-  for(auto &p : sizes)
-  {
-    std::cout << nue << "^" << cnt++ << " : " << p.first << " " << p.second << std::endl;
-  }*/
 }
 
 void gaussianCPU(CudaMat &m, const float sigma)
@@ -385,13 +437,10 @@ HornSchunckPyramidal::HornSchunckPyramidal(const string &t0_file, const string &
   cv::Mat image0 = cv::imread(t0_file,CV_LOAD_IMAGE_GRAYSCALE);
   cv::Mat image1 = cv::imread(t1_file,CV_LOAD_IMAGE_GRAYSCALE);
 
-
   int w = image0.cols;
   int h = image0.rows;
   thrust::host_vector<float> a(w*h);
-  thrust::host_vector<float> an(w*h);
   thrust::host_vector<float> b(w*h);
-  thrust::host_vector<float> bn(w*h);
 
   for (int y = 0; y < h; ++y){
       for (int x = 0; x < w; ++x){
@@ -400,48 +449,33 @@ HornSchunckPyramidal::HornSchunckPyramidal(const string &t0_file, const string &
       }
     }
 
-  //CudaMat test0(w,h,a);
-  //test0.toFile("pre normal.out");
-  normalize(a,b,an,bn);
 
-  CudaMat test(w,h,an);
-  test.toFile("I1norm.out");
+  I0.emplace_back(w,h,a);
+  I1.emplace_back(w,h,b);
 
-  CudaMat test2(w,h,bn);
-  test2.toFile("I2norm.out");
-  //exit(1);
+  std::cout <<"image loadng: " << time.stop() << " ms" <<  std::endl;
+  time.start();
 
-  I0.emplace_back(w,h,an);
-  I1.emplace_back(w,h,bn);
+  normalize(I0[0],I1[0]);
 
   u.emplace_back(
               I0[0].width(),
               I0[0].height()
           );
-  u.back().setZero();
 
   v.emplace_back(
               I0[0].width(),
               I0[0].height()
           );
-  v.back().setZero();
 
   I0[0].gauss(5,0.8);
   I1[0].gauss(5,0.8);
 
-  //gaussianCPU(I0[0],0.8);
-  //gaussianCPU(I1[0],0.8);
-
-  /*I0[0].toFile("I1gauss.out");
-  I1[0].toFile("I2gauss.out");
-  exit(1);*/
-
   assert((I0[0].width() == I1[0].width())
         && (I0[0].height() == I1[0].height()));
 
   getSizes();
   float sigma = 0.6 * sqrtf(1.0/(nue*nue)-1.0);
-  std::cout << "sigma " << sigma << std::endl;
   for(int i = 1; i < sizes.size();++i)
   {
     I0.emplace_back();
@@ -450,66 +484,20 @@ HornSchunckPyramidal::HornSchunckPyramidal(const string &t0_file, const string &
     I0[i-1].gauss(5,sigma,I0[i]);
     I1[i-1].gauss(5,sigma,I1[i]);
 
-    //Cpu test
-    /*I0[i] = I0[i-1];
-    I1[i] = I1[i-1];
-
-    gaussianCPU(I0[i],sigma);
-    gaussianCPU(I1[i],sigma);*/
-
     I0[i].bicubic(sizes[i].first,sizes[i].second);
     I1[i].bicubic(sizes[i].first,sizes[i].second);
 
-    /*string n1 = "I1_"+boost::lexical_cast<std::string>(I0[i].width())+".out";
-    string n2 = "I2_"+boost::lexical_cast<std::string>(I0[i].width())+".out";
-    I0[i].toFile(n1);
-    I1[i].toFile(n2);*/
-
     u.emplace_back(
                 sizes[i].first,
                 sizes[i].second
             );
-    u.back().setZero();
 
     v.emplace_back(
                 sizes[i].first,
                 sizes[i].second
             );
-    v.back().setZero();
   }
-
-  //exit(1);
-  //same values up to this point!
-
-  /*for(int i = 1; i < I0.size();++i)
-  {
-    I0[i].bicubic(sizes[i].first,sizes[i].second);
-    I1[i].bicubic(sizes[i].first,sizes[i].second);
-    I0[i].show("initial cuda");
-    u.emplace_back(
-                sizes[i].first,
-                sizes[i].second
-            );
-    u.back().setZero();
-
-    v.emplace_back(
-                sizes[i].first,
-                sizes[i].second
-            );
-    v.back().setZero();
-  }*/
-  //exit(1);
- /* int cnt = 0;
-  for(int i = 0; i < I0.size(); ++i)
-  {
-    std::string name = "output/t";
-    name+= boost::lexical_cast<std::string>(cnt++);
-    name+=".png";
-    std::cout << name << std::endl;
-    I0[i].write_image(name);
-  }*/
-  //exit(1);
-    std::cout <<"pyramid creation: " << time.stop() << " ms" <<  std::endl;
+   std::cout <<"pyramid creation: " << time.stop() << " ms" <<  std::endl;
 }
 
 void HornSchunckPyramidal::draw(int id)
@@ -529,13 +517,10 @@ void HornSchunckPyramidal::draw(CudaMat &UA, CudaMat &VA, int id)
     UA.copyToHost(ua);
     VA.copyToHost(va);
 
-    std::cout << "after copy" << std::endl;
-
 
     cv::Mat flo(h,w,CV_32FC2);
     uv_point2f_cv(ua,va,flo,w,h);
 
-    std::cout << "after copy" << std::endl;
 
     cv::Mat xy[2]; //X,Y
     cv::split(flo, xy);
@@ -544,15 +529,12 @@ void HornSchunckPyramidal::draw(CudaMat &UA, CudaMat &VA, int id)
     cv::Mat magnitude, angle;
     cv::cartToPolar(xy[0], xy[1], magnitude, angle, true);
 
-        std::cout << "after copy" << std::endl;
-
-    //translate magnitude to range [0;1]
+   //translate magnitude to range [0;1]
     double mag_max;
     cv::minMaxLoc(magnitude, 0, &mag_max);
     magnitude.convertTo(magnitude, -1, 1.0 / mag_max);
 
 
-        std::cout << "after copy" << std::endl;
     //build hsv image
     std::vector<cv::Mat> _hsv;
 
@@ -564,60 +546,15 @@ void HornSchunckPyramidal::draw(CudaMat &UA, CudaMat &VA, int id)
 
     //convert to BGR and show
     cv::Mat bgr;//CV_32FC3 matrix
-        std::cout << "after copy" << std::endl;
     cv::cvtColor(hsv, bgr, cv::COLOR_HSV2BGR);
     std::string name = "CUDA/";
     name+= boost::lexical_cast<std::string>(UA.width());
     name+=" "; name+=boost::lexical_cast<std::string>(id);
-        std::cout << "after copy" << std::endl;
     cv::imshow(name, bgr);
-        std::cout << "after copy" << std::endl;
     cv::waitKey(0);
 }
 
-float sor_iteration(
-    const float *Au, // constant part of the numerator of u
-    const float *Av, // constant part of the numerator of v
-    const float *Du, // denominator of u
-    const float *Dv, // denominator of v
-    const float *D,  // constant part of the numerator
-    float 	    *u,  // x component of the flow
-    float 	    *v,  // y component of the flow
-    const float  al, // alpha smoothness parameter
-    const int    p,  // current position
-    const int    p1, // up-left neighbor
-    const int    p2, // up-right neighbor
-    const int    p3, // bottom-left neighbor
-    const int    p4, // bottom-right neighbor
-    const int    p5, // up neighbor
-    const int    p6, // left neighbor
-    const int    p7, // bottom neighbor
-    const int    p8  // right neighbor
-)
-{
-    // set the SOR extrapolation parameter
-    const float w = 1.9;
-
-    // compute the divergence
-    const float ula = 1./12. * (u[p1] + u[p2] + u[p3] + u[p4]) +
-              1./6.  * (u[p5] + u[p6] + u[p7] + u[p8]);
-    const float vla = 1./12. * (v[p1] + v[p2] + v[p3] + v[p4]) +
-              1./6.  * (v[p5] + v[p6] + v[p7] + v[p8]);
-
-    // store the previous values
-    const float uk = u[p];
-    const float vk = v[p];
-
-    // update the flow
-    u[p] = (1.0 - w) * uk + w * (Au[p] - D[p] * v[p] + al * ula) / Du[p];
-
-    v[p] = (1.0 - w) * vk + w * (Av[p] - D[p] * u[p] + al * vla) / Dv[p];
-
-    // return the convergence error
-    return (u[p] - uk) * (u[p] - uk) + (v[p] - vk) * (v[p] - vk);
-}
-
-float sorCPUOwn(CudaMat &Au,CudaMat &Av,
+inline float sorCPU(CudaMat &Au,CudaMat &Av,
             CudaMat &Du,CudaMat &Dv,
             CudaMat &D,
             CudaMat &u,
@@ -674,7 +611,7 @@ float sorCPUOwn(CudaMat &Au,CudaMat &Av,
     return sqrt(error / float(w*h));
 }
 
-float sorGPU(CudaMat &Au,CudaMat &Av,
+inline float sorGPU(CudaMat &Au,CudaMat &Av,
             CudaMat &Du,CudaMat &Dv,
             CudaMat &D,
             CudaMat &u,
@@ -690,7 +627,6 @@ float sorGPU(CudaMat &Au,CudaMat &Av,
     thrust::device_vector<float> total_error(w*h,0);
     const float omega = 0.5;
     if(rb){
-        std::cout << "red" << std::endl;
         sorRED<<<blocks,threadsperblock>>>(
                         Au.getRawPtr(),
                         Av.getRawPtr(),
@@ -717,7 +653,6 @@ float sorGPU(CudaMat &Au,CudaMat &Av,
                         w,h
                     );
     } else {
-        std::cout << "black" << std::endl;
         sorBLACK<<<blocks,threadsperblock>>>(
                         Au.getRawPtr(),
                         Av.getRawPtr(),
@@ -752,124 +687,6 @@ float sorGPU(CudaMat &Au,CudaMat &Av,
     return sqrt(error / float(w*h) );
 }
 
-float sorCPU(CudaMat &Au,CudaMat &Av,
-            CudaMat &Du,CudaMat &Dv,
-            CudaMat &D,
-            CudaMat &u,
-            CudaMat &v,
-            const float alpha)
-{
-    int w = u.width();
-    int h = v.height();
-
-    std::vector<float> Auc(w*h);
-    std::vector<float> Avc(w*h);
-    std::vector<float> Duc(w*h);
-    std::vector<float> Dvc(w*h);
-    std::vector<float> Dc(w*h);
-    std::vector<float> uc(w*h);
-    std::vector<float> vc(w*h);
-
-    Au.copyToHost(Auc);
-    Av.copyToHost(Avc);
-    Du.copyToHost(Duc);
-    Dv.copyToHost(Dvc);
-    D.copyToHost(Dc);
-    u.copyToHost(uc);
-    v.copyToHost(vc);
-
-    float error = 0;
-
-    for(int i = 1; i < h-1; i++)
-    for(int j = 1; j < w-1; j++)
-    {
-        const int k = i * w + j;
-        error += sor_iteration(
-                Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-                k, k-w-1, k-w+1, k+w-1,
-                k+w+1, k-w, k-1, k+w, k+1
-                );
-    }
-
-    // process the first and last rows
-    for(int j = 1; j < w-1; j++)
-    {
-        // first row
-        int k = j;
-        error += sor_iteration(
-                Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-                k, k-1, k+1, k+w-1, k+w+1,
-                k, k-1, k+w, k+1
-                );
-
-        // last row
-        k = (h-1) * w + j;
-        error += sor_iteration(
-                Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-                k, k-w-1, k-w+1, k-1, k+1,
-                k-w, k-1, k, k+1
-                );
-    }
-
-    // process the first and last columns
-    for(int i = 1; i < h-1; i++)
-    {
-        // first column
-        int k = i * w;
-        error += sor_iteration(
-                Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-                k, k-w, k-w+1, k+w, k+w+1,
-                k-w, k, k+w, k+1
-                );
-
-        // last column
-        k = (i+1) * w - 1;
-        error += sor_iteration(
-                Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-                k, k-w-1, k-w, k+w-1, k+w,
-                k-w, k-1, k+w, k
-                );
-    }
-
-    // process the corners
-    // up-left corner
-    error += sor_iteration(
-            Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-            0, 0, 1, w, w+1,
-            0, 0, w, 1
-            );
-
-    // up-right corner
-    int k = w - 1;
-    error += sor_iteration(
-            Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-            k, k-1, k, k+w-1, k+w,
-            k, k-1, k+w, k
-            );
-
-    // bottom-left corner
-    k = (h-1) * w;
-    error += sor_iteration(
-            Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-            k, k-w, k-w+1,k, k+1,
-            k-w, k, k, k+1
-            );
-
-    // bottom-right corner
-    k = h * w - 1;
-    error += sor_iteration(
-            Auc.data(), Avc.data(), Duc.data(), Dvc.data(), Dc.data(), uc.data(), vc.data(), alpha * alpha,
-            k, k-1, k, k-w-1, k-w,
-            k-w, k-1, k, k
-            );
-
-    u = CudaMat(w,h,uc);
-    v = CudaMat(w,h,vc);
-
-    return sqrtf(error / (w*h) );
-}
-
-
 void HornSchunckPyramidal::HS(CudaMat &I1s,
         CudaMat &I2s,
         CudaMat &us,
@@ -900,29 +717,12 @@ void HornSchunckPyramidal::HS(CudaMat &I1s,
         w,h
         );
 
-    /*I2s.toFile("orig.out");
-    I2x.toFile("gradX.out");
-    I2y.toFile("gradY.out");
-
-    exit(1);*/
-    //Identical
-    std::cout << "compute: " << w <<"/" << h << std::endl;
-    //exit(1);
-
     for(size_t j = 0; j < N_warps; ++j)
     {
         I2s.warp(us,vs,1.0,true,I2_warped);
         I2x.warp(us,vs,1.0,true,I2x_warped);
         I2y.warp(us,vs,1.0,true,I2y_warped);
 
-
-       /* std::cout << w << "x" << h << std::endl;
-
-        I2_warped.toFile("I2warped.out");
-        I2x_warped.toFile("I2xwarped.out");
-        I2y_warped.toFile("I2ywarped.out");
-        exit(1);*/
-
         //calc constant stuff
         constants<<<blocks,threadsperblock>>>(
         I1s.getRawPtr(),
@@ -940,34 +740,17 @@ void HornSchunckPyramidal::HS(CudaMat &I1s,
         w,h
         );
 
-        /*Au.toFile("Au.out");
-        Av.toFile("Av.out");
-        Du.toFile("Du.out");
-        Dv.toFile("Dv.out");
-        D.toFile("D.out");
-        exit(1);*/
-
         size_t iter = 0;
         float tol = 0.0001;
         float error = tol+100;
         bool red = true;
         while(iter < N_iterations && error > tol){
             iter++;
-
-            //error = sorCPU(Au,Av,Du,Dv,D,us,vs,alpha);
-
             error = sorGPU(Au,Av,Du,Dv,D,us,vs,alpha,red);
-
-            //error = sorCPUOwn(Au,Av,Du,Dv,D,us,vs,alpha);
+            //error = sorCPU(Au,Av,Du,Dv,D,us,vs,alpha);
             red = !red;
-
-
-            std::cout << "Iterations " << iter  << " " << error << std::endl;
+            //std::cout << "Iterations " << iter  << " " << error << std::endl;
         }
-
-        //us.toFile("ucpu.out");
-        //vs.toFile("vcpu.out");
-        //exit(1);
     }
 }
 
@@ -978,12 +761,8 @@ void HornSchunckPyramidal::compute(const float alpha)
     u[N_scales-1].setZero();
     v[N_scales-1].setZero();
 
-    std::cout << N_scales << std::endl;
-
     for(int s = N_scales - 1; s >= 0; --s)
     {
-
-        std::cout << I0[s].width() << std::endl;
         assert( I0[s].width() == I1[s].width()
                 &&
                 I0[s].width() == u[s].width()
@@ -997,10 +776,7 @@ void HornSchunckPyramidal::compute(const float alpha)
                 I0[s].height() == v[s].height()
                     );
 
-       // std::cout << sizes[s].first << "x" << sizes[s].second << std::endl;
-       // draw(u[s],v[s],0);
         HS(I0[s],I1[s],u[s],v[s],alpha);
-       // draw(u[s],v[s],0);
 
         if(s > 0)
         {
@@ -1011,12 +787,6 @@ void HornSchunckPyramidal::compute(const float alpha)
             v[s].upscale(v[s-1],ws,hs);
             u[s-1]*=(1.0/nue);
             v[s-1]*=(1.0/nue);
-
-            //upscale cpu
-            //upscaleCPU(u[s],u[s-1],nue);
-            //upscaleCPU(v[s],v[s-1],nue);
-
-
         }
     }
     std::cout << "pyramid computation: " << time.stop() << " ms" <<  std::endl;