Optimizing the solution of the 2D diffusion (heat) equation in CUDA

2012-08-17 · 83 views

I have already checked previous SO questions related to this problem, but I cannot see how they apply here.

I am solving the 2D diffusion equation with CUDA, and it turns out that my GPU code is slower than the CPU code.

Here is my code:

#include <iostream> 
#include <cstdlib> 
using namespace std; 

//kernel definition 
__global__ void diffusionSolver(double* A, int n_x,int n_y) 
{ 

int i = blockIdx.x * blockDim.x + threadIdx.x; 
int j = blockIdx.y * blockDim.y + threadIdx.y; 


if(i<n_x && j <n_y && i*(n_x-i-1)*j*(n_y-j-1)!=0) 
    A[i+n_y*j] = A[i+n_y*j] + (A[i-1+n_y*j]+A[i+1+n_y*j]+A[i+(j-1)*n_y]+A[i+(j+1)*n_y] -4.0*A[i+n_y*j])/40.0; 


} 

and the main function:

int main() 
     { 
     int n_x = 200 ; 
     int n_y = 200 ;  
     double *phi; 
     double *dummy; 
     double *phi_old;    
     int i,j ; 

     phi = (double *) malloc(n_x*n_y* sizeof(double)); 
     phi_old = (double *) malloc(n_x*n_y* sizeof(double)); 
     dummy = (double *) malloc(n_x*n_y* sizeof(double)); 
     int iterationMax =200;      
     for(j=0;j<n_y ;j++) 
     { 
      for(i=0;i<n_x;i++) 
      { 
      if((.4*n_x-i)*(.6*n_x-i)<0) 
      phi[i+n_y*j] = -1; 

      else 
      phi[i+n_y*j] = 1; 
      } 

     } 

     double *dev_phi; 
     cudaMalloc((void **) &dev_phi, n_x*n_y*sizeof(double)); 
     cudaMemcpy(dev_phi, phi, n_x*n_y*sizeof(double), 
     cudaMemcpyHostToDevice); 

     dim3 threadsPerBlock(10,100); 
     dim3 numBlocks(n_x*n_y/threadsPerBlock.x, n_x*n_y/threadsPerBlock.y); 

     for(int z=0; z<iterationMax; z++) 
     { 
     if(z%100==0) 
     cout << z/100 << "\n"; 
     diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi, n_x,n_y); 
     } 
    cudaMemcpy(phi, dev_phi,n_x*n_y*sizeof(double), cudaMemcpyDeviceToHost); 
cudaFree(dev_phi); 
    return 0; 
    } 

The problem with this code is that it runs slower than a simple CPU-only iterative version. I don't know much about profilers; so far I have only tried cuda-memcheck, which reports 0 errors. How can I find out which part of the code runs slowly and speed it up? I am working in a Linux environment. Thanks in advance for your help.
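
A minimal sketch of timing just the kernel loop with CUDA events, independent of any profiler (wrapped around the existing launch loop; the label in the printout is illustrative):

    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

    cudaEventRecord(start); 
    for (int z = 0; z < iterationMax; z++) 
        diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi, n_x, n_y); 
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 

    float elapsed_ms = 0.0f; 
    cudaEventElapsedTime(&elapsed_ms, start, stop); 
    cout << "kernel loop: " << elapsed_ms << " ms\n"; 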


I am going to focus on correctness before speed here: you have changed the kernel compared to the last two times we have seen this same code, and now you have a big correctness problem: the in-place update of the array A introduces a read-after-write memory race. Also, your grid size calculation looks completely wrong. If I am not mistaken, you are launching 40000 times too many blocks! – talonmies 2012-08-17 13:57:35


@talonmies, I checked the code against a standard result and it works fine. I updated the array A in place because I thought it would be slower if I used an extra array to store the old values. I am not sure about the grid size; it does look wrong. Should it be dim3 numBlocks(n_x/threadsPerBlock.x, n_y/threadsPerBlock.y);? – chatur 2012-08-17 14:15:19


@talonmies, the computation was slow because, as you mentioned, many of the launched blocks were unnecessary (40000 times too many in this case). If you post that as an answer, I can mark it as correct. – chatur 2012-08-17 14:28:08

Answers

Answer (score 4)

The worst problem I can see is that you are launching far too many blocks for the size of the input array. At the moment you are computing the grid size as:

dim3 numBlocks(n_x*n_y/threadsPerBlock.x, n_x*n_y/threadsPerBlock.y); 

which should yield a grid size of (4000, 400) blocks for your 200 x 200 input array. That is obviously incorrect. The calculation should be something like:

int nbx = (n_x/threadsPerBlock.x) + (((n_x % threadsPerBlock.x) == 0) ? 0 : 1); 
int nby = (n_y/threadsPerBlock.y) + (((n_y % threadsPerBlock.y) == 0) ? 0 : 1); 
dim3 numBlocks(nbx,nby); 

This would yield a grid size of (20, 2) blocks, or 40000 times fewer than you are currently launching.
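
As a side note, the same rounded-up division is often written as a one-line helper (the iDivUp used in the longer answer further down is this same idiom); a minimal sketch:

    // Round-up integer division: how many blocks of size b are needed to cover a elements. 
    int iDivUp(int a, int b) { return (a + b - 1)/b; } 

    dim3 numBlocks(iDivUp(n_x, threadsPerBlock.x), iDivUp(n_y, threadsPerBlock.y)); // (20, 2) here 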

There are other optimizations you could consider for the kernel, but they are trivial compared to an error of this magnitude.
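
As for the read-after-write race mentioned in the comments, a minimal sketch of the usual fix is to ping-pong between two device buffers, assuming the kernel is changed to read from one array and write to the other (the buffer names below are illustrative, not from the question):

    // Sketch: double-buffered (Jacobi-style) iteration avoids updating the array in place. 
    // dev_phi_old and dev_phi_new are two device arrays of n_x*n_y doubles. 
    for (int z = 0; z < iterationMax; z += 2) 
    { 
        diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi_old, dev_phi_new, n_x, n_y); // read old, write new 
        diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi_new, dev_phi_old, n_x, n_y); // read new, write old 
    } 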

Answer (score 1)

You are doing a lot of integer multiplication, and you have a lot of global memory reads; both are slow in CUDA. I also imagine that not many of the global memory reads are coalesced.

The only way to speed up your kernel is to stage coalesced reads through shared memory and/or rearrange your data so that it can be indexed without so many integer multiplications.
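
A minimal sketch of one small step in that direction (not from this answer): compute the row offset once per thread so the stride multiply is not repeated for every neighbour, keeping the question's A and n_y stride:

    // Sketch: hoist the row offset; the neighbours are then addressed with additions only. 
    int row = n_y * j;                 // single integer multiply per thread 
    double c = A[row + i]; 
    double w = A[row + i - 1]; 
    double e = A[row + i + 1]; 
    double s = A[row - n_y + i];       // row j-1 
    double n = A[row + n_y + i];       // row j+1 
    A[row + i] = c + (w + e + s + n - 4.0*c)/40.0;   // in-place update kept as in the question 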

I don't have a great grasp of the diffusion equation, but I don't think there is a lot of naive parallelism to be exploited here. Take a look at the CUDA Programming Guide and the Best Practices Guide; maybe you will get some ideas about how to improve your algorithm.


Thanks for the reply maxywb. I don't know anything about coalesced memory; maybe I need to read the Best Practices guide. – chatur 2012-08-17 14:17:04

Answer (score 1)

In case anyone is interested, I'm posting below a complete code on optimized approaches to the solution of the 2D heat equation.

Five approaches are considered, using:

  1. global memory, which is essentially the OP's approach;
  2. shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y, not loading the halo regions;
  3. shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y, loading the halo regions;
  4. shared memory of size (BLOCK_SIZE_X + 2) x (BLOCK_SIZE_Y + 2), loading the halo regions;
  5. texture memory.

Anyone can run the code and check which approach is faster for their own GPU architecture.

#include <iostream> 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include "Utilities.cuh" 
#include "InputOutput.cuh" 
#include "TimingGPU.cuh" 

#define BLOCK_SIZE_X 16 
#define BLOCK_SIZE_Y 16 

#define DEBUG 

texture<float, 2, cudaReadModeElementType> tex_T; 
texture<float, 2, cudaReadModeElementType> tex_T_old; 

/***********************************/ 
/* JACOBI ITERATION FUNCTION - GPU */ 
/***********************************/ 
__global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY) 
{ 
    const int i = blockIdx.x * blockDim.x + threadIdx.x ; 
    const int j = blockIdx.y * blockDim.y + threadIdx.y ; 

           //       N 
    int P = i + j*NX;   // node (i,j)    | 
    int N = i + (j+1)*NX;  // node (i,j+1)   | 
    int S = i + (j-1)*NX;  // node (i,j-1)  W ---- P ---- E 
    int E = (i+1) + j*NX;  // node (i+1,j)   | 
    int W = (i-1) + j*NX;  // node (i-1,j)   | 
           //       S 

    // --- Only update "interior" (not boundary) node points 
    if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]); 
} 

/******************************************************/ 
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V1 */ 
/******************************************************/ 
__global__ void Jacobi_Iterator_GPU_shared_v1(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY) 
{ 
    const int i = blockIdx.x * blockDim.x + threadIdx.x ; 
    const int j = blockIdx.y * blockDim.y + threadIdx.y ; 

           //       N 
    int P = i + j*NX;   // node (i,j)    | 
    int N = i + (j+1)*NX;  // node (i,j+1)   | 
    int S = i + (j-1)*NX;  // node (i,j-1)  W ---- P ---- E 
    int E = (i+1) + j*NX;  // node (i+1,j)   | 
    int W = (i-1) + j*NX;  // node (i-1,j)   | 
           //       S 
    __shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y]; 

    // --- Load data to shared memory. Halo regions are NOT loaded. 
    T_sh[threadIdx.x][threadIdx.y] = T_old[P]; 
    __syncthreads(); 

    if ((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y - 1))) 
     // --- If we do not need halo region elements, then use shared memory. 
     T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]); 
    else if (i>0 && i<NX-1 && j>0 && j<NY-1) // --- Only update "interior" (not boundary) node points 
     // --- If we need halo region elements, then use global memory. 
     T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]); 

} 

/******************************************************/ 
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V2 */ 
/******************************************************/ 
__global__ void Jacobi_Iterator_GPU_shared_v2(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY) 
{ 
    const int i = blockIdx.x * (BLOCK_SIZE_X - 2) + threadIdx.x ; 
    const int j = blockIdx.y * (BLOCK_SIZE_Y - 2) + threadIdx.y ; 

    int P = i + j*NX;   

    if ((i >= NX) || (j >= NY)) return; 

    __shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y]; 

    // --- Load data to shared memory. Halo regions ARE loaded. 
    T_sh[threadIdx.x][threadIdx.y] = T_old[P]; 
    __syncthreads(); 

    if (((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y - 1))) && 
     (i>0 && i<NX-1 && j>0 && j<NY-1)) 
     T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]); 

} 

/******************************************************/ 
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V3 */ 
/******************************************************/ 
__global__ void Jacobi_Iterator_GPU_shared_v3(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY) 
{ 
    const int i = blockIdx.x * blockDim.x + threadIdx.x ; 
    const int j = blockIdx.y * blockDim.y + threadIdx.y ; 

    const int tid_block = threadIdx.y * BLOCK_SIZE_X + threadIdx.x;  // --- Flattened thread index within a block 

    const int i1  = tid_block % (BLOCK_SIZE_X + 2); 
    const int j1  = tid_block/(BLOCK_SIZE_Y + 2); 

    const int i2  = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block) % (BLOCK_SIZE_X + 2); 
    const int j2  = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block)/(BLOCK_SIZE_Y + 2); 

    int P = i + j * NX;   

    if ((i >= NX) || (j >= NY)) return; 

    __shared__ float T_sh[BLOCK_SIZE_X + 2][BLOCK_SIZE_Y + 2]; 

    if (((blockIdx.x * BLOCK_SIZE_X - 1 + i1) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j1) < NY)) 
     T_sh[i1][j1] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i1) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j1) * NX]; 

    if (((i2 < (BLOCK_SIZE_X + 2)) && (j2 < (BLOCK_SIZE_Y + 2))) && (((blockIdx.x * BLOCK_SIZE_X - 1 + i2) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j2) < NY))) 
     T_sh[i2][j2] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i2) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j2) * NX]; 

    __syncthreads(); 

    if ((threadIdx.x <= (BLOCK_SIZE_X - 1) && (threadIdx.y <= (BLOCK_SIZE_Y - 1))) && (i>0 && i<NX-1 && j>0 && j<NY-1)) 
     T_new[P] = 0.25 * (T_sh[threadIdx.x + 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y + 2] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x + 2][threadIdx.y + 1]); 

} 

/*********************************************/ 
/* JACOBI ITERATION FUNCTION - GPU - TEXTURE */ 
/*********************************************/ 
__global__ void Jacobi_Iterator_GPU_texture(float * __restrict__ T_new, const bool flag, const int NX, const int NY) { 

    const int i = blockIdx.x * blockDim.x + threadIdx.x ; 
    const int j = blockIdx.y * blockDim.y + threadIdx.y ; 

    float P, N, S, E, W;  
    if (flag) { 
              //       N 
     P = tex2D(tex_T_old, i,  j);  // node (i,j)    | 
     N = tex2D(tex_T_old, i,  j + 1); // node (i,j+1)   | 
     S = tex2D(tex_T_old, i,  j - 1); // node (i,j-1)  W ---- P ---- E 
     E = tex2D(tex_T_old, i + 1, j);  // node (i+1,j)   | 
     W = tex2D(tex_T_old, i - 1, j);  // node (i-1,j)   | 
              //       S 
    } else { 
              //       N 
     P = tex2D(tex_T,  i,  j);  // node (i,j)    | 
     N = tex2D(tex_T,  i,  j + 1); // node (i,j+1)   | 
     S = tex2D(tex_T,  i,  j - 1); // node (i,j-1)  W ---- P ---- E 
     E = tex2D(tex_T,  i + 1, j);  // node (i+1,j)   | 
     W = tex2D(tex_T,  i - 1, j);  // node (i-1,j)   | 
              //       S 
    } 

    // --- Only update "interior" (not boundary) node points 
    if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[i + j*NX] = 0.25 * (E + W + N + S); 
} 

/***********************************/ 
/* JACOBI ITERATION FUNCTION - CPU */ 
/***********************************/ 
void Jacobi_Iterator_CPU(float * __restrict T, float * __restrict T_new, const int NX, const int NY, const int MAX_ITER) 
{ 
    for(int iter=0; iter<MAX_ITER; iter=iter+2) 
    { 
     // --- Only update "interior" (not boundary) node points 
     for(int j=1; j<NY-1; j++) 
      for(int i=1; i<NX-1; i++) { 
       float T_E = T[(i+1) + NX*j]; 
       float T_W = T[(i-1) + NX*j]; 
       float T_N = T[i + NX*(j+1)]; 
       float T_S = T[i + NX*(j-1)]; 
       T_new[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S); 
      } 

     for(int j=1; j<NY-1; j++) 
      for(int i=1; i<NX-1; i++) { 
       float T_E = T_new[(i+1) + NX*j]; 
       float T_W = T_new[(i-1) + NX*j]; 
       float T_N = T_new[i + NX*(j+1)]; 
       float T_S = T_new[i + NX*(j-1)]; 
       T[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S); 
      } 
    } 
} 

/******************************/ 
/* TEMPERATURE INITIALIZATION */ 
/******************************/ 
void Initialize(float * __restrict h_T, const int NX, const int NY) 
{ 
    // --- Set left wall to 1 
    for(int j=0; j<NY; j++) h_T[j * NX] = 1.0; 
} 


/********/ 
/* MAIN */ 
/********/ 
int main() 
{ 
    const int NX = 256;   // --- Number of discretization points along the x axis 
    const int NY = 256;   // --- Number of discretization points along the y axis 

    const int MAX_ITER = 100; // --- Number of Jacobi iterations 

    // --- CPU temperature distributions 
    float *h_T    = (float *)calloc(NX * NY, sizeof(float)); 
    float *h_T_old   = (float *)calloc(NX * NY, sizeof(float)); 
    Initialize(h_T,  NX, NY); 
    Initialize(h_T_old, NX, NY); 
    float *h_T_GPU_result  = (float *)malloc(NX * NY * sizeof(float)); 
    float *h_T_GPU_tex_result = (float *)malloc(NX * NY * sizeof(float)); 
    float *h_T_GPU_sh1_result = (float *)malloc(NX * NY * sizeof(float)); 
    float *h_T_GPU_sh2_result = (float *)malloc(NX * NY * sizeof(float)); 
    float *h_T_GPU_sh3_result = (float *)malloc(NX * NY * sizeof(float)); 

    // --- GPU temperature distribution 
    float *d_T;   gpuErrchk(cudaMalloc((void**)&d_T,   NX * NY * sizeof(float))); 
    float *d_T_old;  gpuErrchk(cudaMalloc((void**)&d_T_old,  NX * NY * sizeof(float))); 
    float *d_T_tex;  gpuErrchk(cudaMalloc((void**)&d_T_tex,  NX * NY * sizeof(float))); 
    float *d_T_old_tex; gpuErrchk(cudaMalloc((void**)&d_T_old_tex, NX * NY * sizeof(float))); 
    float *d_T_sh1;  gpuErrchk(cudaMalloc((void**)&d_T_sh1,  NX * NY * sizeof(float))); 
    float *d_T_old_sh1; gpuErrchk(cudaMalloc((void**)&d_T_old_sh1, NX * NY * sizeof(float))); 
    float *d_T_sh2;  gpuErrchk(cudaMalloc((void**)&d_T_sh2,  NX * NY * sizeof(float))); 
    float *d_T_old_sh2; gpuErrchk(cudaMalloc((void**)&d_T_old_sh2, NX * NY * sizeof(float))); 
    float *d_T_sh3;  gpuErrchk(cudaMalloc((void**)&d_T_sh3,  NX * NY * sizeof(float))); 
    float *d_T_old_sh3; gpuErrchk(cudaMalloc((void**)&d_T_old_sh3, NX * NY * sizeof(float))); 

    gpuErrchk(cudaMemcpy(d_T,   h_T,  NX * NY * sizeof(float), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_tex,  h_T,  NX * NY * sizeof(float), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_sh1,  h_T,  NX * NY * sizeof(float), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_sh2,  h_T,  NX * NY * sizeof(float), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_sh3,  h_T,  NX * NY * sizeof(float), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_old,  d_T,  NX * NY * sizeof(float), cudaMemcpyDeviceToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_old_tex, d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_old_sh1, d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_old_sh2, d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice)); 
    gpuErrchk(cudaMemcpy(d_T_old_sh3, d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice)); 

    //cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>(); 
    cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); 

    gpuErrchk(cudaBindTexture2D(NULL, &tex_T,  d_T_tex,  &desc, NX, NY, sizeof(float) * NX)); 
    gpuErrchk(cudaBindTexture2D(NULL, &tex_T_old, d_T_old_tex, &desc, NX, NY, sizeof(float) * NX)); 

    tex_T.addressMode[0] = cudaAddressModeWrap; 
    tex_T.addressMode[1] = cudaAddressModeWrap; 
    tex_T.filterMode = cudaFilterModePoint; 
    tex_T.normalized = false; 

    tex_T_old.addressMode[0] = cudaAddressModeWrap; 
    tex_T_old.addressMode[1] = cudaAddressModeWrap; 
    tex_T_old.filterMode = cudaFilterModePoint; 
    tex_T_old.normalized = false; 

    // --- Grid size 
    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y); 
    dim3 dimGrid (iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y)); 

    // --- Jacobi iterations on the host 
    Jacobi_Iterator_CPU(h_T, h_T_old, NX, NY, MAX_ITER); 

    // --- Jacobi iterations on the device 
    TimingGPU timerGPU; 
    timerGPU.StartCounter(); 
    for (int k=0; k<MAX_ITER; k=k+2) { 
     Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T,  d_T_old, NX, NY); // --- Update d_T_old  starting from data stored in d_T 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
     Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T_old, d_T , NX, NY); // --- Update d_T   starting from data stored in d_T_old 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif 
    } 
    printf("Timing = %f ms\n", timerGPU.GetCounter()); 

    // --- Jacobi iterations on the device - shared memory v1 
    timerGPU.StartCounter(); 
    for (int k=0; k<MAX_ITER; k=k+2) { 
     Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_sh1,  d_T_old_sh1, NX, NY); // --- Update d_T_old  starting from data stored in d_T 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
     Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_old_sh1, d_T_sh1 , NX, NY); // --- Update d_T   starting from data stored in d_T_old 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
    } 
    printf("Timing with shared memory v1 = %f ms\n", timerGPU.GetCounter()); 

    // --- Jacobi iterations on the device - shared memory v2 
    dim3 dimBlock2(BLOCK_SIZE_X, BLOCK_SIZE_Y); 
    dim3 dimGrid2 (iDivUp(NX, BLOCK_SIZE_X - 2), iDivUp(NY, BLOCK_SIZE_Y - 2)); 
    timerGPU.StartCounter(); 
    for (int k=0; k<MAX_ITER; k=k+2) { 
     Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_sh2,  d_T_old_sh2, NX, NY); // --- Update d_T_old  starting from data stored in d_T 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
     Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_old_sh2, d_T_sh2 , NX, NY); // --- Update d_T   starting from data stored in d_T_old 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
    } 
    printf("Timing with shared memory v2 = %f ms\n", timerGPU.GetCounter()); 

    // --- Jacobi iterations on the device - shared memory v3 
    timerGPU.StartCounter(); 
    for (int k=0; k<MAX_ITER; k=k+2) { 
     Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_sh3,  d_T_old_sh3, NX, NY); // --- Update d_T_old  starting from data stored in d_T 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
     Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_old_sh3, d_T_sh3 , NX, NY); // --- Update d_T   starting from data stored in d_T_old 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
    } 
    printf("Timing with shared memory v3 = %f ms\n", timerGPU.GetCounter()); 

    // --- Jacobi iterations on the device - texture case 
    timerGPU.StartCounter(); 
    for (int k=0; k<MAX_ITER; k=k+2) { 
     Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_old_tex, 0, NX, NY); // --- Update d_T_old_tex starting from data stored in d_T_tex 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
     Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_tex,  1, NX, NY); // --- Update d_T_tex  starting from data stored in d_T_old_tex 
#ifdef DEBUG 
     gpuErrchk(cudaPeekAtLastError()); 
     gpuErrchk(cudaDeviceSynchronize()); 
#endif   
    } 
    printf("Timing with texture = %f ms\n", timerGPU.GetCounter()); 

    saveCPUrealtxt(h_T,  "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\CPU_result.txt",  NX * NY); 
    saveGPUrealtxt(d_T_tex, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_tex.txt", NX * NY); 
    saveGPUrealtxt(d_T,  "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result.txt",  NX * NY); 
    saveGPUrealtxt(d_T_sh1, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh1.txt",  NX * NY); 
    saveGPUrealtxt(d_T_sh2, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh2.txt",  NX * NY); 
    saveGPUrealtxt(d_T_sh3, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh3.txt",  NX * NY); 

    // --- Copy results from device to host 
    gpuErrchk(cudaMemcpy(h_T_GPU_result,  d_T,  NX * NY * sizeof(float), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_T_GPU_tex_result, d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_T_GPU_sh1_result, d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_T_GPU_sh2_result, d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_T_GPU_sh3_result, d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToHost)); 

    // --- Calculate percentage root mean square error between host and device results 
    float sum = 0.f, sum_tex = 0.f, sum_ref = 0.f, sum_sh1 = 0.f, sum_sh2 = 0.f, sum_sh3 = 0.f; 
    for (int j=0; j<NY; j++) 
     for (int i=0; i<NX; i++) { 
      sum  = sum  + (h_T_GPU_result [j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_result [j * NX + i] - h_T[j * NX + i]); 
      sum_tex = sum_tex + (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]); 
      sum_sh1 = sum_sh1 + (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]); 
      sum_sh2 = sum_sh2 + (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]); 
      sum_sh3 = sum_sh3 + (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]); 
      sum_ref = sum_ref + h_T[j * NX + i]        * h_T[j * NX + i]; 
     } 
    printf("Percentage root mean square error   = %f\n", 100.*sqrt(sum /sum_ref)); 
    printf("Percentage root mean square error texture = %f\n", 100.*sqrt(sum_tex/sum_ref)); 
    printf("Percentage root mean square error shared v1 = %f\n", 100.*sqrt(sum_sh1/sum_ref)); 
    printf("Percentage root mean square error shared v2 = %f\n", 100.*sqrt(sum_sh2/sum_ref)); 
    printf("Percentage root mean square error shared v3 = %f\n", 100.*sqrt(sum_sh3/sum_ref)); 

    return 0; 
}