2013-03-04 83 views
4

我想在CUDA C++代码上运行矢量加法函数,但对于大小为5,000,000的大型浮点数组,它的运行速度比我的CPU版本慢。下面是我说的相关CUDA和CPU代码:在比vectorStepAdd3计算慢在cuda上加载矢量的步骤

#define THREADS_PER_BLOCK 1024 
typedef float real; 
__global__ void vectorStepAddKernel2(real*x, real*y, real*z, real alpha, real beta, int size, int xstep, int ystep, int zstep) 
{ 
    int i = blockDim.x * blockIdx.x + threadIdx.x; 
    if (i < size) 
    { 
     x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep]; 
    } 
} 

cudaError_t vectorStepAdd2(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep) 
{ 

    cudaError_t cudaStatus; 
    int threadsPerBlock = THREADS_PER_BLOCK; 
    int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock; 
    vectorStepAddKernel2<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size, xstep, ystep, zstep); 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching vectorStepAddKernel!\n", cudaStatus); 
     exit(1); 
    } 

    return cudaStatus; 
} 

//CPU function: 

void vectorStepAdd3(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep) 
{ 
    for(int i=0;i<size;i++) 
    { 
     x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep]; 
    } 
} 

调用vectorStepAdd2结果时各3个数组的是大小5000000和大小= 50000(即,50000个元素添加一起以这种逐步的方式)。

关于我能做些什么来加速GPU代码的任何想法? 我的设备是特斯拉M2090 GPU

感谢

+3

分段访问不适合GPU的内存子系统,它更喜欢连续访问。如果步幅很小(例如<10个元素)并且向量很长,通过纹理访问只读阵列可能会有所帮助,值得一试。如果您正在构建sm_35平台,则对函数原型的简单更改可能会让您的代码通过LDG指令自动使用纹理路径:vectorStepAddKernel2(real * __restrict__ x,const real * __restrict__ y,const real * __restrict__ z,...)' – njuffa 2013-03-04 05:14:22

+0

你正在使用的xstep,ystep和ystep的值是多少? – talonmies 2013-03-04 07:56:49

+0

@talonmies - 我正在使用的xstep,ystep,zstep的值是4,5,7 resp ...但是,它们作为arg动态传递给函数(如您所见),并且可以是任何类似的东西 – assassin 2013-03-05 04:01:51

回答

5

在回答你的问题“什么我可以做,以加快GPU代码的任何想法?”

首先,让我先说一下这样的陈述,即所提出的操作X = alpha * Y + beta * Z没有每个字节所需的数据传输的大量计算强度。因此,我无法在这个特定的代码上击败CPU时间。然而,它可能是有益的覆盖2个思路,加快验证码:

  1. 使用page-locked内存进行数据传输操作。这为GPU版本的数据传输时间减少了约2倍,GPU版本占据了整个GPU执行时间的主导地位。

  2. 按照@njuffa here的建议,使用cudaMemcpy2D的跨步复制技术。结果是2倍:我们可以将数据传输量减少到计算所需的数据量,即,然后我们可以重新编写内核来对注释中建议的数据进行连续操作(同样由njuffa)。这使得数据传输时间进一步提高了3倍,内核计算时间提高了10倍。

此代码提供了这些操作的示例:

#include <stdio.h> 
#include <stdlib.h> 


#define THREADS_PER_BLOCK 1024 
#define DSIZE 5000000 
#define WSIZE 50000 
#define XSTEP 47 
#define YSTEP 43 
#define ZSTEP 41 
#define TOL 0.00001f 


#define cudaCheckErrors(msg) \ 
    do { \ 
     cudaError_t __err = cudaGetLastError(); \ 
     if (__err != cudaSuccess) { \ 
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 
       msg, cudaGetErrorString(__err), \ 
       __FILE__, __LINE__); \ 
      fprintf(stderr, "*** FAILED - ABORTING\n"); \ 
      exit(1); \ 
     } \ 
    } while (0) 

typedef float real; 

__global__ void vectorStepAddKernel2(real *x, real *y, real *z, real alpha, real beta, int size, int xstep, int ystep, int zstep) 
{ 
    int i = blockDim.x * blockIdx.x + threadIdx.x; 
    if (i < size) 
    { 
     x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep]; 
    } 
} 

__global__ void vectorStepAddKernel2i(real *x, real *y, real *z, real alpha, real beta, int size) 
{ 
    int i = blockDim.x * blockIdx.x + threadIdx.x; 
    if (i < size) 
    { 
     x[i] = alpha* y[i] + beta*z[i]; 
    } 
} 

void vectorStepAdd2(real *x, real *y, real *z, real alpha, real beta, int size, int xstep, int ystep, int zstep) 
{ 

    int threadsPerBlock = THREADS_PER_BLOCK; 
    int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock; 
    vectorStepAddKernel2<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size, xstep, ystep, zstep); 
    cudaDeviceSynchronize(); 
    cudaCheckErrors("kernel2 fail"); 
} 


void vectorStepAdd2i(real *x, real *y, real *z, real alpha, real beta, int size) 
{ 

    int threadsPerBlock = THREADS_PER_BLOCK; 
    int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock; 
    vectorStepAddKernel2i<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size); 
    cudaDeviceSynchronize(); 
    cudaCheckErrors("kernel3 fail"); 
} 

//CPU function: 

void vectorStepAdd3(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep) 
{ 
    for(int i=0;i<size;i++) 
    { 
     x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep]; 
    } 
} 

int main() { 

    real *h_x, *h_y, *h_z, *c_x, *h_x1; 
    real *d_x, *d_y, *d_z, *d_x1, *d_y1, *d_z1; 

    int dsize = DSIZE; 
    int wsize = WSIZE; 
    int xstep = XSTEP; 
    int ystep = YSTEP; 
    int zstep = ZSTEP; 
    real alpha = 0.5f; 
    real beta = 0.5f; 
    float et; 

/* 
    h_x = (real *)malloc(dsize*sizeof(real)); 
    if (h_x == 0){printf("malloc1 fail\n"); return 1;} 
    h_y = (real *)malloc(dsize*sizeof(real)); 
    if (h_y == 0){printf("malloc2 fail\n"); return 1;} 
    h_z = (real *)malloc(dsize*sizeof(real)); 
    if (h_z == 0){printf("malloc3 fail\n"); return 1;} 
    c_x = (real *)malloc(dsize*sizeof(real)); 
    if (c_x == 0){printf("malloc4 fail\n"); return 1;} 
    h_x1 = (real *)malloc(dsize*sizeof(real)); 
    if (h_x1 == 0){printf("malloc1 fail\n"); return 1;} 
*/ 

    cudaHostAlloc((void **)&h_x, dsize*sizeof(real), cudaHostAllocDefault); 
    cudaCheckErrors("cuda Host Alloc 1 fail"); 
    cudaHostAlloc((void **)&h_y, dsize*sizeof(real), cudaHostAllocDefault); 
    cudaCheckErrors("cuda Host Alloc 2 fail"); 
    cudaHostAlloc((void **)&h_z, dsize*sizeof(real), cudaHostAllocDefault); 
    cudaCheckErrors("cuda Host Alloc 3 fail"); 
    cudaHostAlloc((void **)&c_x, dsize*sizeof(real), cudaHostAllocDefault); 
    cudaCheckErrors("cuda Host Alloc 4 fail"); 
    cudaHostAlloc((void **)&h_x1, dsize*sizeof(real), cudaHostAllocDefault); 
    cudaCheckErrors("cuda Host Alloc 5 fail"); 


    cudaMalloc((void **)&d_x, dsize*sizeof(real)); 
    cudaCheckErrors("cuda malloc1 fail"); 
    cudaMalloc((void **)&d_y, dsize*sizeof(real)); 
    cudaCheckErrors("cuda malloc2 fail"); 
    cudaMalloc((void **)&d_z, dsize*sizeof(real)); 
    cudaCheckErrors("cuda malloc3 fail"); 
    cudaMalloc((void **)&d_x1, wsize*sizeof(real)); 
    cudaCheckErrors("cuda malloc4 fail"); 
    cudaMalloc((void **)&d_y1, wsize*sizeof(real)); 
    cudaCheckErrors("cuda malloc5 fail"); 
    cudaMalloc((void **)&d_z1, wsize*sizeof(real)); 
    cudaCheckErrors("cuda malloc6 fail"); 

    for (int i=0; i< dsize; i++){ 
    h_x[i] = 0.0f; 
    h_x1[i] = 0.0f; 
    c_x[i] = 0.0f; 
    h_y[i] = (real)(rand()/(real)RAND_MAX); 
    h_z[i] = (real)(rand()/(real)RAND_MAX); 
    } 


    cudaEvent_t t_start, t_stop, k_start, k_stop; 
    cudaEventCreate(&t_start); 
    cudaEventCreate(&t_stop); 
    cudaEventCreate(&k_start); 
    cudaEventCreate(&k_stop); 
    cudaCheckErrors("event fail"); 

    // first test original GPU version 

    cudaEventRecord(t_start); 
    cudaMemcpy(d_x, h_x, dsize * sizeof(real), cudaMemcpyHostToDevice); 
    cudaCheckErrors("cuda memcpy 1 fail"); 
    cudaMemcpy(d_y, h_y, dsize * sizeof(real), cudaMemcpyHostToDevice); 
    cudaCheckErrors("cuda memcpy 2 fail"); 
    cudaMemcpy(d_z, h_z, dsize * sizeof(real), cudaMemcpyHostToDevice); 
    cudaCheckErrors("cuda memcpy 3 fail"); 


    cudaEventRecord(k_start); 
    vectorStepAdd2(d_x, d_y, d_z, alpha, beta, wsize, xstep, ystep, zstep); 
    cudaEventRecord(k_stop); 

    cudaMemcpy(h_x, d_x, dsize * sizeof(real), cudaMemcpyDeviceToHost); 
    cudaCheckErrors("cuda memcpy 4 fail"); 
    cudaEventRecord(t_stop); 
    cudaEventSynchronize(t_stop); 
    cudaEventElapsedTime(&et, t_start, t_stop); 
    printf("GPU original version total elapsed time is: %f ms.\n", et); 
    cudaEventElapsedTime(&et, k_start, k_stop); 
    printf("GPU original kernel elapsed time is: %f ms.\n", et); 

    //now test CPU version 

    cudaEventRecord(t_start); 
    vectorStepAdd3(c_x, h_y, h_z, alpha, beta, wsize, xstep, ystep, zstep); 
    cudaEventRecord(t_stop); 
    cudaEventSynchronize(t_stop); 
    cudaEventElapsedTime(&et, t_start, t_stop); 
    printf("CPU version total elapsed time is: %f ms.\n", et); 
    for (int i = 0; i< dsize; i++) 
    if (fabsf((float)(h_x[i]-c_x[i])) > TOL) { 
     printf("cpu/gpu results mismatch at i = %d, cpu = %f, gpu = %f\n", i, c_x[i], h_x[i]); 
     return 1; 
     } 


    // now test improved GPU version 

    cudaEventRecord(t_start); 
// cudaMemcpy2D(d_x1, sizeof(real), h_x, xstep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice); 
// cudaCheckErrors("cuda memcpy 5 fail"); 
    cudaMemcpy2D(d_y1, sizeof(real), h_y, ystep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice); 
    cudaCheckErrors("cuda memcpy 6 fail"); 
    cudaMemcpy2D(d_z1, sizeof(real), h_z, zstep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice); 
    cudaCheckErrors("cuda memcpy 7 fail"); 

    cudaEventRecord(k_start); 
    vectorStepAdd2i(d_x1, d_y1, d_z1, alpha, beta, wsize); 
    cudaEventRecord(k_stop); 

    cudaMemcpy2D(h_x1, xstep*sizeof(real), d_x1, sizeof(real), sizeof(real), wsize, cudaMemcpyDeviceToHost); 
    cudaCheckErrors("cuda memcpy 8 fail"); 
    cudaEventRecord(t_stop); 
    cudaEventSynchronize(t_stop); 
    cudaEventElapsedTime(&et, t_start, t_stop); 
    printf("GPU improved version total elapsed time is: %f ms.\n", et); 
    cudaEventElapsedTime(&et, k_start, k_stop); 
    printf("GPU improved kernel elapsed time is: %f ms.\n", et); 

    for (int i = 0; i< dsize; i++) 
    if (fabsf((float)(h_x[i]-h_x1[i])) > TOL) { 
     printf("gpu/gpu improved results mismatch at i = %d, gpu = %f, gpu imp = %f\n", i, h_x[i], h_x1[i]); 
     return 1; 
     } 

    printf("Results:i CPU  GPU  GPUi \n"); 
    for (int i = 0; i< 20*xstep; i+=xstep) 
    printf(" %d   %f  %f  %f %f %f\n",i, c_x[i], h_x[i], h_x1[i]); 


    return 0; 
} 

如前所述,我还是没能击败的CPU时间,而我认为这要么我自己缺乏编码技能或否则这个操作基本上没有足够的计算复杂度在GPU上感兴趣。不过这里有一些样品的结果:

GPU original version total elapsed time is: 13.352256 ms. 
GPU original kernel elapsed time is: 0.195808 ms. 
CPU version total elapsed time is: 2.599584 ms. 
GPU improved version total elapsed time is: 4.228288 ms. 
GPU improved kernel elapsed time is: 0.027392 ms. 
Results:i CPU  GPU  GPUi 
    0   0.617285  0.617285  0.617285 
    47   0.554522  0.554522  0.554522 
    94   0.104245  0.104245  0.104245 
.... 

我们可以看到,改进后的内核具有约3倍的整体减少相比原来的内核,几乎全部是由于数据的减少拷贝时间。数据复制时间的缩短是由于改进后的2D memcpy,我们只需要复制实际使用的数据。 (没有页面锁定的内存,这些数据传输时间大约是其两倍)。我们还可以看到内核计算时间比原始内核的CPU计算速度快10倍,比改进内核的CPU计算快100倍。不过,考虑到数据传输时间,我们无法克服CPU的速度。

最后一点评论是,cudaMemcpy2D操作的“成本”仍然很高。为了减少100倍的矢量大小,我们只能看到复制时间缩短3倍。因此,跨越式访问仍然会导致使用GPU的相对昂贵的方式。如果我们只是传输50,000个连续元素的向量,我们预计复制时间几乎会线性减少100倍(与5000000个元素的原始复制向量相比)。这意味着复制的时间将少于1毫秒,我们的GPU版本将比CPU更快,至少这个天真的单线程CPU代码。

+0

谢谢!我实际上观察到,在我的机器上,您的改进内核的内核运行时比CPU func调用时间慢。我知道这可能是由于各种原因造成的......但是,正如你所指出的那样,在GPU上解决这个问题似乎并不具有足够的挑战性(或者现代CPU可能非常快) – assassin 2013-03-05 21:58:41