Improved reduce0 CUDA reduction code not working

I have written a CUDA code that basically sums an array for me. The array size N is supposed to be a power of 2, i.e. 2^x. However, my code does not work correctly. For example, if the expected output is 150177410, my code outputs 150177408. I have been trying to debug this for the past 5 hours. Any help would be appreciated. Here is the code:

//Only for array sizes of 2^x and TPB of 2^y, since godata holds one partial sum per block; the number of blocks is then itself a power of 2 if the previous conditions hold 
//Works for an arbitrarily sized array, provided the size is of the form 2^x 


#include <stdio.h> 
#include <stdlib.h> // for system() and exit() 

__global__ void computeAddShared(int *in , int *out, int sizeInput){ 
    //parameters are deliberately not named gidata and godata, to emphasize that the kernel receives copies of the addresses, distinct from the pointers in the host code 
    extern __shared__ float temp[]; 

    int tid = blockIdx.x * blockDim.x + threadIdx.x; 
    int ltid = threadIdx.x; 
    temp[ltid] = 0; 
    while(tid < sizeInput){ 
     temp[ltid] += in[tid]; 
     tid+=gridDim.x * blockDim.x; // to handle array of any size 
    } 
    __syncthreads(); 
    int offset = 1; 
    while(offset < blockDim.x){ 
     if(ltid % (offset * 2) == 0){ 
      temp[ltid] = temp[ltid] + temp[ltid + offset]; 
     } 
     __syncthreads(); 
     offset*=2; 
    } 
    if(ltid == 0){ 
     out[blockIdx.x] = temp[0]; 
    } 

} 

int main(){ 



    int N = 8192; //should be a power of 2 
    int size = N; 
    int *a = (int*)malloc(N * sizeof(int)); 
    /* To create the random-number input file: 
    FILE *f; 
     f = fopen("invertedList.txt" , "w"); 
     a[0] = 1 + (rand() % 8); 
     fprintf(f, "%d,",a[0]); 
     for(int i = 1 ; i< N; i++){ 
      a[i] = a[i-1] + (rand() % 8) + 1; 
      fprintf(f, "%d,",a[i]); 
     } 
     fclose(f); 
     return 0;*/ 
    FILE *f; 
    f = fopen("invertedList.txt","r"); 
    if(f == NULL){ 
      printf("File not found\n"); 
      system("pause"); 
      exit(1); 
    } 
    int count = 0 ; 
    long actualSum = 0; 
    for(int i =0 ; i < N ; i++){ 
     fscanf(f, "%d,", &a[count]); 
     actualSum+=a[count]; 
     count++; 
    } 
    fclose(f); 
    printf("The actual sum is %d\n",actualSum); 
    int* gidata; 
    int* godata; 
    cudaMalloc((void**)&gidata, N* sizeof(int)); 
    cudaMemcpy(gidata,a, size * sizeof(int), cudaMemcpyHostToDevice); 
    int TPB = 256; 
    int blocks = 10; //to get things kicked off 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 
    cudaEventRecord(start, 0); 
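    // Iterative reduction: each kernel launch produces one partial sum per 
    // block; relaunch on the partial sums until a single value remains. 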
    while(blocks != 1){ 
     if(size < TPB){ 
      TPB = size; // size is a power of 2 
     } 
     blocks = (size+ TPB -1)/TPB; 
     cudaMalloc((void**)&godata, blocks * sizeof(int)); 
     computeAddShared<<<blocks, TPB,TPB*sizeof(int)>>>(gidata, godata,size); 
     //cudaFree(gidata); 
     gidata = godata; 
     size = blocks; 
    } 
    //printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError())); 


    cudaEventRecord(stop, 0); 
    cudaEventSynchronize(stop); 
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop); 
    printf("time is %f ms", elapsedTime); 
    int *output = (int*)malloc(sizeof(int)); 
    cudaMemcpy(output, gidata,size * sizeof(int), cudaMemcpyDeviceToHost); 
    //Can't free either pointer earlier, as both point to the same location 
    cudaError_t chk = cudaFree(godata); 
    if(chk != cudaSuccess){ 
     printf("The first check also reported an error. Maybe an error in my logic\n"); 
    } 

    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError())); 
    printf("The sum of the array is %d\n", output[0]); 
    getchar(); 

    return 0; 
} 
What happens as you progressively shrink the input domain? The pattern of errors may help with the diagnosis. – 2013-03-02 17:07:08

Is it really necessary to post code that depends on an external file we have no access to, includes more than 10 lines of redundant commented-out code, and lines over 130 columns wide containing nothing but pointless comments? – talonmies 2013-03-02 17:17:52

I'm surprised it gives any answer at all! You are checking 'offset 2013-03-02 19:02:33

Answer


As talonmies remarked, the kernel itself is fine. Basically, it is the CUDA SDK's reduce0 kernel improved by Brent's optimization in the sense of algorithm cascading, the same technique exploited by the improved kernel reduce6 of the same SDK.
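
The essential difference is the loading phase; here is a minimal side-by-side sketch of the two strategies, both excerpted from the kernels below:

    // reduce0 (CUDA SDK): each thread loads exactly one element 
    sdata[tid] = (i < N) ? g_idata[i] : 0; 

    // reduce0_stackoverflow (OP, algorithm cascading): each thread first 
    // accumulates several elements with a grid-stride loop, so the same 
    // input can be covered with fewer blocks 
    sdata[tid] = 0; 
    while(i < N){ 
     sdata[tid] += g_idata[i]; 
     i += gridDim.x * blockDim.x; 
    } 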

That the kernel works correctly is shown by the following test code, which also compares the performance of reduce0 against the OP's kernel, named reduce0_stackoverflow here. The comments in reduce0_stackoverflow also report the corresponding lines of reduce0.

For the test case below, reduce0_stackoverflow executes in 0.030 ms, against 0.049 ms for reduce0, on a GeForce GT540M card.

Please note that the code below does not require the array size to be a power of 2.
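
For instance, with the sizes used in the main below (N = 15336, BLOCKSIZE = 256), the launch configuration works out as follows:

    // N >= BLOCKSIZE, so nextPow2 is not needed and NumThreads = 256 
    // NumBlocks = (15336 + 255) / 256 = 60 (ceiling division) 
    // smemSize = 256 * sizeof(int) = 1024 bytes, since NumThreads > 32 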

#include <stdio.h> 

#include <thrust/device_vector.h> 
#include <thrust/host_vector.h> 

#define BLOCKSIZE 256 

/********************/ 
/* CUDA ERROR CHECK */ 
/********************/ 
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     if (abort) { getchar(); exit(code); } 
    } 
} 

/*******************************************************/ 
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */ 
/*******************************************************/ 
unsigned int nextPow2(unsigned int x) 
{ 
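    // Bit-smearing: propagate the highest set bit of (x-1) into all lower 
    // positions, then add 1; returns x unchanged if x is already a power of 2. 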
    --x; 
    x |= x >> 1; 
    x |= x >> 2; 
    x |= x >> 4; 
    x |= x >> 8; 
    x |= x >> 16; 
    return ++x; 
} 

/*************************************/ 
/* CHECK IF A NUMBER IS A POWER OF 2 */ 
/*************************************/ 
bool isPow2(unsigned int x) 
{ 
    return ((x&(x-1))==0); 
} 

/******************/ 
/* REDUCE0 KERNEL */ 
/******************/ 
/* This reduction interleaves which threads are active by using the modulo 
    operator. This operator is very expensive on GPUs, and the interleaved 
    inactivity means that no whole warps are active, which is also very 
    inefficient */ 
template <class T> 
__global__ void reduce0(T *g_idata, T *g_odata, unsigned int N) 
{ 
    extern __shared__ T sdata[]; 

    unsigned int tid = threadIdx.x;        // Local thread index 
    unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index 

    // --- Loading data to shared memory 
    sdata[tid] = (i < N) ? g_idata[i] : 0; 

    // --- Before going further, we have to make sure that all the shared memory loads have been completed 
    __syncthreads(); 

    // --- Reduction in shared memory 
    for (unsigned int s=1; s < blockDim.x; s *= 2) 
    { 
     // --- Only the threads with index multiple of 2*s perform additions. Furthermore, modulo arithmetic is slow.  
     if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; } 
     // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed 
     __syncthreads(); 
    } 

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of 
    //  individual blocks 
    if (tid == 0) g_odata[blockIdx.x] = sdata[0]; 
} 

/***************************/ 
/* REDUCE0 - STACKOVERFLOW */ 
/***************************/ 
template <class T> 
__global__ void reduce0_stackoverflow(T *g_idata , T *g_odata, unsigned int N) { 

    extern __shared__ T sdata[]; 

    unsigned int tid = threadIdx.x;        // Local thread index 
    unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index 

    // --- Loading data to shared memory 
    //sdata[tid] = (i < N) ? g_idata[i] : 0;      // CUDA SDK 
    sdata[tid] = 0; 
    while(i < N){ 
     sdata[tid] += g_idata[i]; 
     i+=gridDim.x * blockDim.x; // to handle array of any size 
    } 

    // --- Before going further, we have to make sure that all the shared memory loads have been completed 
    __syncthreads(); 

    // --- Reduction in shared memory 
    // for (unsigned int s=1; s < blockDim.x; s *= 2)     // CUDA SDK 
    // { 
    //  if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; } 
    //  __syncthreads(); 
    // } 
    unsigned int s = 1; 
    while(s < blockDim.x) 
    { 
     // --- Only the threads with index multiple of 2*s perform additions. Furthermore, modulo arithmetic is slow.  
     if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; } 

     // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed 
     __syncthreads(); 

     s*=2; 
    } 

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of 
    //  individual blocks 
    if (tid == 0) g_odata[blockIdx.x] = sdata[0]; 

} 

/********/ 
/* MAIN */ 
/********/ 
int main() 
{ 
    const int N = 15336; 

    thrust::device_vector<int> d_vec(N,3); 

    int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE; 
    int NumBlocks = (N + NumThreads - 1)/NumThreads; 

    // when there is only one warp per block, we need to allocate two warps 
    // worth of shared memory so that we don't index shared memory out of bounds 
    int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int); 

    // --- Creating events for timing 
    float time; 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

    thrust::device_vector<int> d_vec_block(NumBlocks); 

    /***********/ 
    /* REDUCE0 */ 
    /***********/ 
    cudaEventRecord(start, 0); 
    reduce0<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N); 
    cudaEventRecord(stop, 0); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&time, start, stop); 
    printf("reduce0 - Elapsed time: %3.3f ms \n", time); 

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host 
    thrust::host_vector<int> h_vec_block(d_vec_block); 
    int sum_reduce0 = 0; 
    for (int i=0; i<NumBlocks; i++) sum_reduce0 = sum_reduce0 + h_vec_block[i]; 
    printf("Result for reduce0 = %i\n",sum_reduce0); 

    /**********************************/ 
    /* REDUCE0 KERNEL - STACKOVERFLOW */ 
    /**********************************/ 
    cudaEventRecord(start, 0); 
    reduce0_stackoverflow<<<NumBlocks/2, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N); 
    cudaEventRecord(stop, 0); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&time, start, stop); 
    printf("reduce0 - stackoverflow - Elapsed time: %3.3f ms \n", time); 

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host 
    h_vec_block = d_vec_block; 
    int sum_reduce0_stackoverflow = 0; 
    for (int i=0; i<NumBlocks; i++) sum_reduce0_stackoverflow = sum_reduce0_stackoverflow + h_vec_block[i]; 
    printf("Result for reduce0_stackoverflow = %i\n",sum_reduce0_stackoverflow); 

    getchar(); 
}
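
As an independent cross-check (my addition, not part of the original answer), the whole sum can also be delegated to Thrust's built-in reduction, which runs entirely on the device:

    #include <thrust/reduce.h> 
    #include <thrust/functional.h> 

    // Minimal sketch: reduces d_vec on the device and returns the result to 
    // the host; for the vector above it should print 3 * 15336 = 46008. 
    int sum_thrust = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>()); 
    printf("Result for thrust::reduce = %i\n", sum_thrust);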