与CUDA共享内存互斥 - 增加项目

我的问题如下：我有一幅图像，需要使用GPU在其中检测一些兴趣点。检测本身是一个计算量很大的测试，但平均每25个点中只有大约1个能通过测试。该算法的最后阶段是建立一个点的列表。在CPU上，这将实现为：

forall pixels x,y 
{ 
    if(test_this_pixel(x,y)) 
     vector_of_coordinates.push_back(Vec2(x,y)); 
}

在GPU上，我有每个CUDA块处理16x16像素。问题是我需要做一些特殊的事情，最终在全局内存中有一个统一的点列表。目前我正在尝试在每个块的共享内存中生成一个本地列表，这些列表最终将被写入全局内存。我试图避免发送任何回到CPU，因为在此之后有更多的CUDA阶段。

我原本期望可以用原子操作在共享内存上实现push_back功能，但是我无法让它正常工作。有两个问题。第一个烦人的问题是，在使用原子操作时，我经常遇到以下编译器崩溃：“nvcc错误：'ptxas'死于状态0xC0000005（ACCESS_VIOLATION）”。代码能否编译通过似乎全凭运气。有谁知道这是什么原因造成的？

以下内核将重现错误：

// Minimal reproduction kernel: the question reports that compiling this
// kernel triggers the ptxas ACCESS_VIOLATION crash quoted in the text above.
// None of the parameters (w, h, pPoints, pCounts) are used by the body.
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pCounts) 
{ 
    // Block-shared counter; note it is never initialized before the atomic below.
    __shared__ unsigned int test; 
    // Atomic increment of the shared counter with 1000 as the wrap-around limit.
    atomicInc(&test, 1000); 
}

其次，我下面这段使用了共享内存互斥锁的代码会在GPU上挂起，而我不明白为什么：

// Spin lock acquire: busy-waits until atomicCAS finds 0 in *pmutex and
// replaces it with 1 (0 = unlocked, 1 = locked).
//
// NOTE(review): the surrounding question reports that the kernel using this
// lock hangs. A likely cause (to be confirmed on the target GPU) is that
// several threads of the same warp contend for the lock: the threads still
// spinning in this loop can keep the thread that acquired the lock from
// ever reaching unlock().
__device__ void lock(unsigned int *pmutex) 
{ 
    while(atomicCAS(pmutex, 0, 1) != 0); 
} 

// Spin lock release: atomically stores 0 into *pmutex, the value lock()
// waits for before it can acquire the mutex.
__device__ void unlock(unsigned int *pmutex) 
{ 
    atomicExch(pmutex, 0); 
} 

// Collects, per block, the pixels that pass some_test_on_pixel() into a
// block-shared list and then copies that list to global memory.
//
//   w, h     - bounds used to reject threads whose (x, y) lies outside the image.
//   pPoints  - output points; block b writes its entries starting at pPoints[b * 64].
//   pCounts  - output; pCounts[b] receives the number of points stored by block b.
//
// Each block keeps at most 64 points (size of localPoints).
//
// NOTE(review): this is the kernel the question reports as hanging. The
// lock()/unlock() pair below is taken inside a data-dependent branch, so
// threads of the same warp may contend for the mutex; that is the probable
// cause of the hang (to be confirmed).
__global__ void gpu_kernel_non_max_suppress(int w, int h, RtmPoint *pPoints, int *pCounts) 
{ 
    // Block-shared list, its fill count, and the mutex that guards both.
    __shared__ RtmPoint localPoints[64]; 
    __shared__ int localCount; 
    __shared__ unsigned int mutex; 

    // Pixel coordinates handled by this thread.
    int x = blockIdx.x * blockDim.x + threadIdx.x; 
    int y = blockIdx.y * blockDim.y + threadIdx.y; 

    // Flattened thread index within the block and block index within the grid.
    int threadid = threadIdx.y * blockDim.x + threadIdx.x; 
    int blockid = blockIdx.y * gridDim.x + blockIdx.x; 

    // One thread resets the shared state for the whole block.
    if(threadid==0) 
    { 
     localCount = 0; 
     mutex = 0; 
    } 

    // Make the initialized shared state visible to every thread before use.
    __syncthreads(); 

    if(x<w && y<h) 
    { 
     if(some_test_on_pixel(x,y)) 
     { 
      RtmPoint point; 
      point.x = x; 
      point.y = y; 

      // this is a local push_back operation 
      lock(&mutex); 
      if(localCount<64) // we should never get >64 points per block 
       localPoints[localCount++] = point; 
      unlock(&mutex); 
     } 
    } 

    // Wait until every thread has finished appending to the shared list.
    __syncthreads(); 

    // Thread 0 publishes the block's count; the first localCount threads
    // each copy one point into this block's 64-entry slice of pPoints.
    if(threadid==0) 
     pCounts[blockid] = localCount; 
    if(threadid<localCount) 
     pPoints[blockid * 64 + threadid] = localPoints[threadid]; 
}

在this site上的示例代码中，作者成功地在共享内存上使用了原子操作，所以我很困惑为什么我的代码不起作用。如果我注释掉lock和unlock这两行，代码可以正常运行，但显然无法正确地向列表中添加元素。

如果有人能解释这个问题为什么会发生，我将不胜感激。如果有更好的方案来实现这一目标，也欢迎提出建议，因为无论如何我都担心使用原子操作或互斥锁带来的性能问题。

来源

2012-02-28 Robotbugs

我建议使用前缀和（prefix sum）来实现这一部分，以提高并行度。要做到这一点，你需要使用一个共享数组。基本上，前缀和会将数组（1,1,0,1）转换为（0,1,2,2,3），也就是说，就地计算出一个独占式（exclusive）的累加和，从而得到每个线程的写入索引。

// Usage sketch from the answer: each thread flags whether it emits a point,
// a block-wide scan turns the flags into per-thread write positions, and
// every emitting thread writes its point at its own position.
// NOTE(review): uint8_t can only count up to 255; if NUMTHREADS is 256
// (e.g. a 16x16 block) the running sum could overflow - confirm the element type.
__shared__ uint8_t vector[NUMTHREADS]; 

.... 

// True only for threads inside the image whose pixel passes the test.
bool emit = (x<w && y<h); 
    emit = emit && some_test_on_pixel(x,y); 
__syncthreads(); 
// NOTE(review): scan() takes (T mark, T *output); passing a bool together
// with a uint8_t* gives two different candidates for T - confirm this compiles.
scan(emit, vector); 
if (emit) { 
    // NOTE(review): the scan() shown further down this page seeds values[tid]
    // with the thread's own mark, i.e. it is an inclusive sum, so for an
    // emitting thread vector[TID] starts at 1 rather than 0 - verify whether
    // "vector[TID] - 1" is the intended index.
    pPoints[blockid * 64 + vector[TID]] = point; 
}

前缀总和例如：基于这里的建议

// Block-wide inclusive prefix sum: one input value per thread, computed by
// ping-ponging between the caller's `output` buffer and an internal shared
// buffer.
//
//   mark    - this thread's input value.
//   output  - buffer with at least numWorkers elements that every thread of
//             the block can read and write (e.g. a __shared__ array). On
//             return, output[i] holds the sum of the marks of threads 0..i.
//   returns - output[numWorkers-1], i.e. the sum over the whole block.
//
// Preconditions: must be called by every thread of the block (it contains
// block barriers), and numWorkers (defined outside this function) is assumed
// to equal the number of threads in the block.
template <typename T> 
__device__ uint32 scan(T mark, T *output) { 
#define GET_OUT (pout?output:values) 
#define GET_INP (pin?output:values) 
    __shared__ T values[numWorkers]; 
    int pout=0, pin=1; 
    // Flattened thread index within the block. For a 1D block this equals
    // threadIdx.x (the previous behaviour); for 2D/3D blocks it gives every
    // thread its own slot instead of letting rows collide on threadIdx.x.
    int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; 

    values[tid] = mark; 

    // Publish the initial values before any thread reads a neighbour's slot.
    __syncthreads(); 

    for(int offset=1; offset < numWorkers; offset *= 2) { 
        // Swap roles: the buffer written in the previous step becomes the input.
        pout = 1 - pout; pin = 1 - pout; 

        if (tid >= offset) { 
            GET_OUT[tid] = (GET_INP[tid-offset]) + (GET_INP[tid]); 
        } 
        else { 
            GET_OUT[tid] = GET_INP[tid]; 
        } 

        // A single barrier per step is sufficient: it makes this step's writes
        // visible and guarantees all reads of the input buffer are finished
        // before the next step overwrites it.
        __syncthreads(); 
    } 

    // If the last step wrote into the internal buffer, copy the result out.
    if(!pout) 
        output[tid] = values[tid]; 

    __syncthreads(); 

    return output[numWorkers-1]; 

#undef GET_OUT 
#undef GET_INP 
}

来源

2012-02-28 19:53:52 perreal

这是相当有趣的。谢谢。 – Robotbugs 2012-02-28 20:03:03

我刚尝试实现了这个方法。有一点需要说明：我发现扫描函数中有一行不正确：“temp [pout * n + thid] += temp [pin * n + thid - offset];”。这实际上应该是“temp [pout * n + thid] = temp [pin * n + thid] + temp [pin * n + thid - offset];” – Robotbugs 2012-02-28 23:09:36

好的，我基本上实现了你所拥有的，我将在稍后发布最终代码。非常感谢。 – Robotbugs 2012-02-28 23:41:50

根据这里的建议，我附上我最终使用的代码。它使用16x16像素的块。请注意，我现在把数据直接写入一个全局数组，而不再按块分开存放。我使用全局的atomicAdd函数来计算每组结果的基地址。由于每个块只调用一次，所以我没有发现明显的减速，而这样做也方便得多。我还避免了为prefix_sum的输入和输出使用单独的共享缓冲区。在内核调用之前，GlobalCount被设置为零。

#define BLOCK_THREADS 256 

// Block-wide exclusive prefix sum over one int per thread.
//
//   threadid - index of the calling thread, expected in [0, BLOCK_THREADS).
//   data     - this thread's input value.
//   returns  - the sum of `data` over all threads with a smaller threadid
//              (0 for threadid 0).
//
// Must be called by every thread of the block because it contains block
// barriers. The shared array is split into two BLOCK_THREADS-sized halves
// that alternate as source and destination of each step.
__device__ int prefixsum(int threadid, int data)
{
    __shared__ int temp[BLOCK_THREADS*2];

    int srcHalf = 0; // half that currently holds the latest partial sums
    int dstHalf = 1; // half that the next step writes into

    // Seed half 0 with the inputs shifted right by one slot: slot 0 gets 0
    // and the last thread's value is dropped, which makes the scan exclusive.
    if(threadid != BLOCK_THREADS-1)
        temp[threadid+1] = data;
    else
        temp[0] = 0;

    __syncthreads();

    int stride = 1;
    while(stride < BLOCK_THREADS)
    {
        const int srcBase = srcHalf * BLOCK_THREADS;
        const int dstBase = dstHalf * BLOCK_THREADS;

        // Take this slot's partial sum and, where one exists, add the
        // partial sum located `stride` slots to the left.
        int partial = temp[srcBase + threadid];
        if(threadid >= stride)
            partial += temp[srcBase + threadid - stride];
        temp[dstBase + threadid] = partial;

        __syncthreads();

        // The half just written becomes the source of the next step.
        srcHalf = dstHalf;
        dstHalf = 1 - srcHalf;
        stride <<= 1;
    }

    return temp[srcHalf * BLOCK_THREADS + threadid];
}

// Appends the coordinates of every pixel that passes test_pixel() to one
// global list, reserving space for each block with a single atomicAdd.
//
//   w, h         - bounds used to reject threads whose (x, y) lies outside the image.
//   pPoints      - global output list of points.
//   pGlobalCount - global running count; each block adds its (capped) number
//                  of points and uses the previous value as its write offset.
//
// The block must contain exactly BLOCK_THREADS threads, since prefixsum()
// and the "last thread" test below both rely on that constant. At most 64
// points are written per block.
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pGlobalCount)
{
    __shared__ int write_base;

    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    // 1 when this thread's pixel is inside the image and passes the test.
    const int hit = (px < w && py < h && test_pixel(px, py)) ? 1 : 0;

    // Position of this thread's point within the block's output run.
    const int slot = prefixsum(tid, hit);

    // The last thread knows the block total (its exclusive sum plus its own
    // flag) and reserves that many entries, capped at 64, in the global list.
    if(tid == BLOCK_THREADS-1)
    {
        const int blockTotal = slot + hit;
        const int reserved = (blockTotal > 64) ? 64 : blockTotal; // global output buffer is limited to 64 points per block
        write_base = atomicAdd(pGlobalCount, reserved); // get a location to write them out
    }

    __syncthreads(); // ensure write_base is valid for all threads

    // Threads without a point are finished; no barriers follow.
    if(!hit)
        return;

    RtmPoint point;
    point.x = px;
    point.y = py;
    if(slot < 64)
        pPoints[write_base + slot] = point;
}

来源

2012-02-29 23:29:54 Robotbugs

使用atomicAdd来协调结果写入的唯一问题是，它们以随机顺序结束，该顺序随着运行而变化。然而，这并不重要，加上它很容易对输出向量进行排序。 – Robotbugs 2012-02-29 23:34:09

与CUDA共享内存互斥 - 增加项目

回答

相关问题