2017-02-10 66 views
1

我想在 CUDA 中计算数组所有元素的总和,于是写了下面这段代码。它编译时没有任何错误,但结果总是零,并且 cudaMemcpyFromSymbol 返回了“无效的设备符号”(invalid device symbol)错误。我不能使用 Thrust 或 cuBLAS 之类的库。(标题:CUDA 中 cudaMemcpyFromSymbol 报“无效的设备符号”)

// Not referenced anywhere in the code shown here.
#define TRIALS_PER_THREAD 4096 
// Launch configuration: main launches ArraySum with NUM_BLOCKS blocks of
// NUM_THREADS threads and allocates NUM_BLOCKS * NUM_THREADS doubles.
#define NUM_BLOCKS 256 
#define NUM_THREADS 256 
// Pointer to the buffer obtained from cudaMalloc in main and passed to ArraySum.
double *dev; 
// Device-side accumulator: ArraySum adds into it and main reads it back with
// cudaMemcpyFromSymbol.
__device__ volatile double pi_gpu = 0; 

// Kernel: each thread adds one element of `array` into the device-global
// accumulator pi_gpu.
// NOTE(review): the update of pi_gpu below is a plain read-modify-write, not an
// atomic operation, so threads running concurrently can overwrite each other's
// contributions. `volatile` does not make the update atomic.
__global__ void ArraySum(double *array) 

{ 
// Flat global thread index; there is no bounds check, so the launch must not
// create more threads than `array` has elements.
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x; 
pi_gpu = pi_gpu + array[tid]; 
// Barrier for the threads of this block only; it does not synchronize across
// blocks and does not serialize the update above.
__syncthreads(); 
} 

// Allocates a device buffer, launches ArraySum over it, and copies the
// accumulated pi_gpu value back to the host.
int main (int argc, char *argv[]) { 
// NOTE(review): the cudaMalloc result is not checked, and nothing visible here
// writes to dev before the kernel reads it, so the summed values are
// uninitialized device memory.
cudaMalloc((void **) &dev, NUM_BLOCKS * NUM_THREADS * sizeof(double)); 
    double pi_gpu_h; 

// One thread per allocated element: NUM_BLOCKS * NUM_THREADS threads in total.
ArraySum<<<NUM_BLOCKS, NUM_THREADS>>>(dev); 
// Wait for the kernel to finish before reading the result back.
cudaDeviceSynchronize(); 
// NOTE(review): two problems in this call, per the CUDA Runtime API signature
// cudaMemcpyFromSymbol(dst, symbol, count, offset = 0, kind = cudaMemcpyDeviceToHost):
//  - the second argument must be the symbol itself (pi_gpu); passing &pi_gpu
//    is what produces the "invalid device symbol" error;
//  - the fourth parameter is the byte offset and the fifth is the copy kind,
//    so cudaMemcpyDeviceToHost is being passed as the offset here.
cudaError err = cudaMemcpyFromSymbol(&pi_gpu_h, &pi_gpu, sizeof(double), cudaMemcpyDeviceToHost); 
if(cudaSuccess != err) 
{ 
    fprintf(stderr, "cudaMemcpyFromSymbolfailed : %s\n", cudaGetErrorString(err)); 
    exit(-1); 
} 
// NOTE(review): main returns int, so the double is converted to int and used as
// the process exit status; dev is never released with cudaFree.
return pi_gpu_h; // this is always zero!!! 
} 
+0

如果你肯花 30 秒把代码正确地格式化就好了,现在这篇帖子读起来非常困难。 – talonmies

回答

-2

您的代码不是线程安全的:多个线程同时写入同一个全局变量是不安全的。下面是一个归约(reduction)内核的示例:

//Untested code 
// Default size of the shared-memory scratch array, used only when the
// surrounding code has not already defined `blocksize` as a macro.
#ifndef blocksize
#define blocksize 256
#endif

// Sums the first N elements of `input` and adds the result to `*total`.
//
// Launch layout: 1D grid of 1D blocks with at least N threads in total, e.g.
//   plus_reduce<<<(N + blocksize - 1) / blocksize, blocksize>>>(input, N, total);
// Preconditions:
//   - blockDim.x is a power of two and does not exceed `blocksize`
//     (the halving loop below drops elements for other block sizes);
//   - *total holds its starting value (typically 0) before the launch,
//     because every block accumulates into it with atomicAdd.
// Shared memory: blocksize * sizeof(int) bytes, statically allocated.
__global__ void plus_reduce(int *input, int N, int *total) {
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Each block loads its elements into shared memory.
    __shared__ int x[blocksize];
    x[tid] = (i < N) ? input[i] : 0;  // last block may pad with 0's
    __syncthreads();

    // Build summation tree over elements.
    for (int s = blockDim.x / 2; s > 0; s = s / 2) {
        if (tid < s) x[tid] += x[tid + s];
        // Outside the `if`, so every thread of the block reaches the barrier.
        __syncthreads();
    }

    // Thread 0 adds the block's partial sum to the total sum.
    if (tid == 0)
        atomicAdd(total, x[tid]);
}

Source

+0

这是如何回答这个问题的? – talonmies

3

cudaMemcpyFromSymbol 调用中的符号参数写得不正确。它应该像这样:

// Pass the __device__ variable itself as the symbol (not its address), and give
// the byte offset (0) explicitly: the fourth parameter of cudaMemcpyFromSymbol
// is the offset into the symbol, and the fifth is the copy direction.
cudaMemcpyFromSymbol(&pi_gpu_h, pi_gpu, sizeof(double), 0, cudaMemcpyDeviceToHost);