我新的CUDA C.我写一个简单的数组添加和减少,当它运行从设备复制错误校验回主机我得到一个“未知错误”。我不知道如果错误检查器出现故障,不能返回正确的cudaError,但我不能工作了什么是错的.......cudaGetLastError返回“未知错误”
using namespace std;
#include <iostream>
void CudaAddReduce(int *input, int *output, size_t size);
__global__ void Fill(int *fillItem);
__global__ void Add(int *input1, int *result);
__global__ void Reduce(int *intputArray, int *outputArray);
main(int argc, char *argv[])
{
const int N = 100;
int inp[N];
int outp[N];
size_t size = (N * sizeof(int));
CudaAddReduce(inp,outp,size);
cout << outp[N] << endl;
}
void CudaAddReduce(int *input, int *output, size_t size)
{
// allocate buffers to device
//input
int *d_input;
if (cudaMalloc(&d_input,size) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
cout << "Input allocation to device" << endl;
exit(1);
}
////////////////////////////
//output
int *d_output;
if (cudaMalloc(&d_output,size) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) <<endl;
cout << "output allocation to device" << endl;
exit(1);
}
//////////////////////////////////
//copy buffers to device from host
//////////////////////////////////
//input
if (cudaMemcpy(d_input, input, size, cudaMemcpyHostToDevice) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
cout << "Input Copy from host to device" << endl;
exit(1);
}
/////////////////////////////////
//execute device kernals
/////////////////////////////////
int numThreads = 256;
int numBlocks = 1;
//Fill Kernal
Fill<<<numBlocks,numThreads>>>(d_input);
// Add Kernal
Add<<<numBlocks,numThreads>>>(d_input,d_output);
//execute Reduce Kernal
Reduce<<<numBlocks,numThreads>>>(d_output,d_input);
cudaThreadSynchronize();
/////////////////////////////////
//copy result from device to host
/////////////////////////////////
//output
if (cudaMemcpy(output,d_output,size,cudaMemcpyDeviceToHost)!= cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
cout << "Output Copy from device to host" << endl;
exit(1);
}
//clear device buffers
cudaFree(d_input);
cudaFree(d_output);
}
__global__ void Fill(int *fillItem)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
fillItem[id] = 1;
}
__global__ void Add (int *input1, int* result)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
result[id] = input1[id] + input1[id];
}
__global__ void Reduce(int *inputArray, int *outputArray)
{
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = inputArray[i];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
if(tid % (2*s) == 0)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if(tid == 0) outputArray[blockIdx.x] = sdata[0];
}
感谢
从哪里开始...每个内核包含越界内存操作,减少了内核缺乏一个共享内存大小的内核参数,而'cout'在'main'包含越界内存访问。 – talonmies 2012-04-22 20:40:07
我认为一个问题不应该因为提问者编写错误代码而被低估。这个问题是合法的(它甚至吸引了一个合理的答案,它可能不值得赞成票,但是不会投票赞成 – harrism 2012-04-22 23:04:12
我同意哈里斯的观点,虽然生活可能因错误重新命名为YouveDoneSomethingStupidWithMemoryError而变得更容易,因为这是我唯一一次看到它 – 3Pi 2012-04-23 00:30:08