我在Windows 7 x64上使用VS2010和CUDA工具包v4.0做我的大学项目。我想实现一个简单的GPU与CPU对比测试,其中大部分已经完成,但我的CUDA测试都没有返回任何结果。我用调试器检查了内存,设备内存中包含了我需要的所有内容,只是内存复制失败了:将数据从设备复制到主机不起作用。
/**
 * Adds two integer vectors element-wise on the GPU and returns the result
 * as a host vector.
 *
 * The kernel is launched as a single block with one thread per element, so
 * the number of elements must not exceed the device's per-block thread limit;
 * a launch that violates it is rejected and reported through
 * cudaGetLastError().
 *
 * @param h_a First input vector (host memory).
 * @param h_b Second input vector (host memory); must be the same length as h_a.
 * @return A host vector of h_a.size() elements. On any reported failure the
 *         vector is returned zero-filled and a message is written to stderr.
 *
 * NOTE(review): addKernel is defined outside this block; it is assumed to
 * write c[i] = a[i] + b[i] for i = threadIdx.x -- confirm against the kernel.
 */
host_vector<int> addWithCuda(host_vector<int> h_a, host_vector<int> h_b)
{
    const int size = static_cast<int>(h_a.size());
    host_vector<int> h_c(size);

    // Nothing to compute: avoids taking &v[0] of an empty vector and a
    // zero-thread kernel launch (which is an invalid configuration).
    if (size == 0) {
        return h_c;
    }

    // The kernel indexes all three buffers with the same thread index, so a
    // shorter h_b would be read out of bounds on the device.
    if (h_b.size() != h_a.size()) {
        fprintf(stderr, "addWithCuda: input sizes differ (%lu vs %lu)\n",
                static_cast<unsigned long>(h_a.size()),
                static_cast<unsigned long>(h_b.size()));
        return h_c;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return h_c;
    }

    // The device vectors live in their own scope so that their storage is
    // released before cudaDeviceReset() is called below.
    {
        // Allocate GPU buffers for three vectors (two input, one output) and
        // copy the inputs from host memory to the GPU.
        device_vector<int> d_a = h_a;
        device_vector<int> d_b = h_b;
        device_vector<int> d_c(size);

        int* d_a_ptr = raw_pointer_cast(&d_a[0]);
        int* d_b_ptr = raw_pointer_cast(&d_b[0]);
        int* d_c_ptr = raw_pointer_cast(&d_c[0]);

        // Launch a kernel on the GPU with one thread for each element.
        addKernel<<<1, size>>>(d_c_ptr, d_a_ptr, d_b_ptr);

        // Launch-time failures (e.g. too many threads per block) are only
        // visible through cudaGetLastError(); without this check a kernel
        // that never ran would leave the output silently zero-filled.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed with error code %d (size=%d)!\n", cudaStatus, size);
            return h_c;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during execution.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            return h_c;
        }

        // Copy output vector from GPU buffer to host memory.
        h_c = d_c;

        // d_c[0] yields a device_reference proxy, not an int; it must be
        // converted explicitly before being passed through printf's varargs.
        printf("||Debug h_c[0]=%d\td_c[0]=%d\n",
               static_cast<int>(h_c[0]), static_cast<int>(d_c[0]));
    }

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
    }
    return h_c;
}
注意代码行“h_c = d_c;”。在Thrust中,这应该会把数据从d_c(设备向量)复制到h_c(主机向量)。这一行不会报错,但也没有正确执行:h_c始终保持全零。我还试过其他几种方法,比如“thrust::copy(d_c.begin(), d_c.end(), h_c.begin());”、“cudaMemcpy(h_c_ptr, d_c_ptr, size * sizeof(int), cudaMemcpyDeviceToHost);”,甚至“for (int i = 0; i < size; ++i) h_c[i] = d_c[i];”,都不起作用。我已经无计可施了 :(
有人遇到过类似的问题吗?任何帮助我都非常感激。
你确定`addKernel`能正常工作吗? –
是的,就像我说的,我检查了我的d_c向量的内容,它有正确的值。问题是d_c的内容拷贝到h_c中 –
如果省略`cudaDeviceSynchronize`会发生什么?在这个例子中它其实并不是必需的。另外,`size`的值是多少? –