我想修改CUDA SDK中的imageDenosing类,我需要多次重复过滤以捕获时间。但我的代码无法正常工作。CudaMalloc如何工作?
//开始
__global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
if(iy != 0 && iy < imageH-1 && ix < imageW)
{
float4 fresult = get_color(image[imageW * iy + ix]);
float4 fresult4 = get_color(image[imageW * (iy+1) + ix]);
float4 fresult5 = get_color(image[imageW * (iy-1) + ix]);
float4 fresult7;
fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25;
fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25;
fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25;
buffer[imageW * iy + ix] =
make_color(fresult7.x,fresult7.y,fresult7.z,0);
}
image[imageW * iy + ix] = buffer[imageW * iy + ix];
//should be use cudaMemcpy, But it fails
}
//的extern
extern "C" void
cuda_F1D(TColor *dst, int imageW, int imageH)
{
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y));
Copy<<<grid, threads>>>(dst, imageW, imageH);
size_t size = imageW*imageH*sizeof(TColor);
TColor *host =(TColor*) malloc(size);
TColor *dst2;
//TColor *dst3;
//TColor *d = new TColor(imageW*imageH*sizeof(TColor));
dim3 threads2(imageW,1);
dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1));
*for(int i = 0;i<1;i++)
{
cudaMalloc((void **)&dst2, size);
cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
//cudaMalloc((void **)&dst3, imageW*imageH*sizeof(TColor));
//cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice);
F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2);
//cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost);
cudaFree(dst2);
}*
}
此代码的工作,但不能像数组同步。并导致许多同步问题
推测dst也是cudaMalloc'd在别的地方?一个想到未来的参考,也许你打算让“我”循环执行更多的迭代 - 你应该避免在该循环内的cudaMalloc和cudaFree,并只做一次。让他们走上性能的道路并不是一个好主意,他们不是超快的。如果你的算法允许,你也可以考虑异步memcpys。 – Tom 2010-05-31 08:41:51
ITYM“去噪”? – 2010-06-04 14:45:15
YEs ... mistyping – kitw 2010-06-04 14:47:06