0
在这个简短的例子中,我试图通过一个带有指针的struct
init在cuda设备内存中传递一个表。复制到主机 - >设备,设备 - >主机似乎可行,但在`_ global _ function nothing works. Values for
dA`为空,我无法更改它们。传递一个指向CUDA设备内存的指针init init
我不知道如何从价值A
复制到dA
。如果我使用这样的基本表格3210它可以工作,但这里不是我想要做的。这是代码:
#include<assert.h>
#include <cuda.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#define N 5// side of matrix containing data
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
typedef struct {float re,im;} fcomplex;
__global__ void kernel(fcomplex * da)
{
int x = threadIdx.x;
int y = threadIdx.y;
int i = (N*y) + x;
//da[i].re += 2;
printf("%f \n",da[i].re);
}
int main(int argc, char * argv[])
{
fcomplex *dA,**A,**B;
A= (fcomplex **)malloc(N * sizeof(fcomplex*));
B=(fcomplex **)malloc(N * sizeof(fcomplex* ));
for (int i = 0; i < N; i++){
A[i] = (fcomplex *)malloc(N * sizeof(fcomplex));
B[i] = (fcomplex *)malloc(N * sizeof(fcomplex));
}
for (int i = 0; i < N; i++)
{ for (int d= 0; d < N; d++)
{
A[i][d].re = i*d;
A[i][d].im = i*d;
}
}
checkCudaErrors(cudaMalloc((void **)&dA, (size_t)(sizeof(fcomplex)*N*N)));
checkCudaErrors(cudaMemcpy(dA,A,N*N*sizeof(fcomplex),cudaMemcpyHostToDevice));
const dim3 blockSize(N,N);
const dim3 gridSize(1,1);
kernel<<<gridSize,blockSize>>>(dA);
checkCudaErrors(cudaThreadSynchronize());
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(B, dA, sizeof(fcomplex)*N*N, cudaMemcpyDeviceToHost));
for (int i = 0; i < N; i++)
{ for (int d= 0; d < N; d++)
{
printf("%f-%f\n",A[i][d].re,B[i][d].re);
printf("%f-%f\n",A[i][d].im,B[i][d].im);
}
}
//verify(A,B,N);
free(A);
free(B);
cudaFree(dA);
//cudaFree(dB);
}
void verify(fcomplex ** A, fcomplex ** B, int size)
{
for (int i = 0; i < size; i++)
{ for (int d= 0; d < size; d++)
{
assert(A[i][d].re==B[i][d].re);
}
}
printf("Correct!");
}
完美,非常感谢。 :) – volty41 2013-04-30 11:18:27