回答这个问题的唯一方法是进行基准测试。而且即使你这样做,结果也取决于你的硬件,未必会得到相同的答案。当我运行下面这段代码时:
// Number of elements each thread loads. Passed as the kernel's compile-time
// iteration count and used in main() to size the device buffers.
#define NITER (128)
// Plain three-component integer record with default (natural) alignment.
// With 4-byte ints this occupies 12 bytes and carries no padding.
struct data {
    int x, y, z;
};
// Same three integer components as the unaligned variant, but forced to a
// 16-byte alignment. With 4-byte ints the 12 bytes of payload are therefore
// padded out to a 16-byte object.
struct __align__(16) aligned_data {
    int x, y, z;
};
/**
 * Load micro-benchmark kernel.
 *
 * Every thread reads `niter` elements of `in`, starting at its flat global
 * thread index and advancing by the total number of launched threads each
 * iteration, and accumulates x + y + z of every element it reads.
 *
 * Launch layout: only the x dimension of the grid and block is used.
 *
 * Template parameters:
 *   T      element type; must expose integer-addable members x, y and z.
 *   niter  compile-time number of elements read per thread.
 *
 * Parameters:
 *   in       device array holding at least niter * (gridDim.x * blockDim.x)
 *            elements; there is no bounds check.
 *   out      device array holding at least gridDim.x * blockDim.x ints;
 *            only dereferenced when `dowrite` is non-zero.
 *   dowrite  when non-zero, each thread stores its sum to out[<thread index>].
 *            NOTE(review): presumably this conditional store exists so the
 *            compiler cannot discard the loads as dead code - confirm.
 */
template<typename T, int niter>
__global__
void kernel(T *in, int *out, int dowrite=0)
{
    // Flat global thread index and total thread count of the launch.
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const int nthreads = blockDim.x * gridDim.x;

    int oval = 0;

    // Walk the input with a separate running index so that `tid` still names
    // this thread's own slot when the result is written below.
    int idx = tid;
#pragma unroll
    for (int i = 0; i < niter; ++i, idx += nthreads) {
        T val = in[idx];
        oval += val.x + val.y + val.z;
    }

    if (dowrite) {
        // One result per thread, indexed by the thread's global id (not by
        // the post-loop read position, which lies past the data just read).
        out[tid] = oval;
    }
}
// Explicit instantiations of the benchmark kernel, one per struct layout,
// each reading NITER elements per thread.
template __global__ void kernel<data,NITER>(data *, int*, int);
template __global__ void kernel<aligned_data,NITER>(aligned_data *, int*, int);
/**
 * Benchmark driver.
 *
 * Allocates one device buffer per struct layout, then launches both kernel
 * instantiations ten times with identical launch configurations. No timing is
 * done in the program itself. The buffers are never initialised: the kernels
 * are launched with dowrite == 0, so the sums they compute are discarded.
 *
 * Returns 0 on success and 1 if any CUDA runtime call or kernel launch fails.
 */
int main()
{
    const int bs = 512;                  // threads per block
    const int nb = 32;                   // blocks per grid
    const int nvals = bs * nb * NITER;   // elements read per launch
    const int nreps = 10;                // repetitions of each kernel

    int status = 0;
    data *d_ = 0;
    aligned_data *ad_ = 0;

    if (cudaMalloc((void **)&d_, sizeof(data) * size_t(nvals)) != cudaSuccess) {
        status = 1;
    }
    if (status == 0 &&
        cudaMalloc((void **)&ad_, sizeof(aligned_data) * size_t(nvals)) != cudaSuccess) {
        status = 1;
    }

    for (int i = 0; status == 0 && i < nreps; ++i) {
        // A launch reports configuration errors only via cudaGetLastError().
        kernel<data,NITER><<<nb, bs>>>(d_, (int *)0, 0);
        if (cudaGetLastError() != cudaSuccess) {
            status = 1;
            break;
        }

        kernel<aligned_data,NITER><<<nb, bs>>>(ad_, (int *)0, 0);
        if (cudaGetLastError() != cudaSuccess) {
            status = 1;
            break;
        }

        // Faults raised while the kernels execute surface at the sync.
        if (cudaDeviceSynchronize() != cudaSuccess) {
            status = 1;
        }
    }

    // Release the buffers on every path; cudaFree accepts a null pointer.
    cudaFree(d_);
    cudaFree(ad_);

    if (cudaDeviceReset() != cudaSuccess) {
        status = 1;
    }
    return status;
}
我看到在计算能力 5.2 的设备上,使用对齐结构体的版本整体性能更高:
Time(%) Time Calls Avg Min Max Name
52.71% 2.3995ms 10 239.95us 238.10us 241.79us void kernel<data, int=128>(data*, int*, int)
47.29% 2.1529ms 10 215.29us 214.91us 215.51us void kernel<aligned_data, int=128>(aligned_data*, int*, int)
在这种情况下,我推测大约 10% 的性能提升源于发出的加载指令数量更少。在未对齐的情况下,编译器发出三条 32 位加载指令来读取结构体;而在对齐的情况下,编译器只发出一条 128 位加载指令来读取结构体。指令数量的减少似乎抵消了被浪费的 25% 内存带宽。在内存带宽与指令吞吐量之比不同的其他硬件上,结果很可能会有所不同。