我正在学习 CUDA。今天,我尝试运行了《CUDA Application Design and Development》一书中的一些代码,结果让我很惊讶:为什么 CUDA Thrust 的 reduce 如此之慢?下面是代码和输出。
(问题标题:CUDA Thrust reduce 为何如此之慢?)
#include <iostream>
using namespace std;
#include<thrust/reduce.h>
#include<thrust/sequence.h>
#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include <device_launch_parameters.h>
#include "GpuTimer.h"
// Stores each element's own index into the array: a[i] = i for every i in [0, n).
//
// The index is built from blockIdx.x, blockDim.x and threadIdx.x only, so the
// kernel expects a 1D launch. Threads whose index is at or beyond n write nothing.
//
//   a : array of at least n ints, written by the kernel
//   n : number of elements to fill
__global__ void fillKernel(int *a, int n)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads past the end of the array have no element to write.
    if (idx >= n)
        return;

    a[idx] = idx;
}
// Launches fillKernel so that d_a[i] = i for every i in [0, n).
//
//   d_a : pointer handed straight to fillKernel; must refer to at least n ints
//   n   : number of elements to fill; values <= 0 are a no-op
//
// The launch is not followed by a synchronization here. A launch-configuration
// error is reported on stderr because the void return type cannot carry it.
void fill(int *d_a, int n)
{
    // Nothing to fill. Launching with zero blocks is an invalid configuration,
    // so bail out before computing the grid.
    if (n <= 0)
        return;

    const int nThreadsPerBlock = 512;

    // Ceiling division: one extra block only when there is a remainder.
    // (The previous test used n / nThreadsPerBlock, which produced zero blocks
    // for n < 512.) Written as quotient + remainder flag so that it cannot
    // overflow the way (n + nThreadsPerBlock - 1) could for very large n.
    const int nBlock = n / nThreadsPerBlock + ((n % nThreadsPerBlock) ? 1 : 0);

    fillKernel<<<nBlock, nThreadsPerBlock>>>(d_a, n);

    // Kernel launches do not return a status; query it explicitly.
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "fillKernel launch failed: " << cudaGetErrorString(err) << std::endl;
}
// Benchmark driver: fills a device vector with 0..N-1, sums it once with
// thrust::reduce and once with a plain host loop, prints the elapsed time
// reported by GpuTimer for each, and checks that both sums agree.
//
// Both sums are accumulated in long long: the sum of 0..N-1 for N = 500000 is
// 124999750000, which does not fit in a 32-bit int.
int main()
{
    const int N = 500000;
    GpuTimer timer1, timer2;

    thrust::device_vector<int> a(N);
    fill(thrust::raw_pointer_cast(&a[0]), N);

    // The long long init value (0LL) makes thrust::reduce accumulate and return
    // long long instead of int.
    // NOTE(review): this is the first Thrust algorithm call in the program, so
    // the measured time may include one-time setup costs — confirm with a
    // warm-up call before drawing conclusions from the number.
    timer1.Start();
    const long long sumA = thrust::reduce(a.begin(), a.end(), 0LL);
    timer1.Stop();
    cout << "Thrust reduce costs " << timer1.Elapsed() << "ms." << endl;

    // Host reference sum.
    // NOTE(review): GpuTimer is declared in GpuTimer.h, which is not visible
    // here; if it is built on CUDA events it may not be a reliable clock for
    // host-only work — confirm. An optimizing compiler may also reduce this
    // loop to a constant, which would make the timing meaningless.
    long long sumCheck = 0;
    timer2.Start();
    for (int i = 0; i < N; i++)
        sumCheck += i;
    timer2.Stop();
    cout << "Traditional reduce costs " << timer2.Elapsed() << "ms." << endl;

    if (sumA == sumCheck)
        cout << "Correct!" << endl;
    else
        cerr << "Mismatch: thrust::reduce gave " << sumA
             << ", host loop gave " << sumCheck << endl;

    return 0;
}
也许是因为您的输入数据量太小,或者您的 GPU 速度太慢,或者您的主机 CPU 速度很快,又或者您的 CUDA 平台延迟很高?在我们不知道实验是如何进行的情况下,要如何回答"为什么您的这次实验没有达到某个任意设定的预期结果"这样的问题呢? – talonmies 2013-03-07 09:44:30