I created a real-time ray tracer in OpenCL. It was developed on a GTX 580. I stopped working on it for a few years and recently revived it. I expected it to run faster on newer, "better" Nvidia GPUs. However, it still runs fastest on the GTX 580; the kernel runs slower on newer, "better" Nvidia GPUs.
Here is a table of the kernel times for the benchmark scene I use:

GPU         Kernel time   CPU                       OS           System Mem
GTX 580     11 ms         E5-1670                   Windows 7    32 GB
GTX Titan   15 ms         W5580 (two processors)    Windows 7    48 GB
GTX 980M    15 ms         i7-4710HQ (laptop)        Windows 10   16 GB
The same Nvidia driver, 361.43, was installed on all three computers and graphics cards on January 10, 2016, and the host code was compiled with Visual Studio 2013 in 64-bit Release mode.
I also observe faster frame rates on the GTX 580. To get the kernel time I use:

time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();

I do not use the double-precision floating point extension (the line //#pragma OPENCL EXTENSION cl_khr_fp64 : enable stays commented out).
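For reference, here is a minimal, self-contained sketch of this event-profiling pattern with the OpenCL C++ bindings (the dummy kernel and sizes are illustrative, not taken from the tracer). Two details matter: the queue must be created with CL_QUEUE_PROFILING_ENABLE, and the timestamps are reported in nanoseconds, so 11 ms corresponds to 11e6 ns:

// Minimal sketch: time one kernel with OpenCL event profiling (illustrative).
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const char* src =
        "__kernel void dummy(__global float* a) {"
        "    int i = get_global_id(0);"
        "    a[i] *= 2.0f;"
        "}";
    cl::Context context(CL_DEVICE_TYPE_GPU);
    std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    // Profiling must be requested when the queue is created.
    cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE);
    cl::Program program(context, cl::Program::Sources(1, std::make_pair(src, strlen(src))));
    program.build(devices);
    cl::Kernel kernel(program, "dummy");
    const int n = 1 << 20;
    cl::Buffer buf(context, CL_MEM_READ_WRITE, sizeof(float) * n);
    kernel.setArg(0, buf);
    cl::Event ev;
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(n), cl::NullRange, NULL, &ev);
    ev.wait();
    // Timestamps are cl_ulong nanoseconds on the device clock.
    cl_ulong start = ev.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong end = ev.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    printf("kernel time: %.3f ms\n", (end - start) * 1e-6);
    return 0;
}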
The kernel code is broken up into several kernel files, which I combine into a single file; it amounts to several thousand lines of code.
Why is my kernel slower on newer, "better" hardware?
Here is my code for creating the context. It won't all make sense on its own, but it is probably better than nothing:
void Contexts::init(string sourceCode) {
    run_time = -1;
    context = createCLContext(type, vendor);
    cl_uint uiNumSupportedFormats = 0;
    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    int err = 0;
    try {
        // Create a profiling-enabled, out-of-order command queue on the selected device.
        //queues.push_back(cl::CommandQueue(context, devices[i], 0, &err));
        //queue = cl::CommandQueue(context, devices[device], CL_QUEUE_PROFILING_ENABLE, &err);
        queue = cl::CommandQueue(context, devices[device], CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
        //printf("\t\tDevice: %s\n", devices[device].getInfo<CL_DEVICE_NAME>().c_str());
    }
    catch (cl::Error er) {
        printf("ERROR: %s(%d)\n", er.what(), er.err());
    }
    //ndevices = devices.size();
    //if(ndevices>max_devices) ndevices = max_devices;
    program = buildProgramFromSource(context, sourceCode);
    try {
        // Look up the three kernels that run each frame.
        kernel1 = cl::Kernel(program, "trace", &err);
        kernel2 = cl::Kernel(program, "transform_primitives", &err);
        kernel_postprocess = cl::Kernel(program, "post_process", &err);
    }
    catch (cl::Error er) {
        printf("ERROR: %s(%d)\n", er.what(), er.err());
    }
}
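buildProgramFromSource is a helper that is not shown here; a typical implementation with the C++ bindings looks roughly like the following sketch (illustrative, not the author's actual code), printing the build log on failure:

// Hypothetical sketch of a buildProgramFromSource-style helper; assumes the
// cl.hpp C++ bindings with __CL_ENABLE_EXCEPTIONS, so failures throw cl::Error.
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <cstdio>
#include <string>
#include <vector>

cl::Program buildProgramFromSource(cl::Context& context, const std::string& sourceCode) {
    cl::Program::Sources sources(1, std::make_pair(sourceCode.c_str(), sourceCode.size()));
    cl::Program program(context, sources);
    std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    try {
        program.build(devices);
    }
    catch (cl::Error& er) {
        // Dump the compiler log for the first device before rethrowing.
        std::string log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
        printf("build failed:\n%s\n", log.c_str());
        throw;
    }
    return program;
}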
cl::Buffer Contexts::copy_buffer(int size, const void* ptr, int flags = CL_MEM_READ_ONLY) {
    cl::Buffer out;
    if (size > 0) {
        out = cl::Buffer(context, flags | CL_MEM_COPY_HOST_PTR, size, (void*)ptr);
    }
    else {
        // NULL pointers passed to kernels do not seem to work on Intel, so bind a 1-byte dummy buffer instead.
        out = cl::Buffer(context, flags, 1, NULL);
    }
    return out;
}
void Contexts::copy_buffers() {
    //int cubemap_size = para->cubemap->sizeX * para->cubemap->sizeY * 6 * para->cubemap->ncubemap;
    //if(para->cubemap->sizeX== -1) cubemap_size = 0;
    int nobj = para->kernel1_parameters.nobj;
    int nprim = para->kernel1_parameters.nprim;
    int nmat = para->kernel1_parameters.nmat;
    int nlight = para->kernel1_parameters.nlight;
    int nnode = para->kernel1_parameters.nnode;
    int nmap = para->nmaps;
    int err = 0;
    int npixels = para->kernel1_parameters.height*para->kernel1_parameters.width;
    int exposure_samples = para->kernel1_parameters.exposure_samples;
    int mask_size = para->kernel1_parameters.mask_size;
    int nmask = (2*mask_size+1)*(2*mask_size+1);
    // Upload the scene data (objects, nodes, primitives, lights, materials, environment maps) to the device.
    cl_objects_mem = copy_buffer(sizeof(CSG_object)*nobj, para->objects);
    cl_node_mem = copy_buffer(sizeof(Node)*nnode, para->nodes);
    cl_prim_mem = copy_buffer(sizeof(Primitive)*nprim, para->prims, CL_MEM_READ_WRITE);
    cl_light_mem = copy_buffer(sizeof(Light)*nlight, para->lights);
    cl_mat_mem = copy_buffer(sizeof(Material)*nmat, para->mats);
    cubemap_info = copy_buffer(sizeof(Cubemap_info)*nmap, para->maps);
    cubemap_images = copy_buffer(sizeof(cl_uchar4)*para->envmap_npixels, para->envmap_images);
    cl_mask_mem = copy_buffer(sizeof(cl_float)*nmask, para->mask);
    // Output buffers: the final image, intermediate per-pixel results, and luminance samples.
    cl_image_mem = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar4)*npixels, NULL, &err);
    cl_results_mem = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*npixels, NULL, &err);
    cl_luminance = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)*exposure_samples, NULL, &err);
    if(para->surfacecpy_sw) {
        // Pinned host buffer, kept mapped so read-backs go through page-locked memory.
        cmPinnedBufOut1 = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uchar4)*npixels, NULL, NULL);
        image = (int*)queue.enqueueMapBuffer(cmPinnedBufOut1, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uchar4)*npixels, 0, NULL, NULL);
        //queue.enqueueUnmapMemObject(cmPinnedBufOut1, image);
        //int pageSize = 4096;
        //image = (int*) _aligned_malloc(sizeof(cl_uchar4)*npixels, pageSize);
        //CL_MEM_USE_PERSISTENT_MEM_AMD
    }
    cmPinnedBufOut2 = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*exposure_samples, NULL, NULL);
    luminance = (float*)queue.enqueueMapBuffer(cmPinnedBufOut2, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_float)*exposure_samples, 0, NULL, NULL);
    queue.finish();
    //int kindex = 0;
    // Bind the kernel arguments that stay fixed across frames.
    kernel1.setArg(0, cl_objects_mem);
    kernel1.setArg(1, cl_node_mem);
    kernel1.setArg(2, cl_prim_mem);
    kernel1.setArg(3, cl_mat_mem);
    kernel1.setArg(4, cl_light_mem);
    kernel1.setArg(5, cubemap_info);
    kernel1.setArg(6, cubemap_images);
    kernel1.setArg(7, cl_results_mem);
    kernel_postprocess.setArg(0, cl_results_mem);
    kernel_postprocess.setArg(1, cl_luminance);
    kernel_postprocess.setArg(2, cl_image_mem);
    kernel_postprocess.setArg(3, cl_mask_mem);
    kernel2.setArg(0, cl_prim_mem);
}
void Contexts::run() {
    int nprim = para->kernel2_parameters.nprim;
    cl_float speed = para->kernel2_parameters.speed;
    cl_float4 speed_obj = para->kernel2_parameters.speed_obj;
    cl_float16 cl_viewTransform;
    for(int i=0; i<16; i++)
        cl_viewTransform.s[i] = para->viewTransform[i];
    //para->kernel1_parameters.offset = offset;
    //para->kernel1_parameters.offset2 = offset2;
    // Per-frame arguments: camera transform, parameter block, and pixel offsets.
    kernel1.setArg(8, cl_viewTransform);
    kernel1.setArg(9, para->kernel1_parameters);
    kernel1.setArg(10, offset);
    kernel_postprocess.setArg(4, para->kernel1_parameters);
    kernel_postprocess.setArg(5, offset);
    kernel_postprocess.setArg(6, offset2);
    //kernel1.setArg(11, offset2);
    cl::NDRange local_size = cl::NDRange(local_work_size);
    if(local_work_size == 0) {
        // Let the driver choose the work-group size.
        local_size = cl::NullRange;
    }
    // Trace kernel; the profiling timestamps below are in nanoseconds.
    queue.enqueueNDRangeKernel(kernel1, cl::NullRange, cl::NDRange(size), local_size, NULL, &clevent);
    queue.finish();
    cl_ulong time_start, time_end;
    time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    run_time = (float)(time_end - time_start);
    //post_process
    queue.enqueueNDRangeKernel(kernel_postprocess, cl::NullRange, cl::NDRange(size), local_size, NULL, &clevent);
    queue.finish();
    time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    run_time += (float)(time_end - time_start);
    //printf("run time %f, run time2 %f\n", run_time, run_time2);
    //kernel2
    kernel2.setArg(1, speed);
    kernel2.setArg(2, speed_obj);
    queue.enqueueNDRangeKernel(kernel2, cl::NullRange, cl::NDRange(nprim), cl::NullRange, NULL, &clevent);
    queue.finish();
    time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    run_time += (float)(time_end - time_start);
    if(para->getoutput_sw) {
        if(!para->surfacecpy_sw) {
            // Copy the image straight into the SDL surface.
            if(SDL_MUSTLOCK(para->surface)) {
                if(SDL_LockSurface(para->surface) < 0) return;
            }
            queue.enqueueReadBuffer(cl_image_mem, CL_TRUE, 0, sizeof(cl_uchar4)*size, (int*)para->surface->pixels + offset, NULL, &clevent);
            queue.finish();
            if(SDL_MUSTLOCK(para->surface))
                SDL_UnlockSurface(para->surface);
        }
        else {
            // Copy into the mapped pinned buffer instead.
            queue.enqueueReadBuffer(cl_image_mem, CL_TRUE, 0, sizeof(cl_uchar4)*size, (int*)image, NULL, &clevent);
            queue.finish();
        }
        queue.enqueueReadBuffer(cl_luminance, CL_TRUE, 0, sizeof(cl_float)*size2, luminance, NULL, &clevent);
        queue.finish();
    }
}
Do you expect us to guess blindly? The hardware has become more powerful, but we don't know what your tracer looks like or how it works. Your benchmark times look small, apparently in the ms range, and would fluctuate too randomly to judge actual performance accurately. – Cubic
I realize I may not have provided enough information to answer this question. However, could this be a known issue with OpenCL and Nvidia? I may post my code soon, but it is quite complex. The time difference is not statistical fluctuation. I run at many frames per second (close to 90 FPS on the GTX 580), and I quoted an average; the frame rates are clearly different. The scene can be dynamic (moving objects), but for the benchmark the scene is static. –
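(A hedged sketch of what that averaging can look like, assuming a Contexts instance whose run() accumulates the profiled nanosecond deltas into run_time as in the code above; the names here are illustrative:)

// Illustrative fragment (not self-contained): assumes a Contexts instance
// "ctx" whose run() sums the per-frame (end - start) profiling deltas, in
// nanoseconds, into ctx.run_time.
const int frames = 1000;
double total_ns = 0.0;
for (int f = 0; f < frames; f++) {
    ctx.run();                 // one frame: trace + post_process + transform
    total_ns += ctx.run_time;  // per-frame kernel time in nanoseconds
}
printf("mean kernel time: %.3f ms over %d frames\n", (total_ns / frames) * 1e-6, frames);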
Saying "the GPU is better" is a wild guess; everything may not be better. Suppose GPU A has 100 cores and GPU B has 1000 cores, but the memory bus is the same. Then performing scattered reads over that bus at the same time will bog down GPU B rather than GPU A. It all comes down to the kernel you are using, and whether it is generic enough to adapt and gain the extra performance of newer GPUs. – DarkZeros
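To make that point concrete: toy OpenCL C kernels like the following (illustrative, not from the tracer) do identical work but differ only in memory access pattern, and a device with many more cores but a similar memory bus can lose most of its advantage on the scattered version:

// Illustrative OpenCL C: identical work, different global-memory access patterns.
// Coalesced: adjacent work-items read adjacent addresses (bus-friendly).
__kernel void read_coalesced(__global const float* in, __global float* out) {
    int i = get_global_id(0);
    out[i] = in[i];
}
// Scattered: adjacent work-items read addresses far apart, defeating coalescing.
__kernel void read_scattered(__global const float* in, __global float* out, int n) {
    int i = get_global_id(0);
    int j = (int)(((long)i * 97) % n); // prime stride spreads reads across the buffer
    out[i] = in[j];
}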