2011-04-04 55 views
1

我正在学习 OpenCL,尝试测试工作项的各个维度,但结果不符合预期(OpenCL 的维度对我不起作用)。下面是内核代码:

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 

/*
 * Per-work-group record written by tKernel for one NDRange dimension.
 * The array length (4) has to agree with the struct declared on the host side.
 */
struct _data {
    /* id of the work-group that filled this record */
    unsigned long wId;
    /* global work-item ids, stored at the position of each item's local id */
    unsigned long iId[4];
};

typedef struct _data DATA;

/*
 * Record one work-item's ids for a single NDRange dimension.
 *
 * slots     - base of the per-work-group DATA array for that dimension
 * group_id  - work-group id of the calling item in that dimension
 * local_id  - local id of the calling item in that dimension
 * global_id - global id of the calling item in that dimension
 *
 * The record at index group_id receives the group id, and iId[local_id]
 * receives the global id. The ids are kept as size_t (the type the work-item
 * functions return) instead of being narrowed to int.
 */
void record_ids(__global DATA *slots, size_t group_id, size_t local_id, size_t global_id)
{
    __global DATA *slot = slots + group_id;

    slot->wId = group_id;

    /* iId has a fixed length; ignore local ids that would write past its end. */
    if (local_id < sizeof(slot->iId) / sizeof(slot->iId[0])) {
        slot->iId[local_id] = global_id;
    }
}

/*
 * For each of the three NDRange dimensions, store the calling work-item's
 * group id and global id into the matching output array.
 *
 * x, y, z - output arrays for dimensions 0, 1 and 2; each is indexed by the
 *           work-group id of its dimension, so each must hold at least as
 *           many DATA records as there are work-groups in that dimension.
 */
__kernel void tKernel(__global DATA *x, __global DATA *y, __global DATA *z) {
    record_ids(x, get_group_id(0), get_local_id(0), get_global_id(0));
    record_ids(y, get_group_id(1), get_local_id(1), get_global_id(1));
    record_ids(z, get_group_id(2), get_local_id(2), get_global_id(2));
}

这里是我的主机代码:

#include <stdio.h> 
#include <stdlib.h> 
#include <assert.h> 

#ifdef __APPLE__ 
#include <OpenCL/opencl.h> 
#else 
#include <CL/cl.h> 
#endif 

/* Number of work-items launched along each used NDRange dimension. */
#define GLOBAL_ITEM_SIZE (16)
/* Work-items per work-group along each used dimension; also the length of DATA.iId. */
#define LOCAL_ITEM_SIZE (4)
/* Upper bound, in bytes, on the kernel source read from disk. */
#define MAX_SOURCE_SIZE (0x100000)

/*
 * Host-side mirror of the record the kernel fills in for each work-group.
 * NOTE(review): the kernel declares the same fields as `unsigned long`;
 * this presumably relies on host and device agreeing on that type's size
 * and on the array length -- confirm for the target platform.
 */
struct _data {
    unsigned long wId;                  /* work-group id */
    unsigned long iId[LOCAL_ITEM_SIZE]; /* global ids, indexed by local id */
};

typedef struct _data DATA;

int main() 
{ 
    cl_platform_id platform_id = NULL; 
    cl_device_id device_id = NULL; 
    cl_context context = NULL; 
    cl_command_queue command_queue = NULL; 
    cl_mem xMobj = NULL; 
    cl_mem yMobj = NULL; 
    cl_mem zMobj = NULL; 
    cl_program program = NULL; 
    cl_kernel kernel = NULL; 
    cl_uint ret_num_devices; 
    cl_uint ret_num_platforms; 
    cl_int ret; 

    size_t group_size = GLOBAL_ITEM_SIZE/LOCAL_ITEM_SIZE; 
    DATA x[group_size]; 
    DATA y[group_size]; 
    DATA z[group_size]; 


    FILE *fp; 
    const char fileName[] = "./kernel.cl"; 
    size_t source_size; 
    char *source_str; 

    /* Load kernel source file */ 
    fp = fopen(fileName, "r"); 
    if (!fp) { 
     fprintf(stderr, "Failed to load kernel.\n"); 
     exit(EXIT_FAILURE); 
    } 
    source_str = (char *)malloc(MAX_SOURCE_SIZE); 
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); 
    fclose(fp); 

    /* Get Platform/Device Information */ 
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); 
    assert(ret == CL_SUCCESS); 
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); 
    assert(ret == CL_SUCCESS); 

    /* Create OpenCL Context */ 
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); 

    /* Create command queue */ 
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret); 

    /* Create Buffer Objects */ 
    xMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(x), NULL, &ret); 
    yMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(y), NULL, &ret); 
    zMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(z), NULL, &ret); 

    /* Create kernel program from source file */ 
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); 
    assert(ret == CL_SUCCESS); 
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); 
    if (ret != CL_SUCCESS) { 
     printf("\nFail to build the program\n"); 
     char buffer[10240]; 
     clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); 
     printf("%s\n", buffer); 
     exit(EXIT_FAILURE); 
    } 

    /* Create data parallel OpenCL kernel */ 
    kernel = clCreateKernel(program, "tKernel", &ret); 
    assert(ret == CL_SUCCESS); 

    /* Set OpenCL kernel arguments */ 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&xMobj); 
    assert(ret == CL_SUCCESS); 
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&yMobj); 
    assert(ret == CL_SUCCESS); 
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&zMobj); 
    assert(ret == CL_SUCCESS); 

    size_t global_item_size[3] = {GLOBAL_ITEM_SIZE, GLOBAL_ITEM_SIZE, 1}; 
    size_t local_item_size[3] = {LOCAL_ITEM_SIZE, LOCAL_ITEM_SIZE, 1}; 

    /* Execute OpenCL kernel as data parallel */ 
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
          global_item_size, local_item_size, 0, NULL, NULL); 
    if (ret == CL_INVALID_WORK_GROUP_SIZE) { 
     printf("Invalid work group size: error when compute group size: %d/%d", GLOBAL_ITEM_SIZE, LOCAL_ITEM_SIZE); 
     exit(EXIT_FAILURE); 
    } 

    /* Transfer result to host */ 
    ret = clEnqueueReadBuffer(command_queue, xMobj, CL_TRUE, 0, sizeof(x), x, 0, NULL, NULL); 
    assert(ret == CL_SUCCESS); 
    ret = clEnqueueReadBuffer(command_queue, yMobj, CL_TRUE, 0, sizeof(y), y, 0, NULL, NULL); 
    printf("%d\n", ret); 
    assert(ret == CL_SUCCESS); 
    ret = clEnqueueReadBuffer(command_queue, zMobj, CL_TRUE, 0, sizeof(z), z, 0, NULL, NULL); 
    assert(ret == CL_SUCCESS); 

    /* Display Results */ 
    int i; 
    int j; 
    printf("X dimension:\n"); 
    for (i = 0; i < group_size; i++) { 
     printf("%d: -> group_id %lu ~> work_item_ids: ", i, x[i].wId); 
     for (j = 0; j < LOCAL_ITEM_SIZE; j++) 
      printf("%2lu, ", x[i].iId[j]); 
     printf("\n"); 
    } 
    printf("\n"); 

    printf("Y dimension:\n"); 
    for (i = 0; i < group_size; i++) { 
     printf("%d: -> group_id %lu ~> work_item_ids: ", i, y[i].wId); 
     for (j = 0; j < LOCAL_ITEM_SIZE; j++) 
      printf("%2lu, ", y[i].iId[j]); 
     printf("\n"); 
    } 
    printf("\n"); 

    printf("Z dimension:\n"); 
    for (i = 0; i < group_size; i++) { 
     printf("%d: -> group_id %lu ~> work_item_ids: ", i, z[i].wId); 
     for (j = 0; j < LOCAL_ITEM_SIZE; j++) 
      printf("%2lu, ", z[i].iId[j]); 
     printf("\n"); 
    } 

    /* Finalization */ 
    ret = clFlush(command_queue); 
    ret = clFinish(command_queue); 
    ret = clReleaseKernel(kernel); 
    ret = clReleaseProgram(program); 
    ret = clReleaseMemObject(xMobj); 
    ret = clReleaseCommandQueue(command_queue); 
    ret = clReleaseContext(context); 

    free(source_str); 

    return 0; 
} 

我在 Mac OS X 10.6 上构建了这段代码。输出为:

X dimension: 
0: -> group_id 0 ~> work_item_ids: 0, 1, 2, 3, 
1: -> group_id 1 ~> work_item_ids: 4, 5, 6, 7, 
2: -> group_id 2 ~> work_item_ids: 8, 9, 10, 11, 
3: -> group_id 3 ~> work_item_ids: 12, 13, 14, 15, 

Y dimension: 
0: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 
1: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 
2: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 
3: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 

Z dimension: 
0: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 
1: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 
2: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 
3: -> group_id 0 ~> work_item_ids: 0, 0, 0, 0, 

正如你所看到的,Y 维度是空的(Z 维度本来就应该是空的,因为我没有设置它)。我尝试在内核中把 x 的值写入参数 y,结果与 X 维度相同,所以内存拷贝没有问题。谁能帮帮我?

回答

1
/* Quoted from the question's host code: the third argument (work_dim) is 1 here. */
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
          global_item_size, local_item_size, 0, NULL, NULL); 

该调用的第三个参数是 NDRange 空间的维数,而您将其设置为 1,即只有一个维度。请将它设置为 2。

+0

谢谢,我把该参数改成了 2。但是还有一个问题:从设备向主机传输数据时,**ret = clEnqueueReadBuffer(command_queue, xMobj, CL_TRUE, 0, sizeof(x), x, 0, NULL, NULL);** 的返回值为 -36,即 **CL_INVALID_COMMAND_QUEUE** – 2011-04-05 16:10:38

+0

对不起,这个错误是我自己造成的:内核代码中结构体的数组大小与主机代码中结构体的数组大小不一致。现在没问题了。感谢您的帮助。 – 2011-04-05 16:19:15