Results 1 to 3 of 3

Thread: global & local size in 2D problem

  1. #1
    Junior Member
    Join Date
    Oct 2012
    Posts
    5

    global & local size in 2D problem

    i have also another question..

    I want to make sure if i understand the definition of size_t global_size[] and size_t local_size[] in 2 dimensions, for global and local work space!

    I have to read from buffer (from my first kernel) an array of QxN size , and from my second kernel an array of Qxk size! so i suppose that i have to define global_size[] = {Q,N} and local_size[] = {16,16} i think that the local size is similar like the block size in cuda so i choose 16x16 to be more appropriate !
    For my second kernel i define global_size[] = {Q,k} and local_size[] = {16,16}.

    So, when i run my program i give 3 arguments, size N, size Q and size k, if i try to create an array of
    {N=16, Q=16, k=16} or {N=64, Q=64, k=16} i don't have any problem, but if i try for {N=128, Q=128, k=16} or another combination i have the bug error -54 CL_INVALID_WORK_GROUP_SIZE , so i think that something i didn't understand so well, i would be grateful if someone help me to manage with that issue! I have read many blocks and sites but i post again to this page because i want an answer to my specific problem!

    thank you anyway!

    here is the code...

    //======================= GPU ==========================//

    /* Create device and context */
    device = create_device();
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

    if(err < 0) {
    perror("Couldn't create a context");
    exit(1);
    }

    /* Build program */
    program1 = build_program(context, device, PROGRAM_FILE);

    /* Create data buffer */
    int numQuery = numQueries;
    int D=numDim;

    double *data = training_set.data;
    double *Query = query_set.data;

    size_t global_size[] = {numQuery,numObjects};
    size_t local_size[] = {BLK_SIZE,BLK_SIZE};


    Matrix d_N;
    d_N.width = D; //num Dim
    d_N.height = numObjects; //num objects

    size_t sizeN = D * numObjects * sizeof(double);


    d_N.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeN, data, &err);

    Matrix d_Q;
    d_Q.width = D; //num Dim
    d_Q.height = numQuery; //num querries

    size_t sizeQ = numQuery * D * sizeof(double);

    d_Q.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeQ, Query, &err);

    //result
    Matrix d_result_Dist;
    d_result_Dist.width = numObjects;//num objects
    d_result_Dist.height = numQuery ;//num querries

    size_t sizeDist = numObjects * numQuery * sizeof(double);

    d_result_Dist.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist, Dist, &err);//mipws kapws prepei na xwthei to num of groups

    if(err < 0) {
    perror("Couldn't create a buffer");
    exit(1);
    };

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE , &err);
    if(err < 0){
    perror("Couldn't create a command queue 1");
    printf("error %d \n ", err);
    exit(1);
    };

    /* Create a kernel EDW THA PAREI TON PRWTO KERNEL */
    kernel_createDist = clCreateKernel(program1, "kernel_createDist", &err);
    if(err < 0) {
    perror("Couldn't create a kernel 1");
    printf("error %d \n ", err);
    exit(1);
    };


    /* Create kernel arguments TO LOCAL TO EXEI VALEI STO PARADEIGMA EPEIDI O KERNEL TOU EXEI ORISMA TYPOU _LOCAL */
    err = clSetKernelArg(kernel_createDist, 0, sizeof(d_N.elements),(void*) &d_N.elements);
    err = clSetKernelArg(kernel_createDist, 1, sizeof(d_Q.elements), (void*)&d_Q.elements);
    err = clSetKernelArg(kernel_createDist, 2, sizeof(d_result_Dist.elements), (void*)&d_result_Dist.elements);
    err = clSetKernelArg(kernel_createDist, 3, sizeof(d_N.width), (void*)&d_N.width);
    err = clSetKernelArg(kernel_createDist, 4, sizeof(d_result_Dist.width), (void*)&d_result_Dist.width);

    if(err < 0) {
    perror("Couldn't create a kernel argument 1");
    printf("error %d \n ", err);
    exit(1);
    }
    //wait complete queue
    clFinish(queue);

    /* Enqueue kernel */
    err = clEnqueueNDRangeKernel(queue, kernel_createDist, 2, NULL, global_size, local_size, 0, NULL, &event);

    if(err < 0) {
    perror("Couldn't enqueue the kernel 1 %d");
    printf("error %d \n ", err);
    exit(1);
    }
    // wait for sure to finish the kernel
    clWaitForEvents(1 , &event);

    // compute of time in GPU
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

    float executionTimeInMilliseconds1 = (end - start) * 1.0e-6f;
    printf("[OPENCL] Time elapsed for GPU first kernel: %f s\n", executionTimeInMilliseconds1);

    /* Read the kernel's output */
    err = clEnqueueReadBuffer(queue, d_result_Dist.elements, CL_TRUE, 0, sizeDist, Dist, 0, NULL, NULL);
    if(err < 0) {
    perror("Couldn't enqueue the kernel 1 %d");
    printf("error %d \n ", err);
    exit(1);
    }


    clReleaseMemObject(d_N.elements);
    clReleaseMemObject(d_Q.elements);
    clReleaseMemObject(d_result_Dist.elements);


    clReleaseKernel(kernel_createDist);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program1);
    clReleaseContext(context);

    //~ printf("----Dist gpu---\n");
    //~ printMat(Dist,numQuery,numObjects);
    //~

    //============================= SECOND KERNEL =================================//
    device = create_device();
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err < 0) {
    perror("Couldn't create a context 2");
    printf("error %d \n ", err);
    exit(1);
    }

    program2 = build_program(context, device, PROGRAM_FILE);

    size_t global_size2[] = {numQuery,k};
    size_t local_size2[] = {BLK_SIZE,BLK_SIZE};

    //input
    Matrix d_D;
    d_D.width = numObjects;//num objects
    d_D.height = numQuery ;//num querries

    size_t sizeDist1 = numObjects * numQuery * sizeof(double);

    d_D.elements = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist1, Dist, &err);



    //result data
    Matrix d_NNidx;
    d_NNidx.width = k;
    d_NNidx.height = numQuery; //num querries

    size_t sizeId = k * numQuery * sizeof(double);

    d_NNidx.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeId, NNidx, &err);

    //result data
    Matrix d_result;
    d_result.width = k;
    d_result.height = numQuery ;//num querries

    size_t sizeRe = k * numQuery * sizeof(double);

    d_result.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeRe, NNdist, &err);

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if(err < 0) {
    perror("Couldn't create a command queue 2");
    printf("error %d \n ", err);
    exit(1);
    };

    kernel_ParallelSorting = clCreateKernel(program2, "kernel_ParallelSorting", &err);

    if(err < 0) {
    perror("Couldn't create a kernel 2");
    printf("error %d \n ", err);
    exit(1);
    };

    //~ /* Create kernel arguments */
    err = clSetKernelArg(kernel_ParallelSorting, 0, sizeof(d_D.elements), (void*)&d_D.elements);

    err = clSetKernelArg(kernel_ParallelSorting, 1, sizeof(d_result.elements), (void*)&d_result.elements);

    err = clSetKernelArg(kernel_ParallelSorting, 2, sizeof(d_NNidx.elements), (void*)&d_NNidx.elements);

    err = clSetKernelArg(kernel_ParallelSorting, 3, sizeof(d_D.width), (void*)&d_D.width);

    err = clSetKernelArg(kernel_ParallelSorting, 4, sizeof(d_result.width), (void*)&d_result.width);


    if(err < 0) {
    perror("Couldn't create a kernel argument 2");
    printf("error num %d\n",err);
    exit(1);
    }

    clFinish(queue);

    //~ /* Enqueue kernel */
    err = clEnqueueNDRangeKernel(queue, kernel_ParallelSorting, 2, NULL, global_size2, local_size2, 0, NULL, &event);

    if(err < 0) {
    perror("Couldn't enqueue the kernel 2");
    printf("error %d \n ", err);
    exit(1);
    }

    clWaitForEvents(1 , &event);

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

    float executionTimeInMilliseconds2 = (end - start) * 1.0e-6f;
    printf("[OPENCL] Time elapsed for GPU first kernel: %f s\n", executionTimeInMilliseconds2);


    //read buffer 1
    err = clEnqueueReadBuffer(queue, d_result.elements , CL_TRUE, 0, sizeRe, NNdist, 0, NULL, NULL);

    if(err < 0) {
    perror("Couldn't read the buffer 2 for d_kDist");
    printf("error %d \n ", err);
    exit(1);
    }


    // read buffer 2
    err = clEnqueueReadBuffer(queue, d_NNidx.elements, CL_TRUE, 0, sizeId, NNidx, 0, NULL, NULL);

    if(err < 0) {
    perror("Couldn't read the buffer 2 for d_kDist");
    exit(1);
    }


    printf("+++++++++++++++++++++ knns GPU +++++++++++++++++++++++++\n");
    printMat(NNdist,numQueries,k);



    clReleaseMemObject(d_D.elements);
    clReleaseMemObject(d_result.elements);
    clReleaseMemObject(d_NNidx.elements);

    clReleaseKernel(kernel_ParallelSorting);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program2);

    clReleaseContext(context);

  2. #2
    Senior Member
    Join Date
    Aug 2011
    Posts
    271

    Re: global & local size in 2D problem

    Quote Originally Posted by pelliegia
    size_t global_size[] = {numQuery,numObjects};
    size_t local_size[] = {BLK_SIZE,BLK_SIZE};
    I know you mentioned numbers in your question, but what are all these values actually set to here?

    Hint: global_size[x] needs to be an integer multiple of local_size[x]

    The error code you're receiving is suggesting that this is not the case (or that BLK_SIZE is too big for the device).

  3. #3
    Senior Member
    Join Date
    Dec 2011
    Posts
    163

    Re: global & local size in 2D problem

    Total local sizes must be less than CL_DEVICE_MAX_WORK_ITEM_SIZES (per dimension), and the total work group size (multiply the local sizes together) must be less than CL_DEVICE_MAX_WORK_GROUP_SIZE and CL_KERNEL_WORK_GROUP_SIZE to work. Supposedly they will be faster if the work group size is a multiple of CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE but this isn't always true.

    The local size can be left unspecified and the driver will use something legal that works. Be aware that for some global sizes, this could be 1,1 which will be inefficient. But its a good way to get things working before getting fancy.

    The global size must be a multiple of the local size. Therefore, you need to round up the global size in some cases. Also, then you need to do a check in your kernels to only process if the global_id values are less than your actual desired size (passed as arguments).

    Messy stuff that should have just been handled by the runtime...

Similar Threads

  1. OpenCL Ndrange Global Size/Local Size
    By BiS in forum OpenCL
    Replies: 4
    Last Post: 01-04-2013, 07:52 AM
  2. copy from global memory to local memory..problem
    By phoebe0105 in forum OpenCL
    Replies: 3
    Last Post: 06-03-2010, 02:14 PM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •