Results 1 to 6 of 6

Thread: OpenCL on MacBook Pro with NVidia 320m

  1. #1
    Junior Member
    Join Date
    Jan 2011
    Posts
    3

    OpenCL on MacBook Pro with NVidia 320m

    Hey all

    I'm trying to set up an application that runs some calculations on my video card. The problem is that my CPU is much faster than the GPU.

    When i start the program, i get the following msg:
    Connecting to NVIDIA GeForce 320M,
    max_compute_units: 6
    max_work_groub_size: 512
    max_work_item_dimensions: 3
    Does it work correctly on any of your systems? If so, where is my mistake?

    The Kernel:
    // example.cl -- toy kernel: answer[i] = a[i] + b[i], followed by a block
    // of arithmetic that multiplies and then divides by the same factor, so
    // the final value is still (approximately) a[i] + b[i].  The arithmetic
    // exists only to give the device measurable work.
    __kernel void
    add(__global float *a,
        __global float *b,
        __global float *answer)
    {
        int gid = get_global_id(0);

        // Read each operand from __global memory exactly once.  The original
        // re-read a[gid]/b[gid] and read-modify-wrote answer[gid] in every
        // statement, and that global-memory traffic is what made the GPU
        // slower than the CPU here.
        float va = a[gid];
        float vb = b[gid];

        // Loop-invariant factor, computed once.  The f-suffixed literals keep
        // the math in single precision; the original's double literals force
        // double arithmetic, which this GPU (GeForce 320M) does not support.
        float factor = 0.46f * 0.48f * 6.54f * 4.21f *
                       (10.56f * sin(va) + 3.47f * cos(vb) * vb * va);

        float result = va + vb;

        // Same workload as the original: 12 multiply/divide pairs.
        for (int i = 0; i < 12; i++) {
            result *= factor;
            result /= factor;
        }

        answer[gid] = result;
    }
    My main.c:
    #ifdef __APPLE__
    #include <OpenGL/OpenGL.h>
    #include <GLUT/glut.h>
    //#include <OpenGL/glu.h>
    #else
    #include <GL/glut.h>
    //#include <GL/glu.h>
    #endif

    #include <OpenCL/OpenCL.h>
    #include <iostream>
    #include <assert.h>
    #include <sys/sysctl.h>
    #include <sys/stat.h>
    #include <stdlib.h>
    #include <stdio.h>


    #pragma mark -
    #pragma mark Utilities
    char * load_program_source(const char *filename)
    {
        // Read the entire file into a freshly malloc'd, NUL-terminated
        // buffer.  Returns NULL on any error (missing file, OOM, short
        // read); the caller owns -- and must free() -- the result.
        struct stat statbuf;
        FILE *fh;
        char *source;

        fh = fopen(filename, "r");
        if (fh == NULL)
            return NULL;

        // Check stat() instead of assuming it succeeds; a failure would
        // otherwise leave statbuf.st_size uninitialized.
        if (stat(filename, &statbuf) != 0) {
            fclose(fh);
            return NULL;
        }

        source = (char *) malloc(statbuf.st_size + 1);
        if (source == NULL) {
            fclose(fh);
            return NULL;
        }

        // Element size 1 so the return value is the byte count read.
        if (fread(source, 1, statbuf.st_size, fh) != (size_t) statbuf.st_size) {
            free(source);
            fclose(fh);
            return NULL;
        }
        source[statbuf.st_size] = '\0';

        fclose(fh);   // BUG FIX: the original leaked the FILE handle
        return source;
    }

    #pragma mark -
    #pragma mark Main OpenCL Routine
    int runCL(float * a, float * b, float * results, int n)
    {
        // Run the "add" OpenCL kernel over the n-element arrays a and b and
        // copy the device output into `results`.  Prefers the GPU device and
        // falls back to the CPU device only when no CL-capable GPU exists.
        // Returns CL_SUCCESS; CL errors abort via assert().
        cl_program program[1];
        cl_kernel kernel[1];

        cl_command_queue cmd_queue;
        cl_context context;

        cl_device_id cpu = NULL, device = NULL;

        cl_int err = 0;
        size_t returned_size = 0;
        size_t buffer_size;

        cl_mem a_mem, b_mem, ans_mem;

    #pragma mark Device Information
        {
            // Find the CPU CL device, as a fallback
            err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
            assert(err == CL_SUCCESS);

            // Find the GPU CL device -- this is what we really want.
            // BUG FIX: the original had this fallback condition commented out
            // and executed `device = cpu;` unconditionally, so every run used
            // the CPU device regardless of GPU availability.
            err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
            if (err != CL_SUCCESS)
                device = cpu;
            assert(device);

            // Query and print some information about the chosen device
            cl_char vendor_name[1024] = {0};
            cl_char device_name[1024] = {0};
            cl_uint max_compute_units = 0;
            size_t max_work_groub_size = 0;
            cl_uint max_work_item_dimensions = 0;

            err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name),
                                  vendor_name, &returned_size);
            err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
                                   device_name, &returned_size);
            err |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units),
                                   &max_compute_units, &returned_size);
            err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_groub_size),
                                   &max_work_groub_size, &returned_size);
            err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions),
                                   &max_work_item_dimensions, &returned_size);

            assert(err == CL_SUCCESS);
            // %u matches cl_uint (unsigned); the original used %d
            printf("Connecting to %s %s, \nmax_compute_units: %u\nmax_work_groub_size: %zu \nmax_work_item_dimensions: %u...\n", vendor_name, device_name, max_compute_units, max_work_groub_size, max_work_item_dimensions);
        }

    #pragma mark Context and Command Queue
        {
            // Create a context to perform our calculation with the chosen device
            context = clCreateContext(0, 1, &device, NULL, NULL, &err);
            assert(err == CL_SUCCESS);

            // And also a command queue for the context.
            // BUG FIX: the original passed NULL as the error pointer and never
            // checked whether queue creation succeeded.
            cmd_queue = clCreateCommandQueue(context, device, 0, &err);
            assert(err == CL_SUCCESS);
        }

    #pragma mark Program and Kernel Creation
        {
            // Load the program source from disk.  The kernel lives in the
            // project directory and in Xcode the executable is set to launch
            // from that directory, hence the relative path.
            const char * filename = "example.cl";
            char *program_source = load_program_source(filename);
            assert(program_source != NULL);   // missing/unreadable kernel file
            program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
                                                   NULL, &err);
            assert(err == CL_SUCCESS);
            free(program_source);             // BUG FIX: source buffer was leaked

            err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
            assert(err == CL_SUCCESS);

            // Create the kernel "object" for the "add" entry point
            kernel[0] = clCreateKernel(program[0], "add", &err);
            assert(err == CL_SUCCESS);        // BUG FIX: err was ignored here
        }

    #pragma mark Memory Allocation
        {
            // Allocate device memory for the inputs and the result
            buffer_size = sizeof(float) * n;

            // Input array a (blocking write so the host buffer can be reused)
            a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
            err = clEnqueueWriteBuffer(cmd_queue, a_mem, CL_TRUE, 0, buffer_size,
                                       (void*)a, 0, NULL, NULL);

            // Input array b
            b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
            err |= clEnqueueWriteBuffer(cmd_queue, b_mem, CL_TRUE, 0, buffer_size,
                                        (void*)b, 0, NULL, NULL);

            assert(err == CL_SUCCESS);

            // Results array
            ans_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);

            // Make sure everything is written and allocated
            clFinish(cmd_queue);
        }

    #pragma mark Kernel Arguments
        {
            // Bind the three buffers to the kernel's parameters
            err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &a_mem);
            err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &b_mem);
            err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &ans_mem);

            assert(err == CL_SUCCESS);
        }

    #pragma mark Execution and Read
        {
            // Enqueue the kernel (one work-item per array element) and wait
            // for the command queue to drain
            size_t global_work_size = n;
            err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL,
                                         &global_work_size, NULL, 0, NULL, NULL);
            assert(err == CL_SUCCESS);
            clFinish(cmd_queue);

            // Blocking read of the answer buffer into the results array
            err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, buffer_size,
                                      results, 0, NULL, NULL);
            assert(err == CL_SUCCESS);
            clFinish(cmd_queue);
        }

    #pragma mark Teardown
        {
            clReleaseMemObject(a_mem);
            clReleaseMemObject(b_mem);
            clReleaseMemObject(ans_mem);

            // BUG FIX: kernel and program objects were never released
            clReleaseKernel(kernel[0]);
            clReleaseProgram(program[0]);

            clReleaseCommandQueue(cmd_queue);
            clReleaseContext(context);
        }
        return CL_SUCCESS;
    }



    int main(int argc, char **argv) {
        // Driver: build two n-element input arrays, run the OpenCL kernel,
        // and print the last result as a sanity check.
        // Problem size (a much larger size was used for earlier experiments)
        int n = 40;

        // Allocate the inputs and a place for the results
        float * a = (float *)malloc(n*sizeof(float));
        float * b = (float *)malloc(n*sizeof(float));
        float * results = (float *)malloc(n*sizeof(float));
        if (a == NULL || b == NULL || results == NULL) {   // BUG FIX: mallocs were unchecked
            fprintf(stderr, "out of memory\n");
            free(a);
            free(b);
            free(results);
            return 1;
        }

        // Fill in the values: a[i] = i, b[i] = n - i, so a[i] + b[i] == n
        for (int i = 0; i < n; i++) {
            a[i] = (float)i;
            b[i] = (float)n-i;
            results[i] = 0.f;
        }

        // Do the OpenCL calculation
        runCL(a, b, results, n);

        // Print the last result as a sanity check
        printf("%f\n", results[n-1]);

        // Free up memory
        free(a);
        free(b);
        free(results);

        return 0;
    }
    thank you

  2. #2
    Member
    Join Date
    Mar 2010
    Location
    Raleigh, NC
    Posts
    55

    Re: OpenCL on MacBook Pro with NVidia 320m

    What information do you have saying the CPU is faster than your GPU?

    Also, that message is a good message - it means that you have initialized your hardware correctly.

    The main thing I see that might be problematic is the number of memory accesses you have in the kernel. IIRC, global memory can be fickle: depending on the implementation, it may actually be stored in main RAM instead of on the GPU, causing a huge amount of communication to occur. <Note, I could be wrong on this - David.Garcia should correct me here>.

    You could try and improve this by something along the lines of:

    Code :
    __kernel void add(__global float *a, __global float *b, __global float *answer)
    {
    int gid = get_global_id(0);
    int iValA = a[gid];
    int iValB = b[gid];
     
    ... Rest of your code...

    This more or less would store a value of a[] and b[] for that gid point on the card. Thus, it would minimize the number of accesses you are doing. If the communication overhead is really your issue, this should take care of it.

    I don't see anything else glaring at me for this issue. I'll take a look at it tonight when I am at home and can run it on my non-Mac desktop to see if I can reproduce your slowness.

  3. #3

    Re: OpenCL on MacBook Pro with NVidia 320m

    A good compiler will optimize for those memory accesses and use a local variable as HolyGeneralK suggested, but i wouldn't try to rely on that. If, however, the low-level vm code is making those accesses on-the-fly and executing all these computations sequentially, it wouldn't surprise me that the CPU version is faster, as memory accesses are cheaper and sequential functions more efficient on CPU than GPU.

    To make it more GPU-friendly, try something like

    Code :
    int gid = get_global_id(0);
    float valA = a[gid];
    float valB = b[gid];
    float resultValA;
    float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
    float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations
     
    resultValA = valA + valB;
    resultValA *= multiplier;
    resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
    // ... repeat lines above as needed
     
    answer[gid] = resultValA;

    only three memory accesses and far less sequential computations (again, this may be done by the compiler, but not necessarily)

  4. #4
    Junior Member
    Join Date
    Jan 2011
    Posts
    3

    Re: OpenCL on MacBook Pro with NVidia 320m

    Hey all

    The short version is: chai's tip is working very well.

    i did a little bit of modification on my code above and also tried the following 3 kernels:

    kernel 1:
    Code :
    // Benchmark kernel 1: multiplies and then divides answer[gid] by the same
    // factor each iteration, so the final value stays a[gid] + b[gid]; the
    // loop exists purely as arithmetic load.  Deliberately unoptimized --
    // every statement re-reads a[]/b[] and read-modify-writes answer[]
    // through __global memory, which is why this version times poorly on
    // the GPU (see the measurements below).
    __kernel void
    add(__global float *a,
                 __global float *b,
                 __global float *answer)
    {
            int gid = get_global_id(0);
            answer[gid] = a[gid] + b[gid];
    	for (int i = 1; i < 100000; i++) {
                answer[gid] *= i*(10.56*sin(a[gid]) + 3.47 * cos(b[gid])*b[gid]*a[gid]);
                answer[gid] /= i*(10.56*sin(a[gid]) + 3.47 * cos(b[gid])*b[gid]*a[gid]);
            }
    }

    kernel 2 (inspired by HolyGeneralK):
    Code :
    // Benchmark kernel 2: the inputs are cached in private variables
    // (iValA/iValB) so a[] and b[] are read from __global memory only once,
    // but answer[gid] is still read-modify-written through __global memory
    // on every loop iteration -- the remaining bottleneck on the GPU.
    __kernel void
    add(__global float *a,
                 __global float *b,
                 __global float *answer)
    {
            int gid = get_global_id(0);
            float iValA = a[gid];
    	float iValB = b[gid];
            answer[gid] = iValA + iValB;
            for (int i = 1; i < 100000; i++) {
                answer[gid] *= i*(10.56*sin(iValA) + 3.47 * cos(iValB)*iValB*iValA);
                answer[gid] /= i*(10.56*sin(iValA) + 3.47 * cos(iValB)*iValB*iValA);
    	}
    }

    kernel 3 (inspired by chai):
    Code :
    // Benchmark kernel 3: all work happens in private registers -- operands
    // are read once, the loop-invariant factor is hoisted, and __global
    // memory is touched exactly three times (two reads, one final write).
    // This is the version where the GPU finally beats the CPU.
    __kernel void
    add( __global float *a,
         __global float *b,
         __global float *answer)
    {
            int gid = get_global_id(0);
            float valA = a[gid];
            float valB = b[gid];
            float resultValA;
            float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
            float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations
 
    	resultValA = valA + valB;
    	for (int i = 1; i < 100000; i++) {
                resultValA *= multiplier;
                resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
    	}
 
            answer[gid] = resultValA;
    }

    Well, the main function now calls runCL(...) with both the GPU and the CPU version. It also measures the elapsed time in seconds. Here are the outputs:

    with kernel 1:
    n = 200
    ================================================== ========
    GPU...
    Connecting to NVIDIA GeForce 320M,
    max_compute_units: 6
    max_work_groub_size: 512
    max_work_item_dimensions: 3...


    seconds = 9
    ---------------------------------------------------------
    CPU...
    Connecting to Intel Intel(R) Core(TM)2 Duo CPU P8600 @ 2.40GHz,
    max_compute_units: 2
    max_work_groub_size: 1
    max_work_item_dimensions: 3...


    seconds = 1
    ---------------------------------------------------------
    with n = 2000 i get the following
    Code :
    Assertion failed: (err == CL_SUCCESS), function runCL, file main.cc, line 199.
    Abort trap

    with kernel 2:
    n = 2000
    ================================================== ========
    GPU...
    Connecting to NVIDIA GeForce 320M,
    max_compute_units: 6
    max_work_groub_size: 512
    max_work_item_dimensions: 3...


    seconds = 13
    ---------------------------------------------------------
    CPU...
    Connecting to Intel Intel(R) Core(TM)2 Duo CPU P8600 @ 2.40GHz,
    max_compute_units: 2
    max_work_groub_size: 1
    max_work_item_dimensions: 3...


    seconds = 4
    ---------------------------------------------------------
    for n = 20000 i get the following:
    Code :
    Assertion failed: (err == CL_SUCCESS), function runCL, file main.cc, line 199.
    Abort trap


    with kernel 3:
    n = 200000
    ================================================== ========
    GPU...
    Connecting to NVIDIA GeForce 320M,
    max_compute_units: 6
    max_work_groub_size: 512
    max_work_item_dimensions: 3...


    seconds = 13
    ---------------------------------------------------------
    CPU...
    Connecting to Intel Intel(R) Core(TM)2 Duo CPU P8600 @ 2.40GHz,
    max_compute_units: 2
    max_work_groub_size: 1
    max_work_item_dimensions: 3...


    seconds = 61
    ---------------------------------------------------------

    ok, now the main.cc:
    Code :
    #ifdef __APPLE__
    #include <OpenCL/OpenCL.h>
    #else
    #include <CL/cl.h>
    #endif
     
     
    #include <iostream>
    #include <assert.h>
    #include <sys/sysctl.h>
    #include <sys/stat.h>
    #include <stdlib.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/time.h>
    #include <time.h>
     
     
     
    #pragma mark -
    #pragma mark Utilities
    char * load_program_source(const char *filename)
    {
      // Read the entire file into a freshly malloc'd, NUL-terminated buffer.
      // Returns NULL on any error (missing file, OOM, short read); the
      // caller owns -- and must free() -- the result.
      struct stat statbuf;
      FILE *fh;
      char *source;
 
      fh = fopen(filename, "r");
      if (fh == NULL)
        return NULL;
 
      if (stat(filename, &statbuf) != 0) {   // don't assume stat() succeeds
        fclose(fh);
        return NULL;
      }
 
      source = (char *) malloc(statbuf.st_size + 1);
      if (source == NULL) {
        fclose(fh);
        return NULL;
      }
 
      // Element size 1 so the return value is the byte count read
      if (fread(source, 1, statbuf.st_size, fh) != (size_t) statbuf.st_size) {
        free(source);
        fclose(fh);
        return NULL;
      }
      source[statbuf.st_size] = '\0';
 
      fclose(fh);                            // BUG FIX: handle was leaked
      return source;
    }
     
     
    #pragma mark -
    #pragma mark Main OpenCL Routine
    int runCL(float * a, float * b, float * results, int n, int dev)
    {
      cl_program program[1];
      cl_kernel kernel[1];
     
      cl_command_queue cmd_queue;
      cl_context   context;
     
      cl_device_id cpu = NULL, device = NULL;
     
      cl_int err = 0;
      size_t returned_size = 0;
      size_t buffer_size;
     
      cl_mem a_mem, b_mem, ans_mem;
     
    #pragma mark Device Information
      {    
        // Find the CPU CL device, as a fallback
        //26:00
        err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
        assert(err == CL_SUCCESS);
     
        // Find the GPU CL device, this is what we really want
        // If there is no GPU device is CL capable, fall back to CPU
        err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
        if (err != CL_SUCCESS || dev > 0)
          device = cpu;
        assert(device);
     
        // Get some information about the returned device
        cl_char vendor_name[1024] = {0};
        cl_char device_name[1024] = {0};
        cl_uint max_compute_units = 0;
        size_t max_work_groub_size = 0;
        cl_uint max_work_item_dimensions = 0;
     
     
        //27:00
        err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), 
    			  vendor_name, &returned_size);
     
        err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), 
    			   device_name, &returned_size);
     
        err |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), 
    			   &max_compute_units, &returned_size);
     
        err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_groub_size), 
    			   &max_work_groub_size, &returned_size);
     
        err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions), 
    			   &max_work_item_dimensions, &returned_size);
     
        assert(err == CL_SUCCESS);
        printf("Connecting to %s %s, \nmax_compute_units: %d\nmax_work_groub_size: %zu \nmax_work_item_dimensions: %d...\n", vendor_name, device_name, max_compute_units, max_work_groub_size, max_work_item_dimensions);
      }
     
     
    #pragma mark Context and Command Queue
      {
        // Now create a context to perform our calculation with the 
        // specified device 
        context = clCreateContext(0, 1, &device, NULL, NULL, &err);
        assert(err == CL_SUCCESS);
     
        // And also a command queue for the context
        cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
      }
     
     
    #pragma mark Program and Kernel Creation
      {
        // Load the program source from disk
        // The kernel/program is the project directory and in Xcode the executable
        // is set to launch from that directory hence we use a relative path
        const char * filename = "example3.cl";
        char *program_source = load_program_source(filename);
        program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
    					   NULL, &err);
     
        assert(err == CL_SUCCESS);
     
        // 28:40
        err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
        assert(err == CL_SUCCESS);
     
        // Now create the kernel "objects" that we want to use in the example file 
        kernel[0] = clCreateKernel(program[0], "add", &err);
      }
     
     
    #pragma mark Memory Allocation
      {
     
        // Allocate memory on the device to hold our data and store the results into
        buffer_size = sizeof(float) * n;
     
        // Input array a
        //30:10
        a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
     
        //32:20
        err = clEnqueueWriteBuffer(cmd_queue, a_mem, CL_TRUE, 0, buffer_size,
    			       (void*)a, 0, NULL, NULL);
     
        // Input array b
        b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
     
        err |= clEnqueueWriteBuffer(cmd_queue, b_mem, CL_TRUE, 0, buffer_size,
    				(void*)b, 0, NULL, NULL);
     
        assert(err == CL_SUCCESS);
     
        // Results array
        ans_mem= clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); 
     
        // Get all of the stuff written and allocated 
        clFinish(cmd_queue);
      }
     
     
    #pragma mark Kernel Arguments
      {
     
        // Now setup the arguments to our kernel
        //33:48
        err  = clSetKernelArg(kernel[0],  0, sizeof(cl_mem), &a_mem);
        err |= clSetKernelArg(kernel[0],  1, sizeof(cl_mem), &b_mem);
        err |= clSetKernelArg(kernel[0],  2, sizeof(cl_mem), &ans_mem);
     
        assert(err == CL_SUCCESS);
     
      }
     
     
    #pragma mark Execution and Read
      {
     
        // Run the calculation by enqueuing it and forcing the 
        // command queue to complete the task
        size_t global_work_size = n;
        //33:59
        err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, 
    				 &global_work_size, NULL, 0, NULL, NULL);
     
     
        assert(err == CL_SUCCESS);
        clFinish(cmd_queue);
     
        // Once finished read back the results from the answer 
        // array into the results array
        //35:35
        err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, buffer_size, 
    			      results, 0, NULL, NULL);
     
        assert(err == CL_SUCCESS);
        clFinish(cmd_queue);
     
      }
     
     
    #pragma mark Teardown
      {
        clReleaseMemObject(a_mem);
        clReleaseMemObject(b_mem);
        clReleaseMemObject(ans_mem);
     
        clReleaseCommandQueue(cmd_queue);
        clReleaseContext(context);
      }
      return CL_SUCCESS;
    }
     
     
     
    int main(int argc, char **argv) {
      // Driver: build two n-element inputs, run the kernel on the GPU and
      // then on the CPU, and print the wall-clock seconds for each.
      // Problem size: first command-line argument, default 8.
      int n = 8;
      if (argc >= 2) {
        n = atoi(argv[1]);
        if (n <= 0) {                        // BUG FIX: atoi returns 0 on garbage
          fprintf(stderr, "invalid problem size '%s'\n", argv[1]);
          return 1;
        }
      }
 
      printf( "\n n = %d\n", n);
 
      struct timeval tp1;
      struct timeval tp2;
 
      // Allocate the inputs and a place for the results
      float * a = (float *)malloc(n*sizeof(float));
      float * b = (float *)malloc(n*sizeof(float));
      float * results = (float *)malloc(n*sizeof(float));
      if (a == NULL || b == NULL || results == NULL) {  // BUG FIX: mallocs were unchecked
        fprintf(stderr, "out of memory\n");
        free(a);
        free(b);
        free(results);
        return 1;
      }
 
      // Fill in the values: a[i] = i, b[i] = n - i, so a[i] + b[i] == n
      for (int i = 0; i < n; i++)  {
        a[i] = (float)i;
        b[i] = (float)n-i;
        results[i] = 0.f;
      }
 
      // Time the OpenCL calculation on the GPU, then on the CPU
      // (gettimeofday gives coarse wall-clock seconds; good enough here)
      printf("==========================================================\n");
      printf("   GPU...\n");
      gettimeofday(&tp1, NULL);
      runCL(a, b, results, n, 0); // GPU
      gettimeofday(&tp2, NULL);
      printf( "\n");
      printf( "\n   seconds = %ld\n", tp2.tv_sec-tp1.tv_sec );
 
      printf("---------------------------------------------------------\n");
      printf("   CPU...\n");
      gettimeofday(&tp1, NULL);
      runCL(a, b, results, n, 1); // CPU
      gettimeofday(&tp2, NULL);
      printf( "\n");
      printf( "\n   seconds = %ld\n", tp2.tv_sec-tp1.tv_sec );
      printf("---------------------------------------------------------\n");
 
      // Free up memory
      free(a);
      free(b);
      free(results);
 
      return 0;
    }

    im compiling with
    Code :
    g++-4.2 -Wall -O3 -funroll-loops  -MMD -MF release/main.d -c main.cc -o release/main.o
    g++-4.2 -o release/main ./release/main.o -framework OpenCL

    thank you all a lot. Now i can play with this code.

    bye

  5. #5

    Re: OpenCL on MacBook Pro with NVidia 320m

    thanks for the very thorough follow-up! glad the suggestions helped.

    also, since you're on NVIDIA, you might want to try the loop unrolling extension. It would take way too long to fully unroll the 100000 iterations, but the extension can divide it into N unrolled "chunks". The tradeoff is compiled kernel size and compilation time vs a potential speedup of avoiding conditional statements. GPUs like unrolled kernels, CPUs are optimized for conditionals and sequential loops.

    cl_nv_pragma_unroll documentation

    Code :
    #pragma OPENCL EXTENSION cl_nv_pragma_unroll : enable // not sure if this line is necessary or not, I'm using ATI :P
     
    __kernel void
    add( __global float *a,
         __global float *b,
         __global float *answer)
    {
            int gid = get_global_id(0);
            float valA = a[gid];
            float valB = b[gid];
            float resultValA;
            float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
            float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations
     
       resultValA = valA + valB;
    #pragma unroll 100 //smaller values = shorter compile time and less kernel memory, larger values = longer compile time, more memory, but often much faster
       for (int i = 1; i < 100000; i++) {
                resultValA *= multiplier;
                resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
       }
     
            answer[gid] = resultValA;
    }

  6. #6
    Junior Member
    Join Date
    Jan 2011
    Posts
    3

    Re: OpenCL on MacBook Pro with NVidia 320m

    Hey

    thanks for the tip.

    Kernel 4:
    Code :
    // Benchmark kernel 4: same register-only arithmetic as kernel 3, plus
    // NVIDIA's cl_nv_pragma_unroll hint so the compiler unrolls the loop in
    // chunks of 100 -- trading compile time and kernel size for fewer branch
    // instructions on the GPU.
    __kernel void
    add( __global float *a,
         __global float *b,
         __global float *answer)
    {
            int gid = get_global_id(0);
            float valA = a[gid];
            float valB = b[gid];
            float resultValA;
            float multiplier =  0.46*0.48*6.54*4.21 * (10.56*sin(valA) + 3.47 * cos(valB)*valB*valA);
            float divider = 1.0/multiplier; // or native_recip(multiplier) or half_recip(multiplier) for faster approximations
 
            resultValA = valA + valB;
            #pragma unroll 100 // smaller values = shorter compile time and less kernel memory; larger values = longer compile time, more memory, but often much faster
            for (int i = 1; i < 100000; i++) {
                resultValA *= multiplier;
                resultValA *= divider; // multiplying by a reciprocal is cheaper than dividing
            }
 
            answer[gid] = resultValA;
    }

    It's output:
    n = 200000
    ================================================== ========
    GPU...
    Connecting to NVIDIA GeForce 320M,
    max_compute_units: 6
    max_work_groub_size: 512
    max_work_item_dimensions: 3...


    seconds = 7
    ---------------------------------------------------------
    CPU...
    Connecting to Intel Intel(R) Core(TM)2 Duo CPU P8600 @ 2.40GHz,
    max_compute_units: 2
    max_work_groub_size: 1
    max_work_item_dimensions: 3...


    seconds = 60
    ---------------------------------------------------------

Similar Threads

  1. No WebGL support for MacBook Pro/Radeon X1600?
    By todrobbins in forum User Hardware, Software Help
    Replies: 0
    Last Post: 08-08-2011, 09:59 PM
  2. Replies: 1
    Last Post: 09-29-2009, 10:43 AM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •