Hey all

I'm trying to set up an application that does some calculations on my video card. The problem is that my CPU is much faster than the GPU.

When I start the program, I get the following message:
Connecting to NVIDIA GeForce 320M,
max_compute_units: 6
max_work_groub_size: 512
max_work_item_dimensions: 3
Does this work on any of your systems? If so, where is my mistake?

The Kernel:
/*
 * add - synthetic arithmetic workload, one array element per work-item.
 *
 * Each work-item starts from a[gid] + b[gid] and then multiplies and divides
 * that value by the same per-element factor twelve times in a row.
 *
 * a, b    input arrays (only read)
 * answer  output array; element gid is written exactly once
 *
 * There is no bounds check: the host must enqueue exactly one work-item per
 * array element.
 *
 * Differences from the fully unrolled original:
 *  - All literals carry an 'f' suffix so the arithmetic is single precision
 *    throughout.  Unsuffixed literals are double precision, which devices
 *    without fp64 support cannot execute natively.
 *  - a[gid] and b[gid] are loaded once and the running value lives in a
 *    private variable.  Writing through answer[gid] on every step forced the
 *    inputs to be re-read from global memory each time, because the compiler
 *    cannot prove that answer does not alias a or b.
 */
__kernel void
add(__global const float *a,
    __global const float *b,
    __global float *answer)
{
    const int gid = get_global_id(0);

    /* Single global-memory read per input. */
    const float x = a[gid];
    const float y = b[gid];

    /* The factor depends only on the inputs, so compute it once. */
    const float factor = 0.46f * 0.48f * 6.54f * 4.21f
                       * (10.56f * sin(x) + 3.47f * cos(y) * y * x);

    float acc = x + y;

    /* Twelve multiply/divide round trips, matching the original statement
     * count.  A zero factor yields NaN here, exactly as it did before. */
    for (int round = 0; round < 12; ++round) {
        acc *= factor;
        acc /= factor;
    }

    /* Single global-memory write. */
    answer[gid] = acc;
}
My main.c:
#ifdef __APPLE__
#include <OpenGL/OpenGL.h>
#include <GLUT/glut.h>
//#include <OpenGL/glu.h>
#else
#include <GL/glut.h>
//#include <GL/glu.h>
#endif

#include <OpenCL/OpenCL.h>
#include <iostream>
#include <assert.h>
#include <sys/sysctl.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>


#pragma mark -
#pragma mark Utilities
/*
 * Read a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * filename  path of the file to load
 *
 * Returns a malloc'd string that the caller must free(), or NULL when the
 * file cannot be opened, measured or read, or when memory is exhausted.
 *
 * The file handle is always closed before returning, and the terminator is
 * placed after the bytes that were actually read rather than at the size
 * reported up front.
 */
char * load_program_source(const char *filename)
{
    FILE *fh = NULL;
    char *source = NULL;
    long length = 0;
    size_t bytes_read = 0;
    int ok = 0;

    if (filename == NULL)
        return NULL;

    /* Binary mode keeps the byte count consistent with the measured size. */
    fh = fopen(filename, "rb");
    if (fh == NULL)
        return NULL;

    /* Measure the file by seeking to its end. */
    if (fseek(fh, 0L, SEEK_END) == 0) {
        length = ftell(fh);
        if (length >= 0 && fseek(fh, 0L, SEEK_SET) == 0) {
            /* The cast keeps this valid when compiled as C++. */
            source = (char *) malloc((size_t) length + 1);
            if (source != NULL) {
                bytes_read = fread(source, 1, (size_t) length, fh);
                if (!ferror(fh)) {
                    source[bytes_read] = '\0';
                    ok = 1;
                }
            }
        }
    }

    fclose(fh);

    if (!ok) {
        free(source);
        return NULL;
    }
    return source;
}

#pragma mark -
#pragma mark Main OpenCL Routine
int runCL(float * a, float * b, float * results, int n)
{
cl_program program[1];
cl_kernel kernel[1];

cl_command_queue cmd_queue;
cl_context context;

cl_device_id cpu = NULL, device = NULL;

cl_int err = 0;
size_t returned_size = 0;
size_t buffer_size;

cl_mem a_mem, b_mem, ans_mem;

#pragma mark Device Information
{
// Find the CPU CL device, as a fallback
//26:00
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
assert(err == CL_SUCCESS);

// Find the GPU CL device, this is what we really want
// If there is no GPU device is CL capable, fall back to CPU
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
//if (err != CL_SUCCESS)
device = cpu;
assert(device);

// Get some information about the returned device
cl_char vendor_name[1024] = {0};
cl_char device_name[1024] = {0};
cl_uint max_compute_units = 0;
size_t max_work_groub_size = 0;
cl_uint max_work_item_dimensions = 0;


//27:00
err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name),
vendor_name, &returned_size);

err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
device_name, &returned_size);

err |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units),
&max_compute_units, &returned_size);

err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_groub_size),
&max_work_groub_size, &returned_size);

err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions),
&max_work_item_dimensions, &returned_size);




assert(err == CL_SUCCESS);
printf("Connecting to %s %s, \nmax_compute_units: %d\nmax_work_groub_size: %zu \nmax_work_item_dimensions: %d...\n", vendor_name, device_name, max_compute_units, max_work_groub_size, max_work_item_dimensions);



}


#pragma mark Context and Command Queue
{
// Now create a context to perform our calculation with the
// specified device
context = clCreateContext(0, 1, &device, NULL, NULL, &err);
assert(err == CL_SUCCESS);

// And also a command queue for the context
cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
}


#pragma mark Program and Kernel Creation
{
// Load the program source from disk
// The kernel/program is the project directory and in Xcode the executable
// is set to launch from that directory hence we use a relative path
const char * filename = "example.cl";
char *program_source = load_program_source(filename);
program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
NULL, &err);

assert(err == CL_SUCCESS);

// 28:40
err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
assert(err == CL_SUCCESS);

// Now create the kernel "objects" that we want to use in the example file
kernel[0] = clCreateKernel(program[0], "add", &err);
}


#pragma mark Memory Allocation
{

// Allocate memory on the device to hold our data and store the results into
buffer_size = sizeof(float) * n;

// Input array a
//30:10
a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);

//32:20
err = clEnqueueWriteBuffer(cmd_queue, a_mem, CL_TRUE, 0, buffer_size,
(void*)a, 0, NULL, NULL);

// Input array b
b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);

err |= clEnqueueWriteBuffer(cmd_queue, b_mem, CL_TRUE, 0, buffer_size,
(void*)b, 0, NULL, NULL);

assert(err == CL_SUCCESS);

// Results array
ans_mem= clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);

// Get all of the stuff written and allocated
clFinish(cmd_queue);
}


#pragma mark Kernel Arguments
{

// Now setup the arguments to our kernel
//33:48
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &a_mem);
err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &b_mem);
err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &ans_mem);

assert(err == CL_SUCCESS);

}


#pragma mark Execution and Read
{

// Run the calculation by enqueuing it and forcing the
// command queue to complete the task
size_t global_work_size = n;
//33:59
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL,
&global_work_size, NULL, 0, NULL, NULL);


assert(err == CL_SUCCESS);
clFinish(cmd_queue);

// Once finished read back the results from the answer
// array into the results array
//35:35
err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, buffer_size,
results, 0, NULL, NULL);

assert(err == CL_SUCCESS);
clFinish(cmd_queue);

}


#pragma mark Teardown
{
clReleaseMemObject(a_mem);
clReleaseMemObject(b_mem);
clReleaseMemObject(ans_mem);

clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
}
return CL_SUCCESS;
}



int main(int argc, char **argv) {
// Problem size
// int n = 2048*16*16*16*4;
int n = 40;

// Allocate some memory and a place for the results
float * a = (float *)malloc(n*sizeof(float));
float * b = (float *)malloc(n*sizeof(float));
float * results = (float *)malloc(n*sizeof(float));

// Fill in the values
for(int i=0;i<n;i++) {
a[i] = (float)i;
b[i] = (float)n-i;
results[i] = 0.f;
}

// Do the OpenCL calculation
runCL(a, b, results, n);

// Print out some results.
// for(int i=0;i<n;i++)
//if (i+1 != results[i])
// printf("%f\n",results[i]);

printf("%f\n",results[n-1]);


// Free up memory
free(a);
free(b);
free(results);

return 0;
}
Thank you.