Hi, I really need some help with this thing. I would really like some pointers on what to do to make this thing faster. Since the CPU (intel i5 2500K) runs it faster than my radeon 6780 it feels like I haven't taken advantage if Opencl's parallelism or something.

I am thinking it might have something to do that my local work size is 1 or that I need to read back my statesbuffer to my original vector with states (position and velocity) and then resend it again into the kernel which might take a long time on the GPU (?)

I've taken the code from the book opencl programming guide and modified it heavily
Kernel:

Code :
//
 
// Convolution.cl
//
//    This is a simple kernel performing convolution.
 
__kernel void convolve2(
	__global float * states,
	 int antalAtomer,
	 float dt,
	 int l,
	 float lx,
	 float ly,
	int TMAX)
{
    const int x = get_global_id(0);
 
 
 
 
 
 
 
 
 
		if(states[x]<0 || states[x]>lx)
			{
				states[x+antalAtomer*2]=-states[x+antalAtomer*2];
 
			}
		if(states[x+antalAtomer]<0 || states[x+antalAtomer]>ly)
 
			{
 
				states[x+antalAtomer*3]=-states[x+antalAtomer*3];
 
			}
 
		if(l==floor(0.1*TMAX) || l==floor(0.2*TMAX) || l==floor(0.25*TMAX) || l==floor(0.35*TMAX) || l==floor(0.50*TMAX))  // kyl ner skiten så den inte buggar loss
			{
 
			states[x+antalAtomer*2]=0;
			states[x+antalAtomer*3]=0;
			}
 
	states[x] += states[x+antalAtomer*2]*dt;                 // flytta atomerna varannan gång
	states[x+antalAtomer] +=states[x+antalAtomer*3]*dt;
 
	//	states[x] =0;                 // flytta atomerna varannan gång
	//states[x+antalAtomer] =l;
 
 
 
 
 
 
}//end convolve2
 
 
 
__kernel void convolve(
	__global float * states,
	 int antalAtomer,
	 float dt,
	 int l,
	 float lx,
	float ly,
	int TMAX)
{
 
	float a=antalAtomer/2;
 
		 const int x2 = get_global_id(0);
    const int x= x2%antalAtomer;
 
 
 
		int k=(x+1+(int)x2/antalAtomer)%antalAtomer; // 0 om man är i "första"
 
 
                float dx=states[x]-states[k];
   		 float dy=states[antalAtomer+x]-states[antalAtomer+k];
 
  		 float r=sqrt(pow(dx,2)+pow(dy,2));
 
   		float F=12*pow(r,-14)-6*pow(r,-8);
 
 
    			states[x+antalAtomer*2] += F*dx*dt;
			states[x+antalAtomer*3] += F*dy*dt;
 
			states[k+antalAtomer*2] -= F*dx*dt;
			states[k+antalAtomer*3] -= F*dy*dt;
 
 
 
 
 
 
 
} // end convolve



AND the code:

Code :
 
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <istream>
using namespace std;
 
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
 
#if !defined(CL_CALLBACK)
#define CL_CALLBACK
#endif
 
 
///
// Function to check and handle OpenCL errors
inline void 
checkErr(cl_int err, const char * name)
{
    if (err != CL_SUCCESS) {
        std::cerr << "ERROR: " <<  name << " (" << err << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
}
 
void CL_CALLBACK contextCallback(
	const char * errInfo,
	const void * private_info,
	size_t cb,
	void * user_data)
{
	std::cout << "Error occured during context use: " << errInfo << std::endl;
	// should really perform any clearup and so on at this point
	// but for simplicitly just exit.
	exit(1);
}
 
///
//	main() for Convoloution example
//
int main(int argc, char** argv)
{
 
 
const cl_uint antalAtomer=10000;
 float lx=50;
 float ly=50;
//string mystr ("1204");
//cout<<"ange antal atomer, (troligtvis helst jämnt antal) och kanske en multipel av något i GPU/CPU):"<<endl;
//cin>> antalAtomer;
//cout<<"ange lx: "<< endl;
//cin >>lx;
//cout<<"ange ly: "<< endl;
//cin >> ly;
//cout<<" ange filnamnet för datafilen: "<< endl;
//cin>> mystr;
 
	ofstream statesText("states.txt");
float states[antalAtomer*4];  //hela stateslistan
 
 
 
    cl_int errNum;
    cl_uint numPlatforms;
	cl_uint numDevices;
    cl_platform_id * platformIDs;
	cl_device_id * deviceIDs;
    cl_context context = NULL;
	cl_command_queue queue;
	cl_command_queue queue2;
	cl_program program;
	cl_kernel kernel;
	cl_kernel kernel2;
 
	cl_mem statesBuffer;  // buffer för tillstånden..? 
 
	cl_uint TMAX=50;
	int avskal=1;
	float dt=0.001;
 
	float a=antalAtomer/2;
	cout<< "dt= : "<<dt<<endl;
 
 
		statesText<<antalAtomer<<" "<<lx<<" "<<ly<<" "<<TMAX<<" "<<dt<<" ";
 for ( int l=0;l<2*antalAtomer-5;l++)
    {
      statesText<<1<<" ";
    }
 statesText<<"\n";
 
  for(cl_uint i=0; i<antalAtomer; i++)  //gör en initialposition
    {
 
      int atomRot=(int)ceil(sqrt(antalAtomer));
      //states[i]=0;               //x-position
      //states[i+antalAtomer]=0;   //y-position
 
	  states[i]=(float)lx/(atomRot-1)*(i-atomRot*((int)i/(atomRot)));               //x-position
      states[i+antalAtomer]=(float)ly/(atomRot-1)*((int)(i/(atomRot)));   //y-position
 
 
 
	  states[i+antalAtomer*2]=0; // 0 i Vx
	  states[i+antalAtomer*3]=0; // 0 i Vy
 
    }
 
 
	   for(int i=0; i<antalAtomer; i++)
    {
      statesText<<states[i]<< " " << states[i+antalAtomer]<< " ";
    }
 
	statesText<<endl;
 
 
 
      // First, select an OpenCL platform to run on.  
	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkErr( 
		(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
		"clGetPlatformIDs"); 
 
	platformIDs = (cl_platform_id *)alloca(
       		sizeof(cl_platform_id) * numPlatforms);
 
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr( 
	   (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
	   "clGetPlatformIDs");
 
	// Iterate through the list of platforms until we find one that supports
	// a CPU device, otherwise fail with an error.
	deviceIDs = NULL;
	cl_uint i;
	for (i = 0; i < numPlatforms; i++)
	{
		errNum = clGetDeviceIDs(
            platformIDs[i], 
            CL_DEVICE_TYPE_GPU, 
            0,
            NULL,
            &numDevices);
		if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
	    {
			checkErr(errNum, "clGetDeviceIDs");
        }
	    else if (numDevices > 0) 
		{
		   	deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
			errNum = clGetDeviceIDs(
				platformIDs[i],
				CL_DEVICE_TYPE_GPU,
				numDevices, 
				&deviceIDs[0], 
				NULL);
			checkErr(errNum, "clGetDeviceIDs");
			break;
	   }
	}
 
	// Check to see if we found at least one CPU device, otherwise return
	if (deviceIDs == NULL) {
		std::cout << "No CPU device found" << std::endl;
		exit(-1);
	}
 
    // Next, create an OpenCL context on the selected platform.  
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[i],
        0
    };
    context = clCreateContext(
		contextProperties, 
		numDevices,
        deviceIDs, 
		&contextCallback,
		NULL, 
		&errNum);
	checkErr(errNum, "clCreateContext");
 
	std::ifstream srcFile("Convolution.cl");
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");
 
	std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));
 
	const char * src = srcProg.c_str();
	size_t length = srcProg.length();
 
	// Create program from source
	program = clCreateProgramWithSource(
		context, 
		1, 
		&src, 
		&length, 
		&errNum);
	checkErr(errNum, "clCreateProgramWithSource");
 
	// Build program
	errNum = clBuildProgram(
		program,
		numDevices,
		deviceIDs,
		NULL,
		NULL,
		NULL);
    if (errNum != CL_SUCCESS)
    {
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
			program, 
			deviceIDs[0], 
			CL_PROGRAM_BUILD_LOG,
            sizeof(buildLog), 
			buildLog, 
			NULL);
 
        std::cerr << "Error in kernel: " << std::endl;
        std::cerr << buildLog;
		checkErr(errNum, "clBuildProgram");
    }
	// Create kernel object
	kernel = clCreateKernel(
		program,
		"convolve",
		&errNum);
	checkErr(errNum, "clCreateKernel");
 
 
		kernel2 = clCreateKernel(
		program,
		"convolve2",
		&errNum);
	checkErr(errNum, "clCreateKernel");
 
 
 
	//BUFFER FÖR TILLSTÅNDEN **************** :D :D :D :PPPPPPPPPPPPPPppPpPPp
		statesBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
		sizeof(float) * antalAtomer * 4,
		static_cast<void *>(states),
		&errNum);
	checkErr(errNum, "clCreateBuffer(statesBuffer)");
 
 
 
	// Pick the first device and create command queue.
	queue = clCreateCommandQueue(
		context,
		deviceIDs[0],
		0,
		&errNum);
	checkErr(errNum, "clCreateCommandQueue");
 
 
 
				for(int l=0; l<TMAX; l++) // MAiN LO0000000000000000000000000000000000000000OP
	{
 
    errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &statesBuffer);  //se
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), &antalAtomer); // skicka in antalatomer
	errNum |= clSetKernelArg(kernel, 2, sizeof(float), &dt);
	errNum |= clSetKernelArg(kernel, 3, sizeof(int), &l);
	errNum |= clSetKernelArg(kernel, 4, sizeof(float), &lx);
	errNum |= clSetKernelArg(kernel, 5, sizeof(float), &ly);
	errNum |= clSetKernelArg(kernel, 6, sizeof(cl_uint), &TMAX);
 
	checkErr(errNum, "clSetKernelArg");
 
	const size_t globalWorkSize[1] = { antalAtomer*(antalAtomer/2)};
    const size_t localWorkSize[1]  = { 1 };
 
	cl_event event;
 
    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel, 
		1, 
		NULL,
        globalWorkSize, 
		localWorkSize,
        0, 
		NULL, 
		&event);
 
	//clWaitForEvents(1, &event);  //väntar så man inte börjar med annan skit innan detta är klart
 
 
	checkErr(errNum, "clEnqueueNDRangeKernel");
 
 
	errNum = clEnqueueReadBuffer(
		queue, 
		statesBuffer, 
		CL_TRUE,
        0, 
		sizeof(float) * 4 * antalAtomer, 
		states,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueReadBuffer");
 
 
 
 
	//FÖRSTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
	// FÖRSTA GREJEN TYP RÄKNA KRAFT ******************************************DDDDDDDDDDDDSSSS
 
 
 
 
	    errNum  = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &statesBuffer);  //se
	errNum |= clSetKernelArg(kernel2, 1, sizeof(cl_uint), &antalAtomer); // skicka in antalatomer
	errNum |= clSetKernelArg(kernel2, 2, sizeof(float), &dt);
	errNum |= clSetKernelArg(kernel2, 3, sizeof(int), &l);
	errNum |= clSetKernelArg(kernel2, 4, sizeof(float), &lx);
	errNum |= clSetKernelArg(kernel2, 5, sizeof(float), &ly);
	errNum |= clSetKernelArg(kernel2, 6, sizeof(cl_uint), &TMAX);
 
 
	checkErr(errNum, "clSetKernelArg");
 
	const size_t globalWorkSize2[1] = { antalAtomer};
    const size_t localWorkSize2[1]  = { 1 };
 
 
    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel2, 
		1, 
		NULL,
        globalWorkSize2, 
		localWorkSize2,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueNDRangeKernel");
 
	errNum = clEnqueueReadBuffer(
		queue, 
		statesBuffer, 
		CL_TRUE,
        0, 
		sizeof(float) * 4 * antalAtomer, 
		states,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueReadBuffer");
 
 
 
 
 
	 if(l%avskal==0)
	{
 
      for(int k=0; k<antalAtomer; k++)
	{
	  statesText<<states[k]<< " " << states[k+antalAtomer]<<" ";
 
	}
      statesText<<"\n";
      cout<<" klarhet "<< (double) 100*l/TMAX<<"\n";
	}
 
 
	}
    std::cout << std::endl << "Executed program succesfully." << std::endl;
 
 
 
 
	return 0;
}