Thread: Slow MD-program. My cpu runs it faster than my GPU

  1. #1
    Junior Member
    Join Date
    Jun 2012
    Posts
    7

    Slow MD-program. My cpu runs it faster than my GPU

    Hi, I really need some help with this. I would really like some pointers on what to do to make this thing faster. Since the CPU (Intel i5 2500K) runs it faster than my Radeon 6780, it feels like I haven't taken advantage of OpenCL's parallelism or something.

    I am thinking it might have something to do with my local work size being 1, or with the fact that I read my statesBuffer back into my original vector of states (positions and velocities) and then send it back into the kernel again, which might take a long time on the GPU (?)

    I've taken the code from the book OpenCL Programming Guide and modified it heavily.
    Kernel:

    Code :
    //
    // Convolution.cl
    //
    // Originally the convolution sample from the OpenCL Programming Guide,
    // heavily modified into a simple 2D molecular dynamics step.
    // The states array is laid out as [x positions | y positions | x velocities | y velocities],
    // each block antalAtomer entries long.
 
    // convolve2: bounce atoms off the box walls, periodically cool them down,
    // and integrate the positions using the velocities.
    __kernel void convolve2(
    	__global float * states,
    	 int antalAtomer,
    	 float dt,
    	 int l,
    	 float lx,
    	 float ly,
    	int TMAX)
    {
        const int x = get_global_id(0);
 
    	// Bounce atoms off the box walls by flipping the velocity component.
    	if(states[x]<0 || states[x]>lx)
    	{
    		states[x+antalAtomer*2] = -states[x+antalAtomer*2];
    	}
    	if(states[x+antalAtomer]<0 || states[x+antalAtomer]>ly)
    	{
    		states[x+antalAtomer*3] = -states[x+antalAtomer*3];
    	}
     
    	// Cool the system down at a few fixed fractions of the run so it doesn't blow up.
    	if(l==floor(0.1*TMAX) || l==floor(0.2*TMAX) || l==floor(0.25*TMAX) || l==floor(0.35*TMAX) || l==floor(0.50*TMAX))
    	{
    		states[x+antalAtomer*2]=0;
    		states[x+antalAtomer*3]=0;
    	}
     
    	// Integrate the positions with the velocities.
    	states[x] += states[x+antalAtomer*2]*dt;
    	states[x+antalAtomer] += states[x+antalAtomer*3]*dt;
 
    	//states[x] = 0;
    	//states[x+antalAtomer] = l;
 
    }//end convolve2
     
     
     
    // convolve: compute the pair force between atom x and one partner atom k,
    // and update both atoms' velocities.
    __kernel void convolve(
    	__global float * states,
    	 int antalAtomer,
    	 float dt,
    	 int l,
    	 float lx,
    	float ly,
    	int TMAX)
    {
     
    	float a=antalAtomer/2;
 
    	const int x2 = get_global_id(0);
    	const int x = x2%antalAtomer;
 
    	// Pick the partner atom k for this work item; 0 if we are in the "first" one.
    	int k=(x+1+(int)x2/antalAtomer)%antalAtomer;
     
     
    	float dx=states[x]-states[k];
    	float dy=states[antalAtomer+x]-states[antalAtomer+k];
 
    	float r=sqrt(pow(dx,2)+pow(dy,2));
 
    	// Lennard-Jones-style pair force.
    	float F=12*pow(r,-14)-6*pow(r,-8);
 
    	// Apply the force to atom x and the opposite force to atom k.
    	states[x+antalAtomer*2] += F*dx*dt;
    	states[x+antalAtomer*3] += F*dy*dt;
 
    	states[k+antalAtomer*2] -= F*dx*dt;
    	states[k+antalAtomer*3] -= F*dy*dt;
 
    } // end convolve



    And the host code:

    Code :
     
    #include <iostream>
    #include <fstream>
    #include <sstream>
    #include <string>
    #include <istream>
    #include <cmath>     // ceil, sqrt
    using namespace std;
     
    #ifdef __APPLE__
    #include <OpenCL/cl.h>
    #else
    #include <CL/cl.h>
    #endif
     
    #if !defined(CL_CALLBACK)
    #define CL_CALLBACK
    #endif
     
     
    ///
    // Function to check and handle OpenCL errors
    inline void 
    checkErr(cl_int err, const char * name)
    {
        if (err != CL_SUCCESS) {
            std::cerr << "ERROR: " <<  name << " (" << err << ")" << std::endl;
            exit(EXIT_FAILURE);
        }
    }
     
    void CL_CALLBACK contextCallback(
    	const char * errInfo,
    	const void * private_info,
    	size_t cb,
    	void * user_data)
    {
    	std::cout << "Error occured during context use: " << errInfo << std::endl;
    	// should really perform any clearup and so on at this point
    	// but for simplicitly just exit.
    	exit(1);
    }
     
    ///
    //	main() for the MD example (based on the book's Convolution example)
    //
    int main(int argc, char** argv)
    {
     
     
    const cl_uint antalAtomer=10000;
     float lx=50;
     float ly=50;
    //string mystr ("1204");
    //cout<<"ange antal atomer, (troligtvis helst jämnt antal) och kanske en multipel av något i GPU/CPU):"<<endl;
    //cin>> antalAtomer;
    //cout<<"ange lx: "<< endl;
    //cin >>lx;
    //cout<<"ange ly: "<< endl;
    //cin >> ly;
    //cout<<" ange filnamnet för datafilen: "<< endl;
    //cin>> mystr;
     
    	ofstream statesText("states.txt");
    float states[antalAtomer*4];  // the whole state array: x, y, vx, vy
     
     
     
        cl_int errNum;
        cl_uint numPlatforms;
    	cl_uint numDevices;
        cl_platform_id * platformIDs;
    	cl_device_id * deviceIDs;
        cl_context context = NULL;
    	cl_command_queue queue;
    	cl_command_queue queue2;
    	cl_program program;
    	cl_kernel kernel;
    	cl_kernel kernel2;
     
    	cl_mem statesBuffer;  // buffer for the states
     
    	cl_uint TMAX=50;
    	int avskal=1;
    	float dt=0.001;
     
    	float a=antalAtomer/2;
    	cout<< "dt= : "<<dt<<endl;
     
     
    		statesText<<antalAtomer<<" "<<lx<<" "<<ly<<" "<<TMAX<<" "<<dt<<" ";
     for ( int l=0;l<2*antalAtomer-5;l++)
        {
          statesText<<1<<" ";
        }
     statesText<<"\n";
     
      for(cl_uint i=0; i<antalAtomer; i++)  // set up the initial positions (a regular grid)
        {
     
          int atomRot=(int)ceil(sqrt(antalAtomer));
          //states[i]=0;               //x-position
          //states[i+antalAtomer]=0;   //y-position
     
    	  states[i]=(float)lx/(atomRot-1)*(i-atomRot*((int)i/(atomRot)));               //x-position
          states[i+antalAtomer]=(float)ly/(atomRot-1)*((int)(i/(atomRot)));   //y-position
     
     
     
    	  states[i+antalAtomer*2]=0; // initial Vx = 0
    	  states[i+antalAtomer*3]=0; // initial Vy = 0
     
        }
     
     
    	   for(int i=0; i<antalAtomer; i++)
        {
          statesText<<states[i]<< " " << states[i+antalAtomer]<< " ";
        }
     
    	statesText<<endl;
     
     
     
          // First, select an OpenCL platform to run on.  
    	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    	checkErr( 
    		(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
    		"clGetPlatformIDs"); 
     
    	platformIDs = (cl_platform_id *)alloca(
           		sizeof(cl_platform_id) * numPlatforms);
     
        errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
        checkErr( 
    	   (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
    	   "clGetPlatformIDs");
     
    	// Iterate through the list of platforms until we find one that supports
    	// a GPU device, otherwise fail with an error.
    	deviceIDs = NULL;
    	cl_uint i;
    	for (i = 0; i < numPlatforms; i++)
    	{
    		errNum = clGetDeviceIDs(
                platformIDs[i], 
                CL_DEVICE_TYPE_GPU, 
                0,
                NULL,
                &numDevices);
    		if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
    	    {
    			checkErr(errNum, "clGetDeviceIDs");
            }
    	    else if (numDevices > 0) 
    		{
    		   	deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
    			errNum = clGetDeviceIDs(
    				platformIDs[i],
    				CL_DEVICE_TYPE_GPU,
    				numDevices, 
    				&deviceIDs[0], 
    				NULL);
    			checkErr(errNum, "clGetDeviceIDs");
    			break;
    	   }
    	}
     
    	// Check to see if we found at least one GPU device, otherwise return
    	if (deviceIDs == NULL) {
    		std::cout << "No GPU device found" << std::endl;
    		exit(-1);
    	}
     
        // Next, create an OpenCL context on the selected platform.  
        cl_context_properties contextProperties[] =
        {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)platformIDs[i],
            0
        };
        context = clCreateContext(
    		contextProperties, 
    		numDevices,
            deviceIDs, 
    		&contextCallback,
    		NULL, 
    		&errNum);
    	checkErr(errNum, "clCreateContext");
     
    	std::ifstream srcFile("Convolution.cl");
        checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");
     
    	std::string srcProg(
            std::istreambuf_iterator<char>(srcFile),
            (std::istreambuf_iterator<char>()));
     
    	const char * src = srcProg.c_str();
    	size_t length = srcProg.length();
     
    	// Create program from source
    	program = clCreateProgramWithSource(
    		context, 
    		1, 
    		&src, 
    		&length, 
    		&errNum);
    	checkErr(errNum, "clCreateProgramWithSource");
     
    	// Build program
    	errNum = clBuildProgram(
    		program,
    		numDevices,
    		deviceIDs,
    		NULL,
    		NULL,
    		NULL);
        if (errNum != CL_SUCCESS)
        {
            // Determine the reason for the error
            char buildLog[16384];
            clGetProgramBuildInfo(
    			program, 
    			deviceIDs[0], 
    			CL_PROGRAM_BUILD_LOG,
                sizeof(buildLog), 
    			buildLog, 
    			NULL);
     
            std::cerr << "Error in kernel: " << std::endl;
            std::cerr << buildLog;
    		checkErr(errNum, "clBuildProgram");
        }
    	// Create kernel object
    	kernel = clCreateKernel(
    		program,
    		"convolve",
    		&errNum);
    	checkErr(errNum, "clCreateKernel");
     
     
    		kernel2 = clCreateKernel(
    		program,
    		"convolve2",
    		&errNum);
    	checkErr(errNum, "clCreateKernel");
     
     
     
    	// Buffer for the states.
    		statesBuffer = clCreateBuffer(
    		context,
    		CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
    		sizeof(float) * antalAtomer * 4,
    		static_cast<void *>(states),
    		&errNum);
    	checkErr(errNum, "clCreateBuffer(statesBuffer)");
     
     
     
    	// Pick the first device and create command queue.
    	queue = clCreateCommandQueue(
    		context,
    		deviceIDs[0],
    		0,
    		&errNum);
    	checkErr(errNum, "clCreateCommandQueue");
     
     
     
    	for(int l=0; l<TMAX; l++) // main loop
    	{
     
        errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &statesBuffer);
    	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), &antalAtomer); // pass in the number of atoms
    	errNum |= clSetKernelArg(kernel, 2, sizeof(float), &dt);
    	errNum |= clSetKernelArg(kernel, 3, sizeof(int), &l);
    	errNum |= clSetKernelArg(kernel, 4, sizeof(float), &lx);
    	errNum |= clSetKernelArg(kernel, 5, sizeof(float), &ly);
    	errNum |= clSetKernelArg(kernel, 6, sizeof(cl_uint), &TMAX);
     
    	checkErr(errNum, "clSetKernelArg");
     
    	const size_t globalWorkSize[1] = { antalAtomer*(antalAtomer/2)};
        const size_t localWorkSize[1]  = { 1 };
     
    	cl_event event;
     
        // Queue the kernel up for execution across the array
        errNum = clEnqueueNDRangeKernel(
    		queue, 
    		kernel, 
    		1, 
    		NULL,
            globalWorkSize, 
    		localWorkSize,
            0, 
    		NULL, 
    		&event);
     
    	//clWaitForEvents(1, &event);  // wait so nothing else starts before this is done
     
     
    	checkErr(errNum, "clEnqueueNDRangeKernel");
     
     
    	errNum = clEnqueueReadBuffer(
    		queue, 
    		statesBuffer, 
    		CL_TRUE,
            0, 
    		sizeof(float) * 4 * antalAtomer, 
    		states,
            0, 
    		NULL, 
    		NULL);
    	checkErr(errNum, "clEnqueueReadBuffer");
     
     
     
     
    	// Second kernel: move the atoms.
 
    	errNum  = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &statesBuffer);
    	errNum |= clSetKernelArg(kernel2, 1, sizeof(cl_uint), &antalAtomer); // pass in the number of atoms
    	errNum |= clSetKernelArg(kernel2, 2, sizeof(float), &dt);
    	errNum |= clSetKernelArg(kernel2, 3, sizeof(int), &l);
    	errNum |= clSetKernelArg(kernel2, 4, sizeof(float), &lx);
    	errNum |= clSetKernelArg(kernel2, 5, sizeof(float), &ly);
    	errNum |= clSetKernelArg(kernel2, 6, sizeof(cl_uint), &TMAX);
     
     
    	checkErr(errNum, "clSetKernelArg");
     
    	const size_t globalWorkSize2[1] = { antalAtomer};
        const size_t localWorkSize2[1]  = { 1 };
     
     
        // Queue the kernel up for execution across the array
        errNum = clEnqueueNDRangeKernel(
    		queue, 
    		kernel2, 
    		1, 
    		NULL,
            globalWorkSize2, 
    		localWorkSize2,
            0, 
    		NULL, 
    		NULL);
    	checkErr(errNum, "clEnqueueNDRangeKernel");
     
    	errNum = clEnqueueReadBuffer(
    		queue, 
    		statesBuffer, 
    		CL_TRUE,
            0, 
    		sizeof(float) * 4 * antalAtomer, 
    		states,
            0, 
    		NULL, 
    		NULL);
    	checkErr(errNum, "clEnqueueReadBuffer");
     
     
     
     
     
    	 if(l%avskal==0)
    	{
     
          for(int k=0; k<antalAtomer; k++)
    	{
    	  statesText<<states[k]<< " " << states[k+antalAtomer]<<" ";
     
    	}
          statesText<<"\n";
          cout<<" klarhet "<< (double) 100*l/TMAX<<"\n";
    	}
     
     
    	}
        std::cout << std::endl << "Executed program successfully." << std::endl;
     
     
     
     
    	return 0;
    }
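
    One thing I am still wondering about the readbacks: both kernels run on the same in-order command queue and work on the same statesBuffer, so maybe I could skip the blocking clEnqueueReadBuffer after each kernel and only read the buffer back when I actually print. Something like this (just an untested sketch of the idea, reusing the variables from the code above):

    Code :
    	const size_t globalWorkSize[1]  = { antalAtomer*(antalAtomer/2) };
    	const size_t globalWorkSize2[1] = { antalAtomer };
    	const size_t localWorkSize[1]   = { 1 };
 
    	for(int l=0; l<TMAX; l++)
    	{
    		errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &statesBuffer);
    		// ... set the remaining arguments of kernel as in the loop above ...
    		errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
    			globalWorkSize, localWorkSize, 0, NULL, NULL);
    		checkErr(errNum, "clEnqueueNDRangeKernel");
 
    		errNum = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &statesBuffer);
    		// ... set the remaining arguments of kernel2 as in the loop above ...
    		errNum = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL,
    			globalWorkSize2, localWorkSize, 0, NULL, NULL);
    		checkErr(errNum, "clEnqueueNDRangeKernel");
 
    		if(l%avskal==0)
    		{
    			// Blocking read only when the data is actually needed on the host.
    			errNum = clEnqueueReadBuffer(queue, statesBuffer, CL_TRUE, 0,
    				sizeof(float) * 4 * antalAtomer, states, 0, NULL, NULL);
    			checkErr(errNum, "clEnqueueReadBuffer");
    			// ... write states to statesText as before ...
    		}
    	}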

  2. #2
    Junior Member
    Join Date
    Jun 2012
    Posts
    7

    Re: Slow MD-program. My cpu runs it faster than my GPU

    When I change the local work size from 1 to a higher number, e.g. 64, my program finishes about 20 times faster, but it no longer calculates correctly (it diverges, and it looks as if the forces are not calculated over all the atoms).

    This confuses me greatly, since the number of work items is the same (right?) and get_global_id(0) should have the same range (right?), so the result should not change between a local work size of 1 and 64.
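
    To check whether I understand what could go wrong, here is a toy kernel I made up (the name is just for illustration): if many work items do a plain read-modify-write on the same global element, can updates get lost when the work items actually run concurrently, even though the same code gives the right answer when they run one at a time?

    Code :
    	// Toy example (made up for this question): every work item increments
    	// the same counter with a plain, non-atomic read-modify-write.
    	__kernel void lost_update(__global float * counter)
    	{
    		counter[0] += 1.0f;
    	}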

  3. #3
    Junior Member
    Join Date
    Jun 2012
    Posts
    7

    Re: Slow MD-program. My cpu runs it faster than my GPU

    Also, when I run it on the CPU only, I get the right result regardless of what the local work size is...

    (btw, how do you edit your posts?)

  4. #4
    Junior Member
    Join Date
    Jun 2012
    Posts
    7

    Re: Slow MD-program. My cpu runs it faster than my GPU

    Since the program gives erroneous results on the GPU only, there must be something wrong with the concurrency. I thought that maybe OpenCL started moving atoms before I had finished calculating their change in velocity in the first kernel.
    (I have two kernels, one that calculates the change in velocity and one that moves the atoms, and then I loop through that many times.)

    I figured I had to add events between clEnqueueNDRangeKernel(kernel, ...) -> clEnqueueReadBuffer, clEnqueueNDRangeKernel(kernel2, ...) -> clEnqueueReadBuffer (and then repeat).
    However, this doesn't work either. What COULD be wrong with this?

    I was also wondering: since I am looping over all of this, will the events already be marked as complete after the first iteration, so that they only have an effect in the first loop? This is a part of my code:


    Code :
    	for(int l=0; l<TMAX; l++) // Main loop
    	{
     
        errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &statesBuffer);  //se
          .
          .
          .
    	errNum |= clSetKernelArg(kernel, 7, sizeof(float), &a);
     
    	checkErr(errNum, "clSetKernelArg");
     
    	const size_t globalWorkSize[1] = { antalAtomer*(antalAtomer/2)};
        const size_t localWorkSize[1]  = { ls };
     
    	cl_event event;
     
        // Queue the kernel up for execution across the array
        errNum = clEnqueueNDRangeKernel(
    		queue, 
    		kernel, 
    		1, 
    		NULL,
            globalWorkSize, 
    		localWorkSize,
            0, 
    		NULL, 
    		&event);
     
    	clWaitForEvents(1, &event);  //Waits so you finish up this part first
     
     
    	checkErr(errNum, "clEnqueueNDRangeKernel");
     
    	cl_event eventRead; //waits so you finish reading buffer to states so you can send it back to kernel2
     
    	errNum = clEnqueueReadBuffer(
    		queue, 
    		statesBuffer, 
    		CL_TRUE,
            0, 
    		sizeof(float) * 4 * antalAtomer, 
    		states,
            1, 
    		&event, 
    		&eventRead);
    	checkErr(errNum, "clEnqueueReadBuffer");
     
     
     
     
    	//now begin with kernel2, moving the atoms
     
     
     
     
    	    errNum  = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &statesBuffer);  //se
    	.
            .
             .
    	errNum |= clSetKernelArg(kernel2, 6, sizeof(cl_uint), &TMAX);
     
     
    	checkErr(errNum, "clSetKernelArg");
     
    	const size_t globalWorkSize2[1] = { antalAtomer};
        const size_t localWorkSize2[1]  = { ls };
     
    	cl_event event2;
     
        // Queue the kernel up for execution across the array
        errNum = clEnqueueNDRangeKernel(
    		queue, 
    		kernel2, 
    		1, 
    		NULL,
            globalWorkSize2, 
    		localWorkSize2,
            1, 
    		&eventRead,   //eventread from buffer read (maybe a bit excessive)
    		&event2);
    	clWaitForEvents(1, &event2);  //Waits so you finish up
    	checkErr(errNum, "clEnqueueNDRangeKernel");
     
    	cl_event eventRead2; 
    	errNum = clEnqueueReadBuffer(
    		queue, 
    		statesBuffer, 
    		CL_TRUE,
            0, 
    		sizeof(float) * 4 * antalAtomer, 
    		states,
            1, 
    		&event2,  // doesn't start reading the buffer before kernel2 is finished
    		&eventRead2); 
     
    	clWaitForEvents(1, &eventRead2);
    	checkErr(errNum, "clEnqueueReadBuffer"); 
     
     
     
     
     
    	 if(l%avskal==0)
    	{
     
          for(int k=0; k<antalAtomer; k++)
    	{
    	  statesText<<states[k]<< " " << states[k+antalAtomer]<<" "; // prints the coordinates to a file
     
    	}
          statesText<<"\n";
          cout<<" klarhet "<< (double) 100*l/TMAX<<"\n";  // gives the perentage finished
    	}
     
     
    	}
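
    One more thing I noticed: I create new events on every iteration and never release them. Maybe I should at least release them at the end of each pass so they don't pile up, something like this (untested); and if I understand the spec correctly, a queue created without CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE executes commands in order anyway, so maybe the events aren't even needed for the ordering:

    Code :
    	// Untested: release the per-iteration events at the end of each loop pass
    	// so they are not leaked (new ones are created every iteration).
    	clReleaseEvent(event);
    	clReleaseEvent(eventRead);
    	clReleaseEvent(event2);
    	clReleaseEvent(eventRead2);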

  5. #5
    Junior Member
    Join Date
    Jun 2012
    Posts
    7

    Re: Slow MD-program. My cpu runs it faster than my GPU

    I actually think I might have figured out the problem, but I am not sure how to solve it. The thing is that in one of my kernels I have the following when updating the velocities:
    Code :
        		states[x+antalAtomer*2] += F*dx*dt;
    			states[x+antalAtomer*3] += F*dy*dt;
     
    			states[k+antalAtomer*2] -= F*dx*dt;
    			states[k+antalAtomer*3] -= F*dy*dt;


    Since these positions in the global vector "states" can be read/written by another thread in another work item, you clearly have a problem. I am not sure how to work around this though. Atomic functions seemed like the solution, but it appears they only work on integers.


    this did not work:

    Code :
    			barrier(CLK_GLOBAL_MEM_FENCE);
        			states[x+antalAtomer*2] += F*dx*dt;
    barrier(CLK_GLOBAL_MEM_FENCE);	
    			states[x+antalAtomer*3] += F*dy*dt;
    barrier(CLK_GLOBAL_MEM_FENCE);
    			states[k+antalAtomer*2] -= F*dx*dt;
    barrier(CLK_GLOBAL_MEM_FENCE);
    			states[k+antalAtomer*3] -= F*dy*dt;
    barrier(CLK_GLOBAL_MEM_FENCE);
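
    The only workaround I have found so far is to emulate an atomic add for floats with atomic_cmpxchg on the 32-bit pattern of the value (needs 32-bit global integer atomics, which are core in OpenCL 1.1). This is just a sketch, the function name is my own, and I haven't verified it properly:

    Code :
    	// Sketch: emulate atomic add on a float by compare-exchanging its bit pattern.
    	void atomic_add_float(volatile __global float * addr, float val)
    	{
    		float old_val, new_val;
    		do
    		{
    			old_val = *addr;
    			new_val = old_val + val;
    		} while (atomic_cmpxchg((volatile __global uint *)addr,
    			as_uint(old_val), as_uint(new_val)) != as_uint(old_val));
    	}

    and then the update in the kernel would become e.g. atomic_add_float(&states[k+antalAtomer*2], -F*dx*dt); instead of the plain -=.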

  6. #6
    Senior Member
    Join Date
    Aug 2011
    Posts
    271

    Re: Slow MD-program. My cpu runs it faster than my GPU

    On AMD hardware, the local work size should be a multiple of 64; otherwise you're throwing away a corresponding fraction of your ALU capacity (with a local size of N < 64, only N of the 64 lanes in a wavefront do useful work).
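
    If your problem size isn't already a multiple of 64, one option (a sketch, not taken from your code) is to round the global size up and make the kernel ignore the padding work items:

    Code :
    	// Round the global work size up to a multiple of the local work size.
    	size_t local  = 64;
    	size_t global = ((antalAtomer + local - 1) / local) * local;
    	// pass &global and &local to clEnqueueNDRangeKernel, and start the kernel with:
    	//     if (get_global_id(0) >= antalAtomer) return;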

    Atomics should only be used in limited cases - like enqueuing results - they're too slow for per-work-item, per-inner-loop calculations, and by definition they always will be (they remove the parallelism you're trying to add).

    TBH I can't really follow your code, but if you have an algorithm whose items are interdependent (i.e. the output of a previous result feeds into a later one), then parallelising it can range from simple to very complex.

    For any CPU implementation I've seen, the local work group executes as a single thread, and multiple local work items are implemented using loops (and/or SIMD channels), so any ordering problems between work items will be hidden. A GPU, on the other hand, will execute many work items concurrently within the work group.
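
    For a force calculation like yours, one common way to remove the interdependence is to make each work item responsible for exactly one atom: it loops over all the other atoms, accumulates the force on its own atom in private variables, and only ever writes its own velocity entries. Each pair force gets computed twice, but there are no writes to other atoms' data, so no atomics or ordering tricks are needed. Roughly like this (untested sketch using your states layout; the kernel name is made up):

    Code :
    	// Untested sketch: one work item per atom, no writes to other atoms' data.
    	__kernel void forces_per_atom(
    		__global float * states,
    		int antalAtomer,
    		float dt)
    	{
    		const int x = get_global_id(0);
    		if (x >= antalAtomer) return;   // guard for a padded global size
 
    		float fx = 0.0f;
    		float fy = 0.0f;
    		for (int k = 0; k < antalAtomer; k++)
    		{
    			if (k == x) continue;
    			float dx = states[x] - states[k];
    			float dy = states[antalAtomer+x] - states[antalAtomer+k];
    			float r  = sqrt(dx*dx + dy*dy);
    			float F  = 12*pow(r, -14.0f) - 6*pow(r, -8.0f);
    			fx += F*dx;
    			fy += F*dy;
    		}
    		// Only this work item touches the velocity of atom x.
    		states[x+antalAtomer*2] += fx*dt;
    		states[x+antalAtomer*3] += fy*dt;
    	}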

