Results 1 to 10 of 10

Thread: Memory Problem when trying to speed up Kernel

  1. #1
    Junior Member
    Join Date
    Dec 2010
    Posts
    2

    Memory Problem when trying to speed up Kernel

    I'm fairly new to OpenCL and I'm running OS X 10.6 which the Nvidia 330 graphics card. I'm working on a cloth simulation in C++ which I've managed to write a kernel for that compiles and runs. The problem is that it's running slower than it did on the cpu without OpenCL. I believe the reason for this is that every time I call the update() method to do some calculations I'm setting the context and device and then recompiling the Kernel from source.

    To solve this, I tried encapsulating the various OpenCL types I needed into the cloth simulation class to try and store them there, and then created an initCL() to set up these values. I then created a runCL() to execute the kernel. Strangely this only gives me a memory problem when I separate the OpenCL stuff into two methods. It works fine if the initCL() and runCL() are both combined into one method though which is why I'm a little stuck.
    The initialisation of OpenCL variables
    Code :
    int VPESimulationCloth::initCL(){
    	// Find the CPU CL device, as a fallback
    	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    	assert(err == CL_SUCCESS);
     
    	// Find the GPU CL device, this is what we really want
    	// If there is no GPU device is CL capable, fall back to CPU
    	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    	if (err != CL_SUCCESS) err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    	assert(device);
     
    	// Get some information about the returned device
    	cl_char vendor_name[1024] = {0};
    	cl_char device_name[1024] = {0};
    	err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), 
    						  vendor_name, &returned_size);
    	err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), 
    						   device_name, &returned_size);
    	assert(err == CL_SUCCESS);
    	//printf("Connecting to %s %s...\n", vendor_name, device_name);
     
    	// Now create a context to perform our calculation with the 
    	// specified device 
    	context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    	assert(err == CL_SUCCESS);
     
    	// And also a command queue for the context
    	cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
     
    	// Load the program source from disk
    	// The kernel/program should be in the resource directory
    	const char * filename = "clothSimKernel.cl";
    	char *program_source = load_program_source(filename);
     
     
    	program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
    										   NULL, &err);
    	if (!program[0])
    	{
    		printf("Error: Failed to create compute program!\n");
    		return EXIT_FAILURE;
    	}
    	assert(err == CL_SUCCESS);
     
    	err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
    	if (err != CL_SUCCESS)
    	{
    		char build[2048];
    		clGetProgramBuildInfo(program[0], device, CL_PROGRAM_BUILD_LOG, 2048, build, NULL);
    		printf("Build Log:\n%s\n",build);
    		if (err == CL_BUILD_PROGRAM_FAILURE) {
    			printf("CL_BUILD_PROGRAM_FAILURE\n");
    		}
    	}
    	if (err != CL_SUCCESS) {
    		cout<<getErrorDesc(err)<<endl;
    	}
    	assert(err == CL_SUCCESS);
    	//writeBinaries();
    	// Now create the kernel "objects" that we want to use in the example file 
    	kernel[0] = clCreateKernel(program[0], "clothSimulation", &err);
     
    }

    The method to execute the kernel
    Code :
    int VPESimulationCloth::runCL(){
     
    	// Find the GPU CL device, this is what we really want
    	// If there is no GPU device is CL capable, fall back to CPU
    	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    	if (err != CL_SUCCESS) err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    	assert(device);
     
    	// Get some information about the returned device
    	cl_char vendor_name[1024] = {0};
    	cl_char device_name[1024] = {0};
    	err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), 
    						  vendor_name, &returned_size);
    	err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), 
    						   device_name, &returned_size);
    	assert(err == CL_SUCCESS);
    	//printf("Connecting to %s %s...\n", vendor_name, device_name);
     
    	// Now create a context to perform our calculation with the 
    	// specified device 
     
    	//cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
    	//memory allocation
    	cl_mem nowPos_mem, prevPos_mem, rForce_mem, mass_mem, passive_mem, canMove_mem,numPart_mem, theForces_mem, numForces_mem, drag_mem, answerPos_mem;
     
    	// Allocate memory on the device to hold our data and store the results into
    	buffer_size = sizeof(float4) * numParts;
     
    	// Input arrays 
    	nowPos_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    [color=#FF0000]	err = clEnqueueWriteBuffer(cmd_queue, nowPos_mem, CL_TRUE, 0, buffer_size,
    							   (void*)nowPos, 0, NULL, NULL);
    	if (err != CL_SUCCESS) {
    		cout<<getErrorDesc(err)<<endl;
    	}
    	assert(err == CL_SUCCESS);[/color]
    	prevPos_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    	err = clEnqueueWriteBuffer(cmd_queue, prevPos_mem, CL_TRUE, 0, buffer_size,
    							   (void*)prevPos, 0, NULL, NULL);
    	assert(err == CL_SUCCESS);
    	rForce_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    	err = clEnqueueWriteBuffer(cmd_queue, rForce_mem, CL_TRUE, 0, buffer_size,
    							   (void*)rForce, 0, NULL, NULL);
    	assert(err == CL_SUCCESS);
    	mass_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    	err = clEnqueueWriteBuffer(cmd_queue, mass_mem, CL_TRUE, 0, buffer_size,
    							   (void*)mass, 0, NULL, NULL);
    	assert(err == CL_SUCCESS);
    	answerPos_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
    	//uint buffer
    	buffer_size = sizeof(uint) * numParts;
    	passive_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    	err = clEnqueueWriteBuffer(cmd_queue, passive_mem, CL_TRUE, 0, buffer_size,
    							   (void*)passive, 0, NULL, NULL);
    	assert(err == CL_SUCCESS);
    	canMove_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    	err = clEnqueueWriteBuffer(cmd_queue, canMove_mem, CL_TRUE, 0, buffer_size,
    							   (void*)canMove, 0, NULL, NULL);
    	assert(err == CL_SUCCESS);
     
    	buffer_size = sizeof(float4) * numForces;
    	theForces_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    	err = clEnqueueWriteBuffer(cmd_queue, theForces_mem, CL_TRUE, 0, buffer_size,
    							   (void*)theForces, 0, NULL, NULL);
    	assert(err == CL_SUCCESS);
     
    	//drag float
    	buffer_size = sizeof(float);
    	drag_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
    	err |= clEnqueueWriteBuffer(cmd_queue, drag_mem, CL_TRUE, 0, buffer_size,
    								(void*)drag, 0, NULL, NULL);
    	assert(err == CL_SUCCESS);
     
    	// Now setup the arguments to our kernel
    	err  = clSetKernelArg(kernel[0],  0, sizeof(cl_mem), &nowPos_mem);
    	err |= clSetKernelArg(kernel[0],  1, sizeof(cl_mem), &prevPos_mem);
    	err |= clSetKernelArg(kernel[0],  2, sizeof(cl_mem), &rForce_mem);
    	err |= clSetKernelArg(kernel[0],  3, sizeof(cl_mem), &mass_mem);
    	err |= clSetKernelArg(kernel[0],  4, sizeof(cl_mem), &passive_mem);
    	err |= clSetKernelArg(kernel[0],  5, sizeof(cl_mem), &canMove_mem);
    	err |= clSetKernelArg(kernel[0],  6, sizeof(cl_mem), &numParts);
    	err |= clSetKernelArg(kernel[0],  7, sizeof(cl_mem), &theForces_mem);
    	err |= clSetKernelArg(kernel[0],  8, sizeof(cl_mem), &numForces);
    	err |= clSetKernelArg(kernel[0],  9, sizeof(cl_mem), &drag_mem);
    	err |= clSetKernelArg(kernel[0],  10, sizeof(cl_mem), &answerPos_mem);
    	if (err != CL_SUCCESS) {
    		cout<<getErrorDesc(err)<<endl;
    	}
    	assert(err == CL_SUCCESS);
    	// Run the calculation by enqueuing it and forcing the 
    	// command queue to complete the task
    	size_t global_work_size = numParts;
    	size_t local_work_size = global_work_size/8;
    	err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, 
    								 &global_work_size, &local_work_size, 0, NULL, NULL);
    	if (err != CL_SUCCESS) {
    		cout<<getErrorDesc(err)<<endl;
    	}
     
    	assert(err == CL_SUCCESS);
    	//clFinish(cmd_queue);
     
    	// Once finished read back the results from the answer 
    	// array into the results array
    	//reset the buffer first
    	buffer_size = sizeof(float4) * numParts;
    	err = clEnqueueReadBuffer(cmd_queue, answerPos_mem, CL_TRUE, 0, buffer_size, 
    							  answerPos, 0, NULL, NULL);
    	if (err != CL_SUCCESS) {
    		cout<<getErrorDesc(err)<<endl;
    	}
     
     
    	//cl mem
    	clReleaseMemObject(nowPos_mem);
    	clReleaseMemObject(prevPos_mem);
    	clReleaseMemObject(rForce_mem);
    	clReleaseMemObject(mass_mem);
    	clReleaseMemObject(passive_mem);
    	clReleaseMemObject(canMove_mem);
    	clReleaseMemObject(theForces_mem);
    	clReleaseMemObject(drag_mem);
    	clReleaseMemObject(answerPos_mem);
    	clReleaseCommandQueue(cmd_queue);
    	clReleaseContext(context);
    	assert(err == CL_SUCCESS);
    	return err;
     
    }


    The program compiles and runs but I then get a SIGABRIT or EXC BAD ACCESS at the point marked in red. When I get a SIGABRIT I get the error CL_INVALID_COMMAND_QUEUE but I can't work out for the life of me why this only happens when I split up the two methods.

    Also if anyone can tell me a better way to do this or if the JIT recompiling isn't what's slowing my code down then I'd be very grateful because I've been staring at this for far too long!

    Thanks,

    Jon

  2. #2
    Member
    Join Date
    Oct 2010
    Location
    Vancouver, Canada
    Posts
    66

    Re: Memory Problem when trying to speed up Kernel

    Do you happen to get this failure on the second call to runCL? At the end of runCL you're releasing all your objects, thus on the second call to runCL you would be using invalid objects.

    I suggest that you create a deinitCL function as well, and move the release operations there. I also strongly recommend that you move ALL your object creation (i.e. your cl_mem objects) and acquisition (i.e. getting the device) into initCL/deinitCL too. Ideally runCL should only be calling enqueue and setarg methods.

  3. #3
    Junior Member
    Join Date
    Dec 2010
    Posts
    2

    Re: Memory Problem when trying to speed up Kernel

    Ah thank you so much! How could I miss something so frustratingly obvious?! That was exactly it and now it's running perfectly smoothly. I think in my mind that was just freeing memory as if it was a normal malloc() so I'd mentally skipped over it.

  4. #4
    Junior Member
    Join Date
    Jan 2012
    Posts
    4

    Re: Memory Problem when trying to speed up Kernel

    Hey there,

    I'm completely new to OpenCL and since I seem to have a similar Issue I didn't want to open a new thread.

    But what's different in my case is that I only have a single method in which I create the kernel, run and delete it. The method itself is called several times (approx. 20), but so far I don't access any data after I free memory.
    My program crashes appear to be random. Sometimes it crashes on the first execution of "move", sometimes it runs through nicely and sometimes it crashes in between. I don't get any error code, the program window just closes and that's it.

    I really tried everything I could imagine!
    Hopefully that's just a beginners issue.

    The code:
    Code :
    		void move(
    			const float *SrcPos, float *DestPos,
    			size_t numVertices)
    		{			
    			cl_context context = 0;
    			cl_command_queue commandQueue = 0;
    			cl_program program = 0;
    			cl_device_id device = 0;
    			cl_kernel kernel = 0;
    			cl_int errNum;
     
    			// Create an OpenCL context on first available platform
    			context = CreateContext();
     
    			// Create a command-queue on the first device available
    			// on the created context
    			commandQueue = CreateCommandQueue(context, &device);
     
    			char* kernelPath = "move.cl";			
     
    			program = CreateProgram(context, device, kernelPath);
     
    			// Create OpenCL kernel
    			kernel = clCreateKernel(program, "move_kernel", NULL);			
     
    			// Create memory objects that will be used as arguments to kernel.
    			float* result = (float*)malloc(numVertices * sizeof(float));
     
    			cl_mem d_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
    										   sizeof(float) * numVertices, NULL, &errNum);	
     
    			cl_mem d_pSrcPos = clCreateBuffer( context, CL_MEM_READ_ONLY,
    										   sizeof(float) * numVertices * srcPosStride, (void*)pSrcPos, &errNum);		
     
    			//#########################
    			// This is the call that causes the crash
    			//#########################
    			errNum = clEnqueueWriteBuffer(commandQueue, d_pSrcPos, CL_TRUE, 0, sizeof(float) * numVertices * 3, 
    									(void*)pSrcPos, 0, NULL, NULL);
    			//#########################
    			if (errNum != CL_SUCCESS)
    			{
    				std::cerr << "Error setting kernel argument." << std::endl;
    				std::cout << "Error code: " << errNum << std::endl;	
    				std::getchar();
    				return;
    			}
     
    			cl_mem d_pDestPos = clCreateBuffer( context, CL_MEM_WRITE_ONLY,
    										   sizeof(float) * numVertices * destPosStride, NULL, &errNum);
     
    			errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_pSrcPos);
    			errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_pDestPos);
    			errNum |= clSetKernelArg(kernel, 2, sizeof(int), &numVertices);
    			errNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_result);
     
    			if (errNum != CL_SUCCESS)
    			{
    				std::cerr << "Error setting kernel arguments." << std::endl;
    				std::cout << "Error code: " << errNum << std::endl;
    				Cleanup(context, commandQueue, program, kernel, memObjects);
    				std::getchar();
    				return;
    			}
     
    			size_t localWorkSize[1] = { 1 };
    			size_t globalWorkSize[1] = { numVertices };
     
    			// Queue the kernel up for execution across the array
    			errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
    											globalWorkSize, localWorkSize,
    											0, NULL, NULL);
     
    			// Read the output buffer back to the Host		
    			errNum = clEnqueueReadBuffer(commandQueue, d_result, CL_TRUE,
    										 0, numVertices * sizeof(float), result,
    										 0, NULL, NULL);
     
    			std::cout << "Executed program succesfully." << std::endl;
     
    			free(result);
     
    			clReleaseMemObject(d_result);
    			clReleaseMemObject(d_pSrcPos);
    			clReleaseMemObject(d_pDestPos);
     
    			clReleaseCommandQueue(commandQueue);
     
    			clReleaseKernel(kernel);
     
    			clReleaseProgram(program);
     
    			clReleaseContext(context);
    }

    The kernel so far looks like this:
    Code :
    __kernel void move_kernel(__global const float *d_pSrcPos,
    											__global float *d_pDestPos,
    											int numVertices,
    											__global float *d_result
    											)
    {	
            int gid = get_global_id(0);	
     
            d_result[gid] = gid;
    }

    Since I'm still working on it, the variable "result" is just a test variable, that I sometimes use to output values (e.g. get_global_id(), or such).
    For the sake of readability I also deleted all the error handling like
    Code :
    if(kernel == NULL)
    since I really tested everything and the only thing that causes the crash is the marked function call.

    Does anyone have an Idea what could cause my Problem?

    Thanks in advance!

    Cheers,
    --Markus

  5. #5
    Member
    Join Date
    Oct 2010
    Location
    Vancouver, Canada
    Posts
    66

    Re: Memory Problem when trying to speed up Kernel

    sizeof(float) * numVertices * 3
    The write is the only place I see where you multiply by 3.

  6. #6
    Junior Member
    Join Date
    Jan 2012
    Posts
    4

    Re: Memory Problem when trying to speed up Kernel

    Quote Originally Posted by andrew.brownsword
    sizeof(float) * numVertices * 3
    The write is the only place I see where you multiply by 3.
    Good call, but sadly that's just a leftover I forgot in the hurry yesterday. Sorry!
    Actually the 3 is fine for this example, instead the clCreateBuffer should be as well
    Code :
    cl_mem d_pSrcPos = clCreateBuffer( context, CL_MEM_READ_ONLY,
                                     sizeof(float) * numVertices * 3, (void*)pSrcPos, &errNum);

    (Each of the SrcPos elements is a vertex, which will later consist of either 2 or 3 coordinates, therefor I will pass this srcPosStride to know how many coordinates to modify before moving the next vertex. But for now it has no meaning so I wanted to remove it to make it less complicated... mission failed! ^^ )

  7. #7
    Member
    Join Date
    Oct 2010
    Location
    Vancouver, Canada
    Posts
    66

    Re: Memory Problem when trying to speed up Kernel

    And I assume the first parameter should be pSrcPos as well?

    Its a bit challenging to figure out what your problem is when you aren't posting the code you're actually running...

  8. #8
    Junior Member
    Join Date
    Jan 2012
    Posts
    4

    Re: Memory Problem when trying to speed up Kernel

    Yes you're right.
    The Issue with my actual code is, that I'm not sure how much of the code I'm allowed to share. But mainly it's very messy with about 3 times as many lines of code, which I'm constantly commenting in and out as well as additional buffers and variables that I'm not using so far. For me it would be even more confusing to read such code, but if you wish I can try to clean it and post it again.

    But I can say, that the code runs nicely and reliable as soon as I delete the line:
    Code :
    errNum = clEnqueueWriteBuffer(commandQueue, d_pSrcPos, CL_TRUE, 0, sizeof(float) * numVertices * 3, (void*)pSrcPos, 0, NULL, NULL);
    I also tried to use
    Code :
    cl_mem d_pSrcPos = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * numVertices * 3, (void*)pSrcPos, &errNum);
    before instead, but that causes the same random crash behavior...

    Thanks,
    --Markus

  9. #9
    Member
    Join Date
    Oct 2010
    Location
    Vancouver, Canada
    Posts
    66

    Re: Memory Problem when trying to speed up Kernel

    Have you checked all your error and other return values? How big is the buffer you're asking for? Is it possible you've previously leaked memory? Does this happen with another vendor's driver or different device?

  10. #10
    Junior Member
    Join Date
    Jan 2012
    Posts
    4

    Re: Memory Problem when trying to speed up Kernel

    Have you checked all your error and other return values?
    Yes, I did. Error code is always 0.
    How big is the buffer you're asking for?
    The numVertices varies between 4 and 9907, so the full amount should be between 48 bytes and 118,884 bytes, right? But what's strange is that it doesn't crash only on the biggest object, but also on smaller ones...
    Is it possible you've previously leaked memory?
    This is the only part where I use OpenCL and i try to convert an already working c++ method.
    Does this happen with another vendor's driver or different device?
    I currently don't have the chance to try it on a different GPU, but I could try the CPU.
    Does it help if I tell you I'm running this on a Nvidia Quadro 5000 and the driver version 285.58.

    Thank you!

Similar Threads

  1. Replies: 6
    Last Post: 02-28-2013, 04:59 PM
  2. Question about Kernel Arguement Speed
    By Gauge in forum OpenCL
    Replies: 1
    Last Post: 04-21-2011, 02:56 PM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •