Hi, I'm getting a CL_OUT_OF_RESOURCES error. I'll post two implementations; they are practically identical.
The first one runs OK. It calls a function in a loop, and that function runs my kernel with different arguments.

Code :
float Parallel::step(){
	size_t i=0,max=0;
	float diff=0;
 
        //PAY ATTENTION HERE AND TO IMPLEMANTION OF stepGRoup()
	for(i=0;i<groups.size();++i){
		stepGroup(i);
	}
 
	try{
	queue.enqueueBarrier();
	//Read x buffer back
	queue.enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(float)*rows, x);
 
	} catch ( cl::Error& err ) {
		std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
	}
}
 
void Parallel::stepGroup(size_t grp_idx){
	size_t group_rows=groups[grp_idx].size();
	size_t rows_offset=0;
 
	for(size_t i=0;i<grp_idx;++i)
		rows_offset+=groups[i].size();
 
 
	try{
 
	size_t local_size=512;
	size_t global_size;
	if(group_rows%local_size)
		global_size=(group_rows/local_size+1)*local_size;
	else
		global_size=group_rows/local_size;
 
	//Kernel setargs
	kernel.setArg(0,d_values);	
	kernel.setArg(1,d_col_idx);	
	kernel.setArg(2,d_row_index);	
	kernel.setArg(3,d_b);	
	kernel.setArg(4,d_x);	
	kernel.setArg(5,rows_offset);	
	kernel.setArg(6,group_rows);	
	kernel.setArg(7,max_nz);	
	kernel.setArg(8,rows);	
 
 
	//Enque the kernel now
	//std::cerr<<"global_size: "<<global_size<<" local_size: "<<local_size<<std::endl;
 
	queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(global_size),cl::NDRange(local_size));
 
	} catch ( cl::Error& err ) {
		std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
	}	
 
}

In the second one, instead of looping over the groups and calling stepGroup() for each, I call a single function, stepAllGroups(), which contains the loop over the groups:

Code :
float Parallel::step(){
	size_t i=0,max=0;
	float diff=0;
 
        //PAY ATTENTION HERE AND TO IMPLEMANTION OF stepAllGroups()
	stepAllGroups();
 
	try{
	queue.enqueueBarrier();
	//Read x buffer back
	queue.enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(float)*rows, x);
 
	} catch ( cl::Error& err ) {
		std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
	}
}
 
void Parallel::stepAllGroups(){
	size_t group_rows;
	size_t rows_offset=0;
 
	size_t local_size=512;
	size_t global_size;
 
	queue.enqueueBarrier();
	for(size_t i=0;i<groups.size();++i){
		group_rows=groups[i].size();
		rows_offset+=group_rows;
 
		std::cerr<<"Calling kernel on group "<<i<<" with group_rows "<<group_rows<<std::endl;
 
		try{
 
			if(groups[i].size()%local_size)
				global_size=(group_rows/local_size+1)*local_size;
			else
				global_size=group_rows/local_size;
 
			kernel.setArg(0,d_values);	
			kernel.setArg(1,d_col_idx);	
			kernel.setArg(2,d_row_index);	
			kernel.setArg(3,d_b);	
			kernel.setArg(4,d_x);	
			kernel.setArg(5,rows_offset);	
			kernel.setArg(6,group_rows);	
			kernel.setArg(7,max_nz);	
			kernel.setArg(8,rows);	
 
			queue.enqueueBarrier();
			queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(global_size),cl::NDRange(local_size));
		} catch ( cl::Error& err ) {
			std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
		}	
 
	}
 
	queue.enqueueBarrier();
 
}

In the second implementation, I'm getting CL_OUT_OF_RESOURCES when calling enqueueReadBuffer().

By the way, I added the enqueueBarrier() calls in an effort to solve this, with no success.

I don't understand where the difference is, since it seems to be only a matter of how I chose to structure my code and nothing to do with OpenCL, but something is definitely happening.

Any pointers will be appreciated.

Best regards
Mat