I have a question related to the maximum number of work-items that can be set using enqueueNDRangeKernel. According to the getInfo-functions my graphics card, the meager AMD Radeon HD 6450, has the following specifications:
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_SIZES()[0]: 256

But when I set the cl::NDRange global(dim), with dim > 256, it still executes and calculates values.

Here is the sample code, and OpenCLObject is my own class which takes care of storing kernels, creating a program/platforms etc.
Code :
float matrixMultOpenCL(int dim,OpenCLObject &CLObj)
{
	if(dim % 4)
	{
		cout << "dims must be mult of 4";
		exit(1);
	}
 
 
	float* A = new float[dim*dim];
	float* B = new float[dim*dim];
	float* result = new float[dim*dim];
 
 
	for (int i = 0; i < dim*dim; i++)
	{
		A[i] = (float)(rand() % 10);
		B[i] = (float)(rand() % 10);
	}
	for (int i = 0; i < dim*dim; i++)
			result[i] = 0.0f;
 
	cl_ulong start, finish;
	cl::NDRange global(dim);
	cl:: NDRange local(1);
	cl::NDRange offset(NULL);
	//std::cout << CLObj.get_queues()[1].getInfo<CL_QUEUE_DEVICE>().getInfo<CL_DEVICE_PROFILING_TIMER_RESOLUTION>();
	cl::Event profiling;
	int matMultIndx = CLObj.findKernelIndex("matrix_mult");
 
	void* modelOutputMappedMem;
	cl::Buffer bufferR = cl::Buffer(CLObj.get_context(), CL_MEM_WRITE_ONLY, sizeof(float)*dim*dim);
	cl::Buffer bufferA = cl::Buffer(CLObj.get_context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*dim*dim,A);
	cl::Buffer bufferB = cl::Buffer(CLObj.get_context(), CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR, sizeof(float)*dim*dim,B);
 
	CLObj.get_kernels()[matMultIndx].setArg(0, bufferA);
	CLObj.get_kernels()[matMultIndx].setArg(1, bufferB);
	CLObj.get_kernels()[matMultIndx].setArg(2, bufferR);
 
	CLObj.get_queues()[0].enqueueNDRangeKernel(CLObj.get_kernels()[matMultIndx], cl::NullRange, global,local,NULL,&profiling);
 
	modelOutputMappedMem = CLObj.get_queues()[0].enqueueMapBuffer(bufferR,
	CL_TRUE,CL_MAP_READ, 0,sizeof(float)*dim*dim);//,&waitList);
	memcpy(result,modelOutputMappedMem,sizeof(float)*dim*dim);
	CLObj.get_queues()[0].enqueueUnmapMemObject(bufferR,modelOutputMappedMem);
 
 
	start = profiling.getProfilingInfo<CL_PROFILING_COMMAND_START>();
	finish = profiling.getProfilingInfo<CL_PROFILING_COMMAND_END>();
	for(int i = 0; i <dim; i++)
	{
		cout <<"last el: " <<result[dim*dim-1-i]<<endl;
	}
	delete A;
	delete B;
	delete result;
 
	//cout<<"time executing kernel: " << float(finish-start)/1000.0f <<"uS";
	return float(finish-start)/1000.0f;
}

Shouldn't this produce an error, or at least return erroneous values for the matrix-multiplication?

Kernel-code:
Code :
// Naive matrix multiply: one work-item per output row, with rows read as
// float4 vectors (hence the host's requirement that dim be a multiple of 4).
// NOTE(review): the inner product computes c_mat[i] = dot(row gid of a_mat,
// row i of b_mat) in row-major layout — that is A * B^T unless b_mat is
// supplied already transposed. Confirm the intended layout of b_mat.
kernel void matrix_mult(__global float4 *a_mat,
__global float4 *b_mat, __global float *c_mat) 
{
	float sum;
	// Assumes the launch uses global size == number of rows (dim).
	int num_rows = get_global_size(0);
	int vectors_per_row = num_rows/4;
	// Index (in float4 units) of the first vector of this work-item's row.
	int start = get_global_id(0) * vectors_per_row;
	a_mat += start;     // advance to this work-item's row of A
	c_mat += start*4;   // matching row of C, addressed in scalar floats
	for(int i=0; i<num_rows; i++) 
	{
		sum = 0.0f;
		for(int j=0; j<vectors_per_row; j++) 
		{
			sum += dot(a_mat[j],
			b_mat[i*vectors_per_row + j]);
		}
		c_mat[i] = sum;
	}
}