I have Mac OS X 10.6.2 (Snow Leopard) and an ATI Radeon HD 4870. For this card, CL_DEVICE_MAX_WORK_GROUP_SIZE = 1024. However, for some reason I cannot use any work-group size larger than 256: for example, if I try to use 512x1x1, clEnqueueNDRangeKernel returns CL_INVALID_WORK_GROUP_SIZE. Any ideas why this might be happening? Could it be something inherent to ATI Stream?

Here's my code (error handling stripped, since no errors are generated in the middle):

Code :
clGetPlatformIDs(max_num_platforms, platforms, &num_platforms);
clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_GPU, num_entries, devices, &num_devices);
device = devices[0];
cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
program = clCreateProgramWithSource( context, 1, &kernel_str, NULL, NULL);
clBuildProgram(program, 1, &device, NULL, NULL, NULL);
kernel = clCreateKernel(program, "inc", NULL);
cl_mem memobj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float)*n, srcA, NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj );
 
size_t gws = 512, lws = 512;
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &gws, &lws, 0, NULL, NULL);
assert(err==CL_SUCCESS);

The kernel is very simple:
Code :
/*
 * Increment every element of `a` in place.
 *
 * Each work-item handles exactly one element: its global id in dimension 0
 * is used directly as the array index, so the buffer must hold at least as
 * many elements as the global work size.
 *
 * The pointer is not const-qualified because the kernel writes through it.
 * The element type is float so that it matches the host-side buffer, which
 * is allocated as sizeof(float) * n bytes; it also avoids depending on
 * double-precision support.
 */
__kernel void inc (__global float *a) {
      size_t k = get_global_id(0);
      a[k] += 1.0f;
}