I get the following error when trying to run with a matrix size of 2048: ERROR: clEnqueueReadBuffer(-5). It works fine with a size of 1024. Are there any buffer limits I should be aware of? I am running on a crappy NVidia Quadro FX 570.

Relevant host code:
Code :
unsigned int matrixSize = 2048;
const unsigned int matrixSizeSq = matrixSize * matrixSize;
try {
 
// (...)
 
// Create kernel data
float* a = new float[matrixSizeSq];
float* b = new float[matrixSizeSq];
float* c = new float[matrixSizeSq];
 
for (unsigned int i = 0; i < matrixSizeSq; ++i) {
  float val = static_cast<float>(i);
  a[i] = val;
  b[i] = val + 1.0f;
  c[i] = 0.0f;
 }
 
// Set kernel arguments
cl::Buffer aBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                   matrixSizeSq * sizeof(float), a, &err);
cl::Buffer bBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                   matrixSizeSq * sizeof(float), b, &err);
cl::Buffer cBuffer(context, CL_MEM_WRITE_ONLY,
                   matrixSizeSq * sizeof(float), 0, &err);
 
// Create kernel
cl::Kernel kernel(program, "matrixMult", &err);
 
kernel.setArg(0, aBuffer);
kernel.setArg(1, bBuffer);
kernel.setArg(2, cBuffer);
 
cl::CommandQueue queue(context, devices[0], 0, &err);
 
std::vector<cl::Event> events;
cl::Event matMultEvent;
queue.enqueueNDRangeKernel(kernel, 
                           cl::NullRange, 
                           cl::NDRange(matrixSize, matrixSize),
                           cl::NDRange(16, 16),
                           NULL,
                           &matMultEvent);
 
events.push_back(matMultEvent);
 
queue.enqueueReadBuffer(cBuffer, CL_FALSE, 0, matrixSizeSq*sizeof(float), c,
                        &events);
queue.finish();
 
// (...)
 
}
  catch (cl::Error err) {
    std::cerr 
      << "ERROR: "
      << err.what()
      << "("
      << err.err()
      << ")"
      << std::endl;
}


Kernel code
Code :
// Computes one element of the output per work-item:
//
//   c[col + row * n] = sum over k of a[col + k * n] * b[k + row * n]
//
// where col = get_global_id(0), row = get_global_id(1) and
// n = get_global_size(0).
//
// a, b : input arrays, read only.
// c    : output array, one element written per work-item.
//
// n is taken from dimension 0 only and is also used as the stride for
// dimension 1, so the kernel assumes a square NDRange (n x n) and arrays of
// at least n * n elements.
// NOTE(review): with row-major storage and `row` as the row index this
// evaluates C = B * A rather than A * B - confirm the intended operand order.
__kernel void matrixMult(__global const float* a,
                         __global const float* b,
                         __global float* c)
{
  // Keep every index unsigned so the loop bound and the index arithmetic
  // do not mix signed and unsigned operands.
  const unsigned int col = get_global_id(0);
  const unsigned int row = get_global_id(1);
  const unsigned int n = get_global_size(0);

  // Offset of this work-item's row in b and c; invariant across the loop.
  const unsigned int rowBase = row * n;

  float sum = 0.0f;
  for (unsigned int k = 0; k < n; ++k)
    sum = sum + a[col + (k * n)] * b[k + rowBase];
  c[col + rowBase] = sum;
}