PDA

View Full Version : Unable to run kernel with matrix of size > 1024



atlemann
01-26-2010, 07:35 AM
I get the following error when trying to run with a matrix size of 2048: ERROR: clEnqueueReadBuffer(-5). It works fine with 1024. Are there any buffer limits I should be aware of? I am running on a crappy NVidia Quadro FX 570.

Relevant host code:


unsigned int matrixSize = 2048;
const unsigned int matrixSizeSq = matrixSize * matrixSize;
try {

// (...)

// Create kernel data
float* a = new float[matrixSizeSq];
float* b = new float[matrixSizeSq];
float* c = new float[matrixSizeSq];

for (unsigned int i = 0; i < matrixSizeSq; ++i) {
float val = static_cast<float>(i);
a[i] = val;
b[i] = val + 1.0f;
c[i] = 0.0f;
}

// Set kernel arguments
cl::Buffer aBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
matrixSizeSq * sizeof(float), a, &err);
cl::Buffer bBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
matrixSizeSq * sizeof(float), b, &err);
cl::Buffer cBuffer(context, CL_MEM_WRITE_ONLY,
matrixSizeSq * sizeof(float), 0, &err);

// Create kernel
cl::Kernel kernel(program, "matrixMult", &err);

kernel.setArg(0, aBuffer);
kernel.setArg(1, bBuffer);
kernel.setArg(2, cBuffer);

cl::CommandQueue queue(context, devices[0], 0, &err);

std::vector<cl::Event> events;
cl::Event matMultEvent;
queue.enqueueNDRangeKernel(kernel,
cl::NullRange,
cl::NDRange(matrixSize, matrixSize),
cl::NDRange(16, 16),
NULL,
&matMultEvent);

events.push_back(matMultEvent);

queue.enqueueReadBuffer(cBuffer, CL_FALSE, 0, matrixSizeSq*sizeof(float), c,
&events);
queue.finish();

// (...)

}
catch (cl::Error err) {
std::cerr
<< "ERROR: "
<< err.what()
<< "("
<< err.err()
<< ")"
<< std::endl;
}




Kernel code:


// Computes one element of a matrix product per work-item.
//
// Work-item (x, y) writes
//     c[x + y*n] = sum over k in [0, n) of a[x + k*n] * b[k + y*n]
// where n is the global work size of dimension 0.
//
// a, b: input matrices, read only.
// c:    output matrix, one element written per work-item.
//
// NOTE(review): n comes from dimension 0 only, so this presumably expects a
// square n x n global range and buffers of at least n*n floats — confirm
// against the host-side launch.
__kernel void matrixMult(__global const float* a,
                         __global const float* b,
                         __global float* c)
{
    // Work-item coordinates select the output element.
    int globalIdx = get_global_id(0);
    int globalIdy = get_global_id(1);
    // Matrix edge length, taken from the global size of dimension 0.
    int n = get_global_size(0);

    float sum = 0.0f;
    // k is a signed int like n, so the loop bound comparison and the index
    // arithmetic no longer mix signed and unsigned operands.
    for (int k = 0; k < n; ++k) {
        sum += a[globalIdx + (k * n)] * b[k + (globalIdy * n)];
    }
    c[globalIdx + (globalIdy * n)] = sum;
}

dbs2
01-26-2010, 12:33 PM
I believe there is a bug in Nvidia's beta drivers that limits the maximum execution size to 65k or something. Try running your code on a Mac (which has no limits on max size) and see if it works. If that's the case you can file a bug against Nvidia.

matrem
01-28-2010, 04:18 AM
It's more of a limitation than a bug, because NVIDIA explicitly specifies this limitation in its OpenCL guide. :cry: