PDA

View Full Version : OpenCL: clEnqueueNDRangeKernel Failed: -54



masterlowi
05-10-2010, 08:30 AM
Hi,

i wrote my first OpenCL Program with VS 2008 CLR Forms.
I always get a: "clEnqueueNDRangeKernel Failed: -54" Error and i dont know why.
Can someone take a look at my code?

Its a Visual Studio 2008 Project:

http://free.doublebackslash.net/Studium ... OpenCL.rar (http://free.doublebackslash.net/Studium/openCL/OpenCL.rar)

Click first on the button "Devices suchen + Infos lesen" and then on the button "Context + CommandQueues erstellen"


For all others without Visual Studio, here is the Code: (OpenCL.lib is linked!)



#include <utility>
#define __NO_STD_VECTOR
#define __NO_STD_STRING

#include <CL/cl.h>

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <iterator>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

static char vendor_cpu[65536];
static char vendor_gpu[65536];
static std::size_t size;
static cl_uint num_devices_returned;
static cl_device_id devices[2];
static cl_ulong platform;

static cl_ulong frequ, max_par_units, addr_bits;
static cl_int err = CL_SUCCESS;
static cl_uint num_platforms;
static cl_platform_id * platforms;
static bool cpu = false;
static bool gpu = false;
static cl_kernel kernel;
static cl_context context;
static size_t local;
const unsigned int cnBlockSize= 512;
const unsigned int cnBlocks =3;
static size_t cnDimension = cnBlocks * cnBlockSize;


const char * kernelcode = "__kernel void vectorAdd( \n"
"__global const float * a, \n"
"__global const float * b, \n"
"__global float * c) \n"
"{ \n"
" // Vector element index \n"
" int nIndex = get_global_id(0); \n"
" c[nIndex] = a[nIndex] + b[nIndex]; \n"
"} \n";

private: System::Void bt_getdeviceinfos_Click(System::Object^ sender, System::EventArgs^ e)
{
this->rtb_log->AppendText("##Get Platform ID##\n");
err = clGetPlatformIDs(0, NULL, &num_platforms);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL); // Get device IDs
cl_platform_id platform_id = platforms[0];
platform = (cl_ulong)platform_id;
this->l_num_platform->Text = "Platform ID: " + platform;


this->rtb_log->AppendText("##Get CPU Devices##\n");
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_CPU, 1, &devices[0], &num_devices_returned);
this->l_num_cpu->Text = "Anzahl der CPU-Devices: " + num_devices_returned.ToString();
this->rtb_cpu->AppendText(num_devices_returned.ToString() + " CPU Device gefunden\n");
//cout << "Fehlercode: " << err << endl;
if(err == 0) cpu = true;

this->rtb_log->AppendText("##Get GPU Devices##\n");
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 1, &devices[1], &num_devices_returned);
this->l_num_gpu->Text = "Anzahl der GPU-Devices: " + num_devices_returned.ToString();
this->rtb_gpu->AppendText(num_devices_returned.ToString() + " GPU Device gefunden\n");
if(err == 0) gpu = true;

if(cpu == true)
{
this->rtb_log->AppendText("##Get CPU Device Infos##\n");
clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(frequ), &frequ, &size);
this->rtb_cpu->AppendText("Maximum clock frequency of the device in MHz: " + frequ.ToString() + "\n");
clGetDeviceInfo(devices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_par_units), &max_par_units, &size);
this->rtb_cpu->AppendText("The number of parallel compute cores on the OpenCL device: " + max_par_units.ToString() + "\n");
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(vendor_cpu), vendor_cpu, &size);
this->rtb_cpu->AppendText("Vendor name: " + gcnew String(vendor_cpu) + "\n");
clGetDeviceInfo(devices[0], CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, &size);
this->rtb_cpu->AppendText("CPU ADDRESS BITS: " + addr_bits.ToString() + "\n");
}

if(gpu == true)
{
this->rtb_gpu->AppendText("##Get GPU Device Infos##\n");
clGetDeviceInfo(devices[1], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(frequ), &frequ, &size);
this->rtb_gpu->AppendText("Maximum clock frequency of the device in MHz: " + frequ.ToString() + "\n");
clGetDeviceInfo(devices[1], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_par_units), &max_par_units, &size);
this->rtb_gpu->AppendText("The number of parallel compute cores on the OpenCL device: " + max_par_units.ToString() + "\n");
clGetDeviceInfo(devices[1], CL_DEVICE_NAME, sizeof(vendor_gpu), vendor_gpu, &size);
this->rtb_gpu->AppendText("Vendor name: " + gcnew String(vendor_gpu) + "\n");
clGetDeviceInfo(devices[1], CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, &size);
this->rtb_gpu->AppendText("GPU ADDRESS BITS: " + addr_bits.ToString() + "\n");
}
}
private: System::Void bt_contextcq_Click(System::Object^ sender, System::EventArgs^ e)
{
this->rtb_log->AppendText("##Create Context##\n");

if(cpu == true && gpu == true)
context = clCreateContext(0, 2, devices, NULL, NULL, &err);
if(cpu == true && gpu == false)
context = clCreateContext(0, 1, devices, NULL, NULL, &err);
if(gpu == true && cpu == false)
context = clCreateContext(0, 1, devices, NULL, NULL, &err);

this->rtb_log->AppendText("##Create CommandQueue's##\n");
cl_command_queue queue_cpu, queue_gpu;
if(cpu == true)
queue_cpu = clCreateCommandQueue(context, devices[0], 0, &err);
if(gpu == true)
queue_gpu = clCreateCommandQueue(context, devices[1], 0, &err);

this->rtb_log->AppendText("##Create Program Codes for OpenCL##\n");
cl_program program;
size_t kernelsize = strlen(kernelcode);
program = clCreateProgramWithSource(context, 1, (const char**)&kernelcode, &kernelsize, &err);
err = clBuildProgram(program, num_devices_returned, NULL, NULL, NULL, NULL);
if(err != 0)
{
char log[1024] = "";
err = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL);
MessageBox::Show(gcnew String(log));
}

this->rtb_log->AppendText("##Create Kernel Codes for OpenCL##\n");
kernel = clCreateKernel(program, "vectorAdd", &err);
if(err != 0)
MessageBox::Show("clCreateKernel: " + err.ToString());

float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
float * pC1 = new float[cnDimension];

memset(pC, 0, cnDimension * sizeof(float));
memset(pC1, 0, cnDimension * sizeof(float));

// initialize host memory
int i;
for(i=0; i < cnDimension; i++)
{
pA[i] = pC[i] = pC1[i] = 0;
pB[i] = i;
}

// allocate device memory
cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
hDeviceMemA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
hDeviceMemB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
hDeviceMemC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);

// setup parameter values
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);
if (err != CL_SUCCESS)
{
MessageBox::Show("Error: Failed to set kernel args: " + err.ToString() + "\n");
}

// Get the maximum work-group size for executing the kernel on the device
err = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
MessageBox::Show("Error: clGetKernelWorkGroupInfo Failed: " + err.ToString() + "\n");
}

// execute kernel
err = clEnqueueNDRangeKernel(queue_cpu, kernel, 1, NULL, (size_t*)(&cnDimension), &local, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
MessageBox::Show("Error: clEnqueueNDRangeKernel Failed: " + err.ToString() + "\n");
}

// copy results from device back to host
clEnqueueReadBuffer(queue_cpu, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float),
pC, 0, NULL, NULL);

// wait for command queue
clFinish(queue_cpu);
for(i=0; i < cnDimension; i++)
{
this->rtb_log->AppendText(pC[i].ToString() + "\n");
}

}
private: System::Void Form1_Load(System::Object^ sender, System::EventArgs^ e)
{

}
};
}

masterlowi
05-10-2010, 08:49 AM
ps: i try to run it on the cpu, ATI stream drivers are installed.

i dont have a opencl ready graphics card, so i have to use the cpu.
is it possible that this problem only accours on the cpu?

masterlowi
05-10-2010, 09:31 AM
i found the error:

err = clEnqueueNDRangeKernel(queue_cpu, kernel, 1, NULL, (size_t*)(&cnDimension), &local, 0, NULL, NULL);

i has to be:

err = clEnqueueNDRangeKernel(queue_cpu, kernel, 1, NULL, (size_t*)(&cnDimension), NULL, 0, NULL, NULL);

dont know why, but i works :D