Hi,

I am trying out OpenCL on my NVIDIA GTX 660. It looks great, but I am having an issue with CL_MEM_USE_HOST_PTR buffers. I use this flag on all my buffers: it works for the input buffers, but the output written by my kernel never shows up in host RAM.

I tried calling clEnqueueReadBuffer() after the kernel execution and that works, so what is going on? Does this mean that the buffer actually lives in GPU RAM? If so, why?
Here are my relevant functions.

Code :
// Creates and builds the OpenCL program obtained from LoadSource(file), then
// creates the kernel whose name is that same string.
//
// file : passed to LoadSource() to obtain the program text, and reused as the
//        kernel name given to clCreateKernel().
// size : stored in currentSize.
//
// The argument counter is reset to 0. Every OpenCL status code is written to
// err and followed by CHK().
void OCLInterface::LoadKernel(string file, size_t size)
{
        size_t sourceLength;
        char* sourceText = LoadSource(file.c_str(), &sourceLength);

        // clCreateProgramWithSource expects an array of strings; hand it a
        // one-element array holding the loaded text.
        const char* sources[] = { sourceText };
        program = clCreateProgramWithSource(context, 1, sources, &sourceLength, &err);

        // The text is released before the status check, so it is freed
        // whatever the outcome of the call above.
        free(sourceText);
        CHK();

        // Build for all devices associated with the program, default options.
        err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
        CHK();

        kernel = clCreateKernel(program, file.c_str(), &err);
        CHK();

        currentArgCount = 0;
        currentSize = size;
}
 
void OCLInterface::AddParam(float* data, bool bInput)
{
        cl_mem buffer = clCreateBuffer(
                context,
                (bInput ? CL_MEM_READ_ONLY:CL_MEM_WRITE_ONLY) | CL_MEM_USE_HOST_PTR,
                VSIZE * sizeof(float),
                data,
                &err
        );
        CHK();
        err = clSetKernelArg(kernel, currentArgCount, sizeof(cl_mem), (void*)&buffer);
        CHK();
        buffers.push_back(buffer);
        currentArgCount++;
}
 
void OCLInterface::Exec(int size)
{
        size_t localsize = size;
        size_t globalsize = ceil(VSIZE / (float)localsize) * localsize;
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalsize, &localsize, 0, NULL, NULL);
        CHK();
        err = clFinish(queue);
        CHK();
}
 
// My code
 
// Host-side storage: A and B are the inputs, C receives the result.
const size_t vectorBytes = VSIZE * sizeof(float);
float* A = static_cast<float*>(malloc(vectorBytes));
float* B = static_cast<float*>(malloc(vectorBytes));
float* C = static_cast<float*>(malloc(vectorBytes));
// snip : array init (a loop)

// Load the "add" kernel, bind A and B as inputs and C as the output,
// then launch it with a work-group size of 128.
openCL.LoadKernel("add", VSIZE);
openCL.AddParam(A, true);
openCL.AddParam(B, true);
openCL.AddParam(C, false);
openCL.Exec(128);
At this point C[] still holds its initialization values (0 in my case, but that depends on the array init code).
And now here is my kernel, which should perform a simple vector addition.

Code :
// Element-wise vector addition: each work-item adds one pair of elements.
//
// input1, input2 : source arrays in global memory (read only).
// output         : destination array in global memory.
//
// There is no bounds check: the work-item's global id in dimension 0 is used
// directly as the index, so all three arrays must be at least as long as the
// global work size.
__kernel void add(
	__global const float* input1,
	__global const float* input2,
	__global float* output)
{
    const unsigned int gid = get_global_id(0);
    const float lhs = input1[gid];
    const float rhs = input2[gid];
    output[gid] = lhs + rhs;
}

Thanks !