Hi,

I think I have found a memory leak in the NVIDIA driver (tested with version 310.90 on Windows x64) when using OpenCL 1.1.

The memory leak appears when you call clEnqueueReadBuffer in non-blocking (asynchronous) mode, i.e. with blocking_read set to CL_FALSE and an output event requested.

Here is the call in question:

Code :
err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);

and here is a code sample to illustrate the problem:

Code :
#include "CL/cl.h"
 
#include <sstream>
#include <iostream>
#include <vector>
 
// Checks an OpenCL status code. When `err` is not CL_SUCCESS it prints the
// current source line and the error value, pauses via system("PAUSE"), and
// returns -1 from the *enclosing* function (so it is only usable inside a
// function returning int). Wrapped in do { } while(0) so it behaves as a
// single statement.
#define TEST_SUCCESS(err) do{ if(err != CL_SUCCESS){ std::cout << "Line : " << __LINE__ << " - Error : " << err << std::endl; system("PAUSE"); return -1; } } while(0)
 
// Index of the device to use within the flattened list of
// (platform, device) pairs that main() builds across all platforms.
#define DEVICE_INDEX 1
// Number of float elements in each host/device buffer; also used as the
// global work size of the kernel launch.
#define NB_DATA 1024
 
int main(void)
{
  cl_int err;
 
  // Getting devices
  cl_uint nbPlatforms;
 
  err = clGetPlatformIDs(0, NULL, &nbPlatforms);
  TEST_SUCCESS(err);
 
  if (nbPlatforms == 0)
  {
    std::cout << "No platforms" << std::endl;
    return 0;
  }
 
  cl_platform_id* platformId = new cl_platform_id[nbPlatforms];
  err = clGetPlatformIDs(nbPlatforms, platformId, NULL);
  TEST_SUCCESS(err);
 
  typedef std::pair<cl_platform_id, cl_device_id> DevicePairType;
  std::vector<DevicePairType> deviceId;
  for (cl_uint i=0; i<nbPlatforms; i++)
  {
    cl_uint nbDevices;
    err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nbDevices);
    TEST_SUCCESS(err);
 
    if(nbDevices == 0)
    {
      std::cout << "No Device in  platform : " << i << std::endl;
    }
    else
    {
      cl_device_id* did = new cl_device_id[nbDevices];
      err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, nbDevices, did, NULL);
      TEST_SUCCESS(err);
 
      for (cl_uint j = 0; j<nbDevices; j++)
      {
        cl_platform_id pl = platformId[i];
        cl_device_id de = did[j];
 
        DevicePairType tmp(pl, de);
        deviceId.push_back(tmp);
      }
 
      delete[] did;
    }
  }
 
  // Selected device 
  cl_platform_id platform = deviceId[DEVICE_INDEX].first;
  cl_device_id device = deviceId[DEVICE_INDEX].second;
 
  size_t nameSize;
  err = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &nameSize);
  TEST_SUCCESS(err);
 
  char* name = new char[nameSize];
  err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(char)*nameSize, (void*)name, NULL);
  TEST_SUCCESS(err);
 
  std::cout << "Device name : " << name << std::endl;
 
  // Creating context etc...
  cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
  cl_context context = clCreateContext(props, 1, &device, NULL, NULL, &err);
  TEST_SUCCESS(err);
 
  cl_command_queue cq = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
  TEST_SUCCESS(err);
 
  // Creating kernel
  std::ostringstream source;
  source << " __kernel void sum( __global const float* a, __global const float* b, __global float* c)" << std::endl;
  source << "{" << std::endl;
  source << "  unsigned int tid = get_global_id(0);" << std::endl;
  source << "  c[tid] = a[tid] + b[tid];" << std::endl;
  source << "}" << std::endl;
 
  std::string str = source.str();
 
  const char* src = str.c_str();
 
  size_t srcSize = str.size();
 
  std::cout << src << std::endl;
  cl_program program = clCreateProgramWithSource(context, 1, &(src), &srcSize, &err);
  TEST_SUCCESS(err);
 
  err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  if( err != CL_SUCCESS)
  {
    std::string blog;
 
    // Get size of data to get
    size_t logSize;
    err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    TEST_SUCCESS(err);
 
    char* lblog = new char[logSize];
 
    // Get build status
    err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(char)*logSize, (void*)lblog, NULL);
    TEST_SUCCESS(err);
 
    blog = std::string(lblog);
 
    std::cout << "Log : " << std::endl << blog << std::endl;
 
    delete[] lblog;
 
    system("PAUSE");
 
    return -1;
  }
 
 
  cl_kernel zeKernel = clCreateKernel(program, "sum", &err);
  TEST_SUCCESS(err);
 
  // Memory
  float* a = new float[NB_DATA];
  float* b = new float[NB_DATA];
  float* c = new float[NB_DATA];
 
  cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)a, &err);
  TEST_SUCCESS(err);
 
  cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)b, &err);
  TEST_SUCCESS(err);
 
  cl_mem d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NB_DATA*sizeof(float), NULL, &err);
  TEST_SUCCESS(err);
 
  // Running kernel
  size_t lws[1] = {128};
  size_t gws[1] = {NB_DATA};
 
  cl_event evt;
 
  for (unsigned int i=0; i<10000; i++)
  {
    err = clSetKernelArg(zeKernel, 0, sizeof(cl_mem), &d_a);
    TEST_SUCCESS(err);
    err = clSetKernelArg(zeKernel, 1, sizeof(cl_mem), &d_b);
    TEST_SUCCESS(err);
    err = clSetKernelArg(zeKernel, 2, sizeof(cl_mem), &d_c);
    TEST_SUCCESS(err);
 
    err = clEnqueueNDRangeKernel(cq, zeKernel, 1, NULL, gws, lws, 0, NULL, NULL);
    TEST_SUCCESS(err);
 
    err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);
    TEST_SUCCESS(err);
 
    clWaitForEvents(1, &evt);
  }
 
  // Clean
  clReleaseMemObject(d_a);
  clReleaseMemObject(d_b);
  clReleaseMemObject(d_c);
 
  delete[] a;
  delete[] b;
  delete[] c;
 
  clReleaseKernel(zeKernel);
  clReleaseProgram(program);
  clFinish(cq);
  clReleaseCommandQueue(cq);
  clReleaseContext(context);
 
  delete[] platformId;
  delete[] name;
}

When you run this program, the process allocates more and more memory. The problem does not appear with the Intel and AMD (GPU) drivers.

Am I right? And how can I report this to NVIDIA, given that their forum is not working?

Thanks.