Results 1 to 9 of 9

Thread: Possible Memory leak in nVidia driver

  1. #1
    Junior Member
    Join Date
    Aug 2011
    Posts
    14

    Possible Memory leak in nVidia driver

    Hi,

    I think I found a memory leak in nvidia driver (tested on 310.90 windows X64) with OpenCL 1.1.

    The memory leak appears when you call the function clEnqueueReadBuffer with asynchronous parameters.

    Here is a usage:

    Code :
    err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);

    and here is a code sample to illustrate the problem:

    Code :
    #include "CL/cl.h"
     
    #include <sstream>
    #include <iostream>
    #include <vector>
     
    #define TEST_SUCCESS(err) do{ if(err != CL_SUCCESS){ std::cout << "Line : " << __LINE__ << " - Error : " << err << std::endl; system("PAUSE"); return -1; } } while(0)
     
    #define DEVICE_INDEX 1
    #define NB_DATA 1024
     
    int main(void)
    {
      cl_int err;
     
      // Getting devices
      cl_uint nbPlatforms;
     
      err = clGetPlatformIDs(0, NULL, &nbPlatforms);
      TEST_SUCCESS(err);
     
      if (nbPlatforms == 0)
      {
        std::cout << "No platforms" << std::endl;
        return 0;
      }
     
      cl_platform_id* platformId = new cl_platform_id[nbPlatforms];
      err = clGetPlatformIDs(nbPlatforms, platformId, NULL);
      TEST_SUCCESS(err);
     
      typedef std::pair<cl_platform_id, cl_device_id> DevicePairType;
      std::vector<DevicePairType> deviceId;
      for (cl_uint i=0; i<nbPlatforms; i++)
      {
        cl_uint nbDevices;
        err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nbDevices);
        TEST_SUCCESS(err);
     
        if(nbDevices == 0)
        {
          std::cout << "No Device in  platform : " << i << std::endl;
        }
        else
        {
          cl_device_id* did = new cl_device_id[nbDevices];
          err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, nbDevices, did, NULL);
          TEST_SUCCESS(err);
     
          for (cl_uint j = 0; j<nbDevices; j++)
          {
            cl_platform_id pl = platformId[i];
            cl_device_id de = did[j];
     
            DevicePairType tmp(pl, de);
            deviceId.push_back(tmp);
          }
     
          delete[] did;
        }
      }
     
      // Selected device 
      cl_platform_id platform = deviceId[DEVICE_INDEX].first;
      cl_device_id device = deviceId[DEVICE_INDEX].second;
     
      size_t nameSize;
      err = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &nameSize);
      TEST_SUCCESS(err);
     
      char* name = new char[nameSize];
      err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(char)*nameSize, (void*)name, NULL);
      TEST_SUCCESS(err);
     
      std::cout << "Device name : " << name << std::endl;
     
      // Creating context etc...
      cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
      cl_context context = clCreateContext(props, 1, &device, NULL, NULL, &err);
      TEST_SUCCESS(err);
     
      cl_command_queue cq = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
      TEST_SUCCESS(err);
     
      // Creating kernel
      std::ostringstream source;
      source << " __kernel void sum( __global const float* a, __global const float* b, __global float* c)" << std::endl;
      source << "{" << std::endl;
      source << "  unsigned int tid = get_global_id(0);" << std::endl;
      source << "  c[tid] = a[tid] + b[tid];" << std::endl;
      source << "}" << std::endl;
     
      std::string str = source.str();
     
      const char* src = str.c_str();
     
      size_t srcSize = str.size();
     
      std::cout << src << std::endl;
      cl_program program = clCreateProgramWithSource(context, 1, &(src), &srcSize, &err);
      TEST_SUCCESS(err);
     
      err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
      if( err != CL_SUCCESS)
      {
        std::string blog;
     
        // Get size of data to get
        size_t logSize;
        err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        TEST_SUCCESS(err);
     
        char* lblog = new char[logSize];
     
        // Get build status
        err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(char)*logSize, (void*)lblog, NULL);
        TEST_SUCCESS(err);
     
        blog = std::string(lblog);
     
        std::cout << "Log : " << std::endl << blog << std::endl;
     
        delete[] lblog;
     
        system("PAUSE");
     
        return -1;
      }
     
     
      cl_kernel zeKernel = clCreateKernel(program, "sum", &err);
      TEST_SUCCESS(err);
     
      // Memory
      float* a = new float[NB_DATA];
      float* b = new float[NB_DATA];
      float* c = new float[NB_DATA];
     
      cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)a, &err);
      TEST_SUCCESS(err);
     
      cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)b, &err);
      TEST_SUCCESS(err);
     
      cl_mem d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NB_DATA*sizeof(float), NULL, &err);
      TEST_SUCCESS(err);
     
      // Running kernel
      size_t lws[1] = {128};
      size_t gws[1] = {NB_DATA};
     
      cl_event evt;
     
      for (unsigned int i=0; i<10000; i++)
      {
        err = clSetKernelArg(zeKernel, 0, sizeof(cl_mem), &d_a);
        TEST_SUCCESS(err);
        err = clSetKernelArg(zeKernel, 1, sizeof(cl_mem), &d_b);
        TEST_SUCCESS(err);
        err = clSetKernelArg(zeKernel, 2, sizeof(cl_mem), &d_c);
        TEST_SUCCESS(err);
     
        err = clEnqueueNDRangeKernel(cq, zeKernel, 1, NULL, gws, lws, 0, NULL, NULL);
        TEST_SUCCESS(err);
     
        err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);
        TEST_SUCCESS(err);
     
        clWaitForEvents(1, &evt);
      }
     
      // Clean
      clReleaseMemObject(d_a);
      clReleaseMemObject(d_b);
      clReleaseMemObject(d_c);
     
      delete[] a;
      delete[] b;
      delete[] c;
     
      clReleaseKernel(zeKernel);
      clReleaseProgram(program);
      clFinish(cq);
      clReleaseCommandQueue(cq);
      clReleaseContext(context);
     
      delete[] platformId;
      delete[] name;
    }

    When you run this program, the process allocate more and more memory. The problem doesn't appear with Intel and AMD (GPU) drivers.

    Am I right? And how can I report this to NVidia, given that their forum doesn't work?

    Thanks.

  2. #2
    Junior Member
    Join Date
    Aug 2011
    Posts
    14

    Re: Possible Memory leak in nVidia driver

    I forgot to tell that if you replace the line:

    Code :
    err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);

    by the line

    Code :
    err = clEnqueueReadBuffer(cq, d_c, CL_TRUE, 0, NB_DATA*sizeof(float), c, 0, NULL, NULL);

    The problem no longer appears, even with the nVidia driver.

    Be careful with the device choice, please select the right "DEVICE_INDEX value".

  3. #3
    Junior Member
    Join Date
    Aug 2011
    Posts
    14

    Re: Possible Memory leak in nVidia driver

    Ok, I found a new thing. If I add a clReleaseEvent just after the clWaitForEvents. The memory leak no longer appears.

    Code :
        err = clEnqueueNDRangeKernel(cq, zeKernel, 1, NULL, gws, lws, 0, NULL, NULL);
        TEST_SUCCESS(err);
     
        err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);
        TEST_SUCCESS(err);
     
        err = clWaitForEvents(1, &evt);
        TEST_SUCCESS(err);
     
        err = clReleaseEvent(evt);
        TEST_SUCCESS(err);

    But I don't understand. Why can't we use the same cl_event without releasing it?

  4. #4
    Junior Member
    Join Date
    Aug 2011
    Posts
    14

    Re: Possible Memory leak in nVidia driver

    Ok I think the problem is solved.

    Thanks.

  5. #5
    Senior Member
    Join Date
    Oct 2012
    Posts
    165

    Re: Possible Memory leak in nVidia driver

    Would be great if you can tell us where the problem was.

  6. #6
    Junior Member
    Join Date
    Aug 2011
    Posts
    14

    Re: Possible Memory leak in nVidia driver

    I think nVidia has chosen to implement each event usage without checking whether the event is already allocated.

    So when you call clEnqueueReadBuffer( [..], &evt);

    I believe the driver allocates the event systematically even if it has already been allocated.

    Meanwhile, Intel and AMD check that the event is not NULL before allocating it, and reuse the same pointer if it already exists.

    So maybe it is a problem or maybe it's a choice. I don't know if OpenCL specifications constrain this point.

    I submitted a bug report to nVidia; I'm waiting for their answer.

  7. #7

    Re: Possible Memory leak in nVidia driver

    Coincidentally, I stumbled upon similar "pseudo leak" problems. However, I don't think you can blame NVIDIA on that in this case.

    What you pass on to clEnqueueNDRangeKernel() is not an event object but merely a location. For example
    Code :
    cl_event event = NULL;
     
    clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &size, NULL, 0, NULL, &event);
    printf ("event = %p\n", event);
    clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &size, NULL, 0, NULL, &event);
    printf ("event = %p\n", event);
    will return two distinct pointers. And the standard is also very clear on that: "event Returns an event object that identifies this particular kernel execution instance.". It would be wrong for clEnqueueNDRangeKernel() to release an existing event that is passed for two reasons: 1) semantically, the function is used to enqueue kernels not release events and 2) it would be disastrous if I released an event a second time that has been released already.

  8. #8
    Junior Member
    Join Date
    Aug 2011
    Posts
    14

    Re: Possible Memory leak in nVidia driver

    I agree with you, and this is what NVidia told me:

    Our developer identifies the observed behavior is valid as per OpenCL 1.1 spec, revision 44, section 5.9:
    "The function cl_int clRetainEvent (cl_event event) increments the event reference count. The OpenCL commands that return an event perform an implicit retain."
    Developers are responsible for releasing event objects returned by asynchronous enqueue calls when the events are no longer needed. So adding a call to clReleaseEvent in the loop is the correct solution.
    So does it mean only nvidia correctly implements the standard?

    Thanks.

  9. #9

    Re: Possible Memory leak in nVidia driver

    Quote Originally Posted by Letinono
    So does it mean only nvidia correctly implements the standard?
    I just did a test on an AMD machine and I get different event pointers each time. However, they also re-use the memory (seeing the same memory location) if I release the first event.

Similar Threads

  1. Memory leak problem in nVidia driver
    By Aki in forum OpenCL
    Replies: 4
    Last Post: 01-18-2013, 08:18 PM
  2. Replies: 2
    Last Post: 11-09-2009, 07:00 AM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •