Results 1 to 2 of 2

Thread: [OS X, AMD Radeon] Kernel execution freezes system

  1. #1
    Junior Member
    Join Date
    Jan 2012
    Posts
    2

    [OS X, AMD Radeon] Kernel execution freezes system

    Hello,

    I'm implementing a fairly simple N^2 potential integration algorithm with OpenCL. I'm running OS X 10.7 on a 2011 iMac with an AMD Radeon HD 6770M.

    Launching the GPU threads occasionally freezes the entire OS GUI. When the system is locked up, I can SSH into it from a remote machine, however attempting to "kill -9" the process has no effect and a hard reboot is the only way to restore it. Other times, it does not freeze the entire system, but gets to clEnqueueReadBuffer and hangs. Again, kill has no effect. And sometimes it finishes with no problem.

    Running the kernel on the CPU by switching to CL_DEVICE_TYPE_CPU in clGetDeviceIDs works fine. Just wondering if maybe I'm doing something obviously wrong (I know my kernel code is extremely inefficient and I'm calling clFinish way too much on the host). Here's the host code:


    Code :
    //
    //  main.cpp
    //
     
    #include <bfstream.h>
    #include <OpenCL/opencl.h>
     
    #include <cassert>
    #include <cstdio>
    #include <iostream>
    #include <limits>
    #include <vector>
     
    cl_context g_context;
    cl_kernel g_integrand_gpu_kernel;
    cl_command_queue g_queue;
     
    void pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data);
    void initialize();
    void run();
    void destroy();
     
    // ---------------------------------------------------------
     
    void pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data)
    {
       fprintf(stderr, "%s\n", errinfo);
    }
     
    // ---------------------------------------------------------
     
    void initialize()
    {
       cl_int error = 0;
     
       // Platform
       cl_platform_id platform;
       error = clGetPlatformIDs(1, &platform, NULL );
     
       if (error != CL_SUCCESS) 
       {
          std::cout << "Error getting platform id: " << error << std::endl;
          exit(error);
       }
     
       // Device
       cl_device_id device;
       error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
     
       if (error != CL_SUCCESS) 
       {
          std::cout << "Error getting device ids: " << error << std::endl;
          exit(error);
       }
     
       // Context
       g_context = clCreateContext(0, 1, &device, pfn_notify, NULL, &error);
       if (error != CL_SUCCESS) 
       {
          std::cout << "Error creating context: " << error << std::endl;
          exit(error);
       }
     
       // Command-queue
       g_queue = clCreateCommandQueue(g_context, device, 0, &error);
       if (error != CL_SUCCESS) 
       {
          std::cout << "Error creating command queue: " << error << std::endl;
          exit(error);
       }
     
       std::cout << "Creating program" << std::endl;
     
       // Creates the program
    #define MAX_SOURCE_SIZE 10000
     
       FILE *fp;
       char *source_str = new char[MAX_SOURCE_SIZE];
       size_t source_size;
     
       fp = fopen("vortexkernel.cl", "r");
       if (!fp) 
       {
          fprintf(stderr, "Failed to load kernel.\n");
          exit(1);
       }
       source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
       fclose( fp );
     
       cl_program program = clCreateProgramWithSource(g_context, 1, (const char **)&source_str, (const size_t *)&source_size, &error);
       assert(error == CL_SUCCESS);
     
       // Builds the program
       error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
     
       std::cout << "finished building" << std::endl;
     
       delete[] source_str;
     
       if ( error != CL_SUCCESS )
       {
          // Shows the log
          char* build_log;
          size_t log_size;
          // First call to know the proper size
          clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
          build_log = new char[log_size+1];
          // Second call to get the log
          clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
          build_log[log_size] = '\0';
          std::cout << "build log: \n" << build_log << std::endl;
          delete[] build_log;
       }
     
       // Assert build success
       assert(error == CL_SUCCESS);
     
       // Extracting the kernel
       g_integrand_gpu_kernel = clCreateKernel(program, "compute_integrand_gpu", &error);
       assert(error == CL_SUCCESS);   
     
       std::cout << "finished initializing" << std::endl;
     
    }
     
    // ---------------------------------------------------------
     
    void run()
    {
     
       //
       // Read saved data
       // 
     
       bifstream save_data( "saved.bin" );
       save_data.read_endianity();
     
       int num_eval_points;
       int num_kernel_centres;
     
       save_data >> num_eval_points;
       save_data >> num_kernel_centres;   
     
       float* xs = new float[3*num_eval_points];
       float* centres = new float[3*num_kernel_centres];
       float* triangle_vorticities_f = new float[3*num_kernel_centres];
     
       std::cout << "num_eval_points: " << num_eval_points << std::endl;
       std::cout << "num_kernel_centres: " << num_kernel_centres << std::endl;
     
       for ( int i = 0; i < 3*num_eval_points; ++i ) 
       { 
          save_data >> xs[i]; 
          assert( xs[i] == xs[i] );
          assert( xs[i] != std::numeric_limits<float>::infinity() );
       }
     
       for ( int i = 0; i < 3*num_kernel_centres; ++i ) 
       { 
          save_data >> centres[i]; 
          assert( centres[i] == centres[i] );
          assert( centres[i] != std::numeric_limits<float>::infinity() );
       }
     
       for ( int i = 0; i < 3*num_kernel_centres; ++i ) 
       { 
          save_data >> triangle_vorticities_f[i]; 
          assert( triangle_vorticities_f[i] == triangle_vorticities_f[i] );
          assert( triangle_vorticities_f[i] != std::numeric_limits<float>::infinity() );
       }
     
       save_data.close();
       assert(save_data.good());
     
       std::cout << "data loaded" << std::endl;
     
       float *buffer = new float[ 3*num_eval_points ];
     
       size_t buffer_mem_size = num_eval_points * 3 * sizeof(float);
       size_t eval_mem_size = num_eval_points * 3 * sizeof(float);
       size_t kernel_mem_size = num_kernel_centres * 3 * sizeof(float);
     
       // Create buffers on the GPU
     
       cl_int error;
       cl_mem src_xs_d = clCreateBuffer( g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, eval_mem_size, static_cast<void*>(xs), &error);
       assert(error == CL_SUCCESS);
       cl_mem src_ks_d = clCreateBuffer( g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kernel_mem_size, static_cast<void*>(centres), &error);
       assert(error == CL_SUCCESS);
       cl_mem src_vs_d = clCreateBuffer( g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kernel_mem_size, static_cast<void*>(triangle_vorticities_f), &error);
       assert(error == CL_SUCCESS);
       cl_mem src_buffer_d = clCreateBuffer( g_context, CL_MEM_READ_WRITE, buffer_mem_size, NULL, &error);
       assert(error == CL_SUCCESS);
     
       // Enqueue parameters
     
       error = clSetKernelArg(g_integrand_gpu_kernel, 0, sizeof(cl_mem), &src_xs_d );
       assert(error == CL_SUCCESS);
       error |= clSetKernelArg(g_integrand_gpu_kernel, 1, sizeof(int), &num_eval_points );
       assert(error == CL_SUCCESS);
       error |= clSetKernelArg(g_integrand_gpu_kernel, 2, sizeof(cl_mem), &src_ks_d );
       assert(error == CL_SUCCESS);
       error |= clSetKernelArg(g_integrand_gpu_kernel, 3, sizeof(int), &num_kernel_centres );
       assert(error == CL_SUCCESS);
       error |= clSetKernelArg(g_integrand_gpu_kernel, 4, sizeof(cl_mem), &src_vs_d );
       assert(error == CL_SUCCESS);
       error |= clSetKernelArg(g_integrand_gpu_kernel, 5, sizeof(cl_mem), &src_buffer_d );
       assert(error == CL_SUCCESS);
     
       std::cout << "paramaters queued" << std::endl;
     
       std::cout.flush();
     
       // Launching kernel
     
       const size_t global_ws = num_eval_points;
     
       error = clFlush(g_queue);
       assert(error == CL_SUCCESS);
       error = clFinish(g_queue);
       assert(error == CL_SUCCESS);
     
       error = clEnqueueNDRangeKernel(g_queue, g_integrand_gpu_kernel, 1, NULL, &global_ws, NULL, 0, NULL, NULL);
       assert(error == CL_SUCCESS);
     
       std::cout << "kernel enqueued" << std::endl;
     
       error = clFlush(g_queue);
       assert(error == CL_SUCCESS);
     
       error = clFinish(g_queue);
       assert(error == CL_SUCCESS);
     
       std::cout << "queue finished" << std::endl;
     
       // Wait for threads to finish, then read back buffer
       error = clEnqueueReadBuffer(g_queue, src_buffer_d, CL_TRUE, 0, buffer_mem_size, (void*)buffer, 0, NULL, NULL );
       assert(error == CL_SUCCESS);
     
       std::cout << "read buffer enqueued" << std::endl;
     
       error = clFinish(g_queue);
       assert(error == CL_SUCCESS);
     
       std::cout << "read buffer done" << std::endl;
     
       for ( int i = 0; i < 3*num_eval_points; ++i ) 
       { 
          assert( buffer[i] == buffer[i] );
          assert( buffer[i] != std::numeric_limits<float>::infinity() );
       }
     
       error = clReleaseMemObject(src_xs_d);
       error |= clReleaseMemObject(src_ks_d);
       error |= clReleaseMemObject(src_vs_d);
       error |= clReleaseMemObject(src_buffer_d);
       assert(error == CL_SUCCESS);
     
       delete[] buffer;
       delete[] triangle_vorticities_f;
       delete[] centres;
       delete[] xs;
     
    }
     
    // ---------------------------------------------------------
     
    void destroy()
    {
       clReleaseContext( g_context );
       clReleaseKernel( g_integrand_gpu_kernel );
       clReleaseCommandQueue(g_queue);
     
    }
     
    // ---------------------------------------------------------
     
    int main (int argc, const char * argv[])
    {
       std::cout << "initializing" << std::endl;
       initialize();
       std::cout << "running" << std::endl;
       run();
       std::cout << "destroying" << std::endl;
       destroy();
       std::cout << "done" << std::endl;
    }

    and the kernel:

    Code :
    //
    //  vortexkernel.cl
    //
     
    #define MY_PI  3.14159265358979323846264338327950288f   /* pi */
     
     
    __kernel void compute_integrand_gpu (__global const float* eval_points, 
                                         int num_eval_points,
                                         __global const float* kernel_centres, 
                                         int num_kernel_centres,
                                         __global const float* triangle_vorticities,
                                         __global float* out_buffer )
    {
     
       const int i = get_global_id(0);
     
       float d[3];
       float grad_kernel_eval[3];   
       float result[3];
     
       if ( i < num_eval_points )
       {
          out_buffer[3*i+0] = 0.0f;
          out_buffer[3*i+1] = 0.0f;
          out_buffer[3*i+2] = 0.0f;
     
          for ( int j = 0; j < num_kernel_centres; ++j )
          {
             d[0] = eval_points[3*i+0] - kernel_centres[3*j+0];
             d[1] = eval_points[3*i+1] - kernel_centres[3*j+1];
             d[2] = eval_points[3*i+2] - kernel_centres[3*j+2];
     
             float rsqr = d[0]*d[0] + d[1]*d[1] + d[2]*d[2];
             float r = sqrt(rsqr);
             float coeff = -1.0/(4*MY_PI*r*r*r);
     
             grad_kernel_eval[0] = -coeff * d[0] / r;
             grad_kernel_eval[1] = -coeff * d[1] / r;
             grad_kernel_eval[2] = -coeff * d[2] / r;
     
             result[0] = grad_kernel_eval[1]*triangle_vorticities[3*j+2]-grad_kernel_eval[2]*triangle_vorticities[3*j+1];
             result[1] = grad_kernel_eval[2]*triangle_vorticities[3*j+0]-grad_kernel_eval[0]*triangle_vorticities[3*j+2];
             result[2] = grad_kernel_eval[0]*triangle_vorticities[3*j+1]-grad_kernel_eval[1]*triangle_vorticities[3*j+0];
     
             out_buffer[3*i+0] += result[0];
             out_buffer[3*i+1] += result[1];
             out_buffer[3*i+2] += result[2];
          }      
       }
     
    }


    I can provide the raw data and additional supporting source code as well if anyone feels like trying to reproduce the problem.


    T.

  2. #2
    Junior Member
    Join Date
    Jan 2012
    Posts
    2

    Re: [OS X, AMD Radeon] Kernel execution freezes system

    Okay, I noticed that this was failing only when chewing on a fairly large number of inputs (~30k points). So I'm now breaking up the input buffers into chunks and looping over them, launching a smaller number of threads each time. It seems to work if I can keep the time between clEnqueueNDRangeKernel and clEnqueueReadBuffer down below 5 seconds each time.

    I wonder if the GUI was timing out while waiting for my threads to finish or something. Or maybe there's some kind of watchdog process on OS X, similar to the one in Windows, and it's having trouble killing my long-running tasks? Maybe this is one for the Apple engineers.


    T.

Similar Threads

  1. System freeze on kernel execution
    By joshuafolken in forum OpenCL
    Replies: 3
    Last Post: 02-04-2013, 08:25 AM
  2. Screen Freezes and black screen during game execution on Win
    By Leena in forum OpenGL ES 2X - for programmable 3D graphics pipelines
    Replies: 0
    Last Post: 07-29-2009, 11:54 AM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •