Results 1 to 2 of 2

Thread: CPU Usage 200%

  1. #1
    Junior Member
    Join Date
    Aug 2011
    Posts
    5

    CPU Usage 200%

    I am using an AMD 3 core system, with 2x GTX260 cards (but I'm only processing on the second one right now). I am using Archlinux with the nvidia driver 280.13-1 (http://www.archlinux.org/packages/extra/x86_64/nvidia/).

    The process I am working on is pretty complex, but I'm just wondering whether it is reasonable for two full CPUs to be occupied while event.wait() is occurring. I'm also a little concerned because on a single thread with the CPU the process took about 44 hours. I had hoped that having 128 threads (because I have 128 stream processors) running simultaneously would give me a 100x speedup, but that doesn't seem to be the case.

    I'll throw my kernel up in case anyone is interested. I'm using a #define OPENCL_KERNEL in a header file to use the exact same code for CPU and GPU processing. I've tested the kernel in small cases and it definitely gives the correct (same as CPU) output in small tests.
    http://paste.pocoo.org/show/455434/

  2. #2
    Junior Member
    Join Date
    Aug 2011
    Posts
    5

    Re: CPU Usage 200%

    Sorry for not including my enqueue code. I think the problem is that I was using cl::NDRange(1) for the workgroup size. Changing that might fix the problem. I guess it was only running 1 at a time?

    Code :
    /* parseData
     * Zeroes the regions*regions entries of ostats.stats, builds the table that
     * maps an index in the correlation table to a voxel label, and then runs the
     * pairwise helper (mutual information or correlation) over the voxels.
     *
     * ostats     - output statistics; overwritten
     * timeseries - input data; timeseries.data[ii*timeseries.timepoints] is read
     *              as the label of voxel ii
     * mutual     - true selects parseDataMIHelp, false selects parseDataCorrHelp
     *              (same names are used for the CPU helpers and the OpenCL kernels)
     * srcfile    - path of the OpenCL source file; when empty the work is done on
     *              the CPU by calling the helper for every pair (ii, jj >= ii)
     *
     * Returns 0 on success, otherwise a negative code:
     *   -1  context creation failed        -2  program build failed
     *   -3  kernel creation failed         -4  timeseries buffer creation failed
     *   -5  label buffer creation failed   -6  output buffer creation failed
     *   -7  setArg(0) failed
     *   -8  setArg(1) failed, or reading the output buffer back failed
     *   -9  setArg(2) failed, or enqueueing the kernel failed
     *   -10 command queue creation failed
     *   -11 no OpenCL platform found
     *   -12 the context has no device at index DEVICE
     *   -13 more distinct label runs in timeseries than ostats.regions
     * NOTE(review): -8 and -9 are each shared by two failure sites; they are kept
     * as-is so existing callers see the same values. */
    int parseData(StatsT& ostats, TimeseriesT& timeseries, bool mutual, string srcfile = "")
    {
        //zero out the output statistics
        for(int ii = 0 ; ii < ostats.regions*ostats.regions ; ii++) {
            ostats.stats[ii].A                       = 0;
            ostats.stats[ii].B                       = 0;
            ostats.stats[ii].sem                     = 0;
            ostats.stats[ii].count                   = 0;
            ostats.stats[ii].peak_corr               = 0;
            ostats.stats[ii].delay_of_peak_corr      = 0;
            ostats.stats[ii].avg_peak_corr           = 0;
            ostats.stats[ii].avg_delay_of_peak_corr  = 0;
        }

        //calculate conversion from index in correlation table to label
        //a new entry is recorded every time the label changes from one voxel to the next
        vector<int> index_to_label(ostats.regions);
        {
            int labelnum = 0;
            int curr_label = -1;
            for(int ii = 0 ; ii < timeseries.points ; ii++) {
                if(curr_label != timeseries.data[ii*timeseries.timepoints]) {
                    //refuse to write past the end of the table
                    if(labelnum >= static_cast<int>(index_to_label.size())) {
                        cerr << "Error: more labels than regions" << endl;
                        return -13;
                    }
                    curr_label = timeseries.data[ii*timeseries.timepoints];
                    index_to_label[labelnum] = curr_label;
                    labelnum++;
                }
            }
        }

        //index (within the context's device list) of the device that runs the kernel
        const int DEVICE = 1;
        //for tracking time
        time_t start,end;
        double dif;
        time (&start);

        //there are several indexes here:
        //ii,jj = location in list of voxels
        //indexA, indexB = location in list of correlations
        //labelA, labelB = voxel labels
        if(srcfile.size()) {
            //read the whole kernel source file into a string
            std::fstream fin(srcfile.c_str(), fstream::in);
            std::string src((std::istreambuf_iterator<char>(fin)), std::istreambuf_iterator<char>());
            fin.close();
            cout << "Source Size:" << src.size() << endl;

            //byte sizes of the host structures handed to OpenCL
            size_t ostatSize = sizeof(StatsT)+sizeof(CorrelationT)*
                        timeseries.regions*timeseries.regions;
            cout << setw(10) << sizeof(TimeseriesT) << setw(10) << sizeof(float) << setw(10) << timeseries.points
                        << setw(10) << timeseries.timepoints+1 << endl;
            size_t tsSize = sizeof(TimeseriesT)+sizeof(float)*timeseries.points*
                        (timeseries.timepoints+1);

            //global work size, computed in size_t so points*points cannot overflow int
            const size_t npoints = static_cast<size_t>(timeseries.points);
            const size_t npairs = (npoints*npoints+npoints)/2;

            cl_int error;
            // Get list of platforms (things that can execute OpenCL on this host), get a "context" on the first executor.
            std::vector<cl::Platform> platformList;
            cl::Platform::get(&platformList);
            if(platformList.empty()) {
                cerr << "Error: no OpenCL platform found" << endl;
                return -11;
            }
            cl_context_properties cprops[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0};
            cl::Context context( CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &error);
            if(error) {
                cerr << "Error getting context: " << endl;
                printError(error);
                return -1;
            }

            // Give the OpenCL program read from srcfile to OpenCL.
            cl::Program::Sources source(1, std::make_pair(src.c_str(), src.length()+1));

            // Get devices used in this "context"
            std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

            // List the devices in the context
            cout << "Available Devices: " << endl;
            string name, vendor, profile, version, driver;
            for(unsigned int i = 0 ; i < devices.size() ; i++) {
                devices[i].getInfo(CL_DEVICE_NAME, &name);
                devices[i].getInfo(CL_DEVICE_VENDOR, &vendor);
                devices[i].getInfo(CL_DEVICE_PROFILE, &profile);
                devices[i].getInfo(CL_DEVICE_VERSION, &version);
                devices[i].getInfo(CL_DRIVER_VERSION, &driver);
                cout << name << ", " << vendor << ", " << profile << ", " 
                            << version << ", " << driver << endl;
            }

            // Every later use indexes devices[DEVICE]; make sure it exists
            if(static_cast<size_t>(DEVICE) >= devices.size()) {
                cerr << "Error: requested device index " << DEVICE << " but only "
                            << devices.size() << " device(s) available" << endl;
                return -12;
            }

            // Compile program against the devices; the build log is printed
            // whether or not the build succeeded
            cl::Program program(context, source);
            error = program.build(devices,"-D OPENCL_KERNEL");
            string buildlog;
            program.getBuildInfo(devices[DEVICE], CL_PROGRAM_BUILD_LOG, &buildlog);
            cerr << buildlog<< endl;
            if(error) {
                cerr << "Error building program" << endl;
                printError(error);
                return -2;
            }

            // create a kernel object for the kernel selected by 'mutual'
            string kernelname;
            if(mutual)
                kernelname = "parseDataMIHelp";
            else
                kernelname = "parseDataCorrHelp";

            cl::Kernel kernel(program, kernelname.c_str(), &error);
            if(error) {
                cerr << "Error creating kernel" << endl;
                printError(error);
                return -3;
            }

            cout << tsSize << endl;
            // Allocate Input Buffer (timeseries), backed by the host object
            cl::Buffer timeseriesCL(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
                        tsSize, &timeseries, &error);
            if(error) {
                cerr << "Error creating ts buffer: " << endl;
                printError(error);
                return -4;
            }

            cout << endl;
            // Allocate Input Buffer (index -> label table), backed by the vector's storage
            cl::Buffer index_to_labelCL( context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
                        index_to_label.size()*sizeof(int), index_to_label.data(), &error);
            if(error) {
                cerr << "Error creating label buffer: " << endl;
                printError(error);
                return -5;
            }

            cout << ostatSize << endl;
            //Allocate output Buffer, backed by the host object
            cl::Buffer ostatsCL(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 
                        ostatSize, &ostats, &error);
            if(error) {
                cerr << "Error creating output buffer: " << endl;
                printError(error);
                return -6;
            }

            // Kernel arguments, in the same order as the CPU helper's first three parameters
            if((error = kernel.setArg(0, ostatsCL))) {
                cerr << "Error Setting Arg 0: " << endl;
                printError(error);
                return -7;
            }

            if((error = kernel.setArg(1, index_to_labelCL))) {
                cerr << "Error Setting Arg 1: " << endl;
                printError(error);
                return -8;
            }

            if((error = kernel.setArg(2, timeseriesCL))) {
                cerr << "Error Setting Arg 2: " << endl;
                printError(error);
                return -9;
            }

            // Create the command queue on the selected device
            cl::CommandQueue queue(context, devices[DEVICE], 0, &error);
            if(error) {
                cerr << "Error queuing kernel: " << endl;
                printError(error);
                //was "return -0", which reported this failure as success
                return -10;
            }

            cl::Event event;
            ///HERE TODO
            cout << npairs << endl;
            cout << timeseries.regions*timeseries.regions << endl;

            devices[DEVICE].getInfo(CL_DEVICE_NAME, &name);
            devices[DEVICE].getInfo(CL_DEVICE_VENDOR, &vendor);
            devices[DEVICE].getInfo(CL_DEVICE_VERSION, &version);

            // Global size is the number of (ii, jj >= ii) pairs; the local size
            // is left to the implementation (cl::NullRange).
            // NOTE(review): presumably one work-item per pair - the kernel source
            // is not visible here, confirm against it.
            cout << "Enqueue Kernel: " << npairs 
                        << " On Device" << name << ", " << vendor << ", " << version << endl;
            error = queue.enqueueNDRangeKernel(kernel, cl::NullRange, 
                        cl::NDRange(npairs),
                        cl::NullRange, NULL, &event);
            if(error) {
                cerr << "Error Queuing NDRange kernel: " << endl;
                printError(error);
                return -9;
            }

            // Use the event object above to block until processing has completed
            event.wait();

            // Blocking read of the results back into ostats
            error = queue.enqueueReadBuffer(ostatsCL, CL_TRUE, 0, ostatSize, &ostats);
            if(error) {
                cerr << "Error Queuing Read Buffer: " << endl;
                printError(error);
                return -8;
            }
            ostats.regions = timeseries.regions;
        } else {
            for(int ii = 0 ; ii < timeseries.points ; ii++) {
                time (&end);
                dif = difftime (end,start);
                //total_time  = total_distance
                //-----------   -----------------
                //time_elapse = distance_traveled
                //
                //so the value printed is time_elapse*total_distance/distance_traveled;
                //nothing has been travelled at ii == 0, so skip the estimate
                //there instead of dividing by zero
                if(ii > 0) {
                    std::cout << "Remaining: " << dif*timeseries.points/ii << std::endl;
                }

                for(int jj = ii ; jj < timeseries.points ; jj++) {
                    if(mutual) {
                        parseDataMIHelp(&ostats, index_to_label.data(), &timeseries, ii, jj);
                    } else {
                        parseDataCorrHelp(&ostats, index_to_label.data(), &timeseries, ii, jj);
                    }
                }
            }
        }
        return 0;
    }

Similar Threads

  1. when run on cpu or graphics card of cpu
    By prince in forum OpenCL
    Replies: 2
    Last Post: 01-20-2013, 07:35 AM
  2. CPU Usage
    By johndoe in forum OpenVG and VGU
    Replies: 2
    Last Post: 01-19-2010, 08:20 AM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •