Results 1 to 10 of 10

Thread: Problem with using local memory and global memory

  1. #1

    Problem with using local memory and global memory

    Hi all,

    I am working on using OpenCL for convolution of images.
    I cant able to know how to initialize the local memories and global memories.
    Can u help me??????

    //This is the kernel I am using for convolution
    __kernel void Convolution(const __global float * pInput,
    __constant float * pFilter,
    __global float * pOutput,
    const int nInWidth,
    const int nFilterWidth)
    {
    const int nWidth = get_global_size(0);

    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);

    const int xInTopLeft = xOut;
    const int yInTopLeft = yOut;

    float sum = 0;
    for (int r = 0; r < nFilterWidth; r++)
    {
    const int idxFtmp = r * nFilterWidth;

    const int yIn = yInTopLeft + r;
    const int idxIntmp = yIn * nInWidth + xInTopLeft;

    for (int c = 0; c < nFilterWidth; c++)
    {
    const int idxF = idxFtmp + c;
    const int idxIn = idxIntmp + c;
    sum += pFilter[idxF]*pInput[idxIn];
    }
    }
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum;
    }



    //This is my main program

    // I am getting stuck in the red marked place of the code...

    int main(int argc, char **argv)
    {
    printf("The program execution has started and entered in to MAIN");
    cvWaitKey(0);

    IplImage * img1 = NULL;
    img1 = cvLoadImage("road.jpg", CV_LOAD_IMAGE_COLOR );
    cvNamedWindow( "RGB image", CV_WINDOW_AUTOSIZE );
    cvShowImage( "RGB image", img1 ); // display image

    IplImage * img = NULL;

    img = cvCreateImage( cvGetSize(img1),img1->depth,1 );

    //Converting it to gray scale
    cvCvtColor( img1, img, CV_RGB2GRAY );

    cvNamedWindow( "Gray image", CV_WINDOW_AUTOSIZE );
    cvShowImage( "Gray image", img ); // display image

    cvWaitKey(0);
    //IplImage *img =(IplImage *)img2;
    iWidth = img->width;
    iHeight = img->height;
    iNumElements = iWidth * iHeight;
    //cvDestroyWindow( "image" );
    //cvReleaseImage( &img ); // release memory

    float filter[] = {0,1,0,1,-4,1,0,1,0};

    //float b[] = {1,2,3,4,5,6,7,8,9,10};
    //float c[10],d[10];

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

    // start logs
    shrSetLogFileName ("oclConvolution.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);

    // set and log Global and Local work size dimensions
    //szLocalWorkSize = (sizeof(cl_float4)* 1);
    //szGlobalWorkSize = shrRoundUp((cl_float)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
    //shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
    // szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

    szLocalWorkSize[0] = iHeight;
    szLocalWorkSize[1] = iWidth;
    szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], iFilterWidth);
    szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], iWidth * iHeight);


    // Allocate and initialize host arrays
    shrLog( "Allocate and Init Host Mem...\n");
    //srcA = malloc(sizeof(cl_float4) * szGlobalWorkSize);
    //srcB = malloc(sizeof(cl_float4) * szGlobalWorkSize);
    GPUoutput = (float *)malloc(iWidth * iHeight * sizeof(cl_float4));
    CPUoutput = (float *)malloc(iWidth * iHeight * sizeof(cl_float4));

    srcA = (cl_float *)img;
    srcB = filter;

    // Get the NVIDIA platform
    shrLog("Get the NVIDIA platform...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get a GPU device
    shrLog("Getting the GPU device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create the context
    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (SrcA, SrcB and GPUoutput in Device GMEM)...\n");
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * iWidth * iHeight, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * iFilterWidth * iFilterWidth, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * iWidth * iHeight, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program with 'mad' Optimization option
    #ifdef MAC
    char* flags = "-cl-fast-relaxed-math -DMAC";
    #else
    char* flags = "-cl-fast-relaxed-math";
    #endif
    shrLog("clBuildProgram...\n");
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
    // write out standard error, Build Log and PTX, then cleanup and exit
    shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
    oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
    oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclConvolution.ptx");
    Cleanup(EXIT_FAILURE);
    }

    // Create the kernel
    shrLog("clCreateKernel (Convolution)...\n");
    ckKernel = clCreateKernel(cpProgram, "Convolution", &ciErrNum);

    // Set the Argument values
    shrLog("clSetKernelArg 0 - 4...\n\n");
    ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iWidth);
    ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_int), (void*)&iFilterWidth);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // ERROR Check

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * iWidth * iHeight, srcA, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * iFilterWidth * iFilterWidth, srcB, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Launch kernel
    shrLog("clEnqueueNDRangeKernel (Convolution)...\n");
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // ERROR Check

    // Read back results and check accumulated errors
    shrLog("clEnqueueReadBuffer (GPU Output)...\n\n");
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * iWidth * iHeight, GPUoutput, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Compute and compare results for CPU-host and report errors and pass/fail
    shrLog("Comparing against Host/C++ computation...\n\n");
    ConvolutionHost ((const float*)srcA, (float*)srcB, (float*)CPUoutput);
    shrBOOL bMatch = shrComparefet((const float*)CPUoutput, (const float*)GPUoutput, (unsigned int)iNumElements, 0.0f, 0);
    shrLog("%s\n\n", (bMatch == shrTRUE) ? "PASSED" : "FAILED");


    IplImage *GPUresultImage = (IplImage *)GPUoutput;
    IplImage *CPUresultImage = (IplImage *)CPUoutput;
    cvNamedWindow( "image", CV_WINDOW_AUTOSIZE );


    // Diaplaying the result images from GPU and CPU
    cvShowImage( "GPU Output", GPUresultImage );
    cvShowImage( "CPU Output", CPUresultImage );

    cvWaitKey(0);
    cvDestroyWindow( "image" );
    cvDestroyWindow( "GPU Output" );
    cvDestroyWindow( "CPU Output" );

    //Releasing the image memories
    cvReleaseImage( &img );
    cvReleaseImage( &GPUresultImage);
    cvReleaseImage( &CPUresultImage);
    // Diaplaying the result images from GPU and CPU


    } // End of Main


    Can any one give me some suggestions????

    Thanks in advance

  2. #2
    Senior Member
    Join Date
    May 2010
    Location
    Toronto, Canada
    Posts
    845

    Re: Problem with using local memory and global memory

    There are two very different things. On one hand, there's local and global memory. On the other hand there's the arguments passed to clEnqueueNDRangeKernel. Your problem has to do with clEnqueueNDRangeKernel, not with local or global memory.

    This is what you are doing today:

    Code :
    szLocalWorkSize[0] = iHeight;
    szLocalWorkSize[1] = iWidth;
    szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], iFilterWidth);
    szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], iWidth * iHeight);
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

    This is probably what you are trying to do:

    Code :
    szGlobalWorkSize[0] = iWidth;
    szGlobalWorkSize[1] = iHeight;
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, NULL, 0, NULL, NULL);

    In other words, you only need to select an appropriate global size, which controls how many work-items (samples) will be run. Notice that I'm passing NULL as the local-size argument. This tells OpenCL to select the best local size automatically.
    Disclaimer: Employee of Qualcomm Canada. Any opinions expressed here are personal and do not necessarily reflect the views of my employer. LinkedIn profile.

  3. #3
    Senior Member
    Join Date
    May 2010
    Location
    Toronto, Canada
    Posts
    845

    Re: Problem with using local memory and global memory

    There some other things that don't look right. For example, you malloc iWidth * iHeight * sizeof(cl_float4) bytes but then when you create the OpenCL buffers you only allocate sizeof(cl_float) * iWidth * iHeight.
    Disclaimer: Employee of Qualcomm Canada. Any opinions expressed here are personal and do not necessarily reflect the views of my employer. LinkedIn profile.

  4. #4

    Re: Problem with using local memory and global memory

    Thanks for your reply....
    I will try the changes

  5. #5

    Re: Problem with using local memory and global memory

    I am still facing the problem with the code......

    I think I have problem in clEnqueueNDRangeKernel which is used here to launch the kernel

    Can any one help in giving the suggestions.

    Thanks in advance.

  6. #6
    Member
    Join Date
    Mar 2010
    Location
    Raleigh, NC
    Posts
    55

    Re: Problem with using local memory and global memory

    Is your kernel launch dependent upon the entire set of memory being passed to the device? I would add a clEnqueueBarrier(cqCommandQueue) to ensure that the data is fully copied to the device before enqueuing your kernel for launch, especially since you are using a non-blocking write.

    What is the error code you are getting when you enqueue the kernel? That would be extremely helpful in narrowing down your problem.

  7. #7
    Senior Member
    Join Date
    May 2010
    Location
    Toronto, Canada
    Posts
    845

    Re: Problem with using local memory and global memory

    I would add a clEnqueueBarrier(cqCommandQueue) to ensure that the data is fully copied to the device before enqueuing your kernel for launch, especially since you are using a non-blocking write.
    That shouldn't be necessary because the queue is in-order.

    GPUWorker, can you describe what kind of problem do you have now? Are you sure no OpenCL calls are returning an error code?
    Disclaimer: Employee of Qualcomm Canada. Any opinions expressed here are personal and do not necessarily reflect the views of my employer. LinkedIn profile.

  8. #8

    Re: Problem with using local memory and global memory

    The error I am getting is :

    Debug Assertion Failed!

    Expression: _CrtIsValidHeapPointer(pUserData)

    Its showing the path of .exe file which I am runinng and also
    File:\dd\vctools\crt_bld\self_x86\crt\src\dbgheap. c
    Line:1317

    It is also saying
    For more information on how your program can cause an assertion failure, see the Visual C++ documentation on asserts.

  9. #9
    Senior Member
    Join Date
    May 2010
    Location
    Toronto, Canada
    Posts
    845

    Re: Problem with using local memory and global memory

    _CrtIsValidHeapPointer(pUserData)
    That's MSVC telling you that the pointer you've passed does not belong to the heap. The most likely cause is that you are freeing a pointer that was never initialized, or maybe you are freeing the same pointer twice. I don't remember whether a corrupted heap can produce the same error.

    Can you look at the call stack and find out what line of code in your application is generating that error? It's most likely a call to free() or delete.
    Disclaimer: Employee of Qualcomm Canada. Any opinions expressed here are personal and do not necessarily reflect the views of my employer. LinkedIn profile.

  10. #10

    Re: Problem with using local memory and global memory

    But it not showing the error on the line where we are freeing the pointers, it is showing at line where we are setting the kernel arguments with commandsetKernelArg

    There I used an error check and it is showing the error at that point for me. But while debugging step by step I am getting error at launching the kernel.

Similar Threads

  1. Replies: 6
    Last Post: 02-28-2013, 04:59 PM
  2. copy from global memory to local memory..problem
    By phoebe0105 in forum OpenCL
    Replies: 3
    Last Post: 06-03-2010, 02:14 PM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •