Results 1 to 7 of 7

Thread: How to get correct access to all values in the global memory

  1. #1
    Junior Member
    Join Date
    Apr 2010
    Posts
    4

    How to get correct access to all values in the global memory

    Hello everybody, I've written a little program based on the "Hello world" and the "oclVectorAdd" programs. The final objective is to make image processing.
    My programs loads an image as an 1D array, submit it to the kernel to be solved first by the CPU, then by the GPU. Both results are saved as 2 images (1 for CPU and one for GPU).
    My problem is, when I see other programs they only need to write "get_global_id(0)" to solve all the values in the array. With my kernel only 1 valu on 4 is solved with the GPU, the others stay at 0.
    With the CPU it works.

    My kernel is only doing now as a test for a greyscale image:
    ImageOutput (i) = ImageInput (i)

    All the examples propose:
    i = get_global_id(0);
    ImageOutput (i) = ImageInput (i);

    I use a technic, which doesn't really work is really heavy. When I do it, I can attribute all the values except the the second (for i=1), which stays at 0. (see kernel code)

    Thanks a lot in advance for your help.

    Here you'll find the kernel code:
    Code :
    //////////////////////////////OpenCL Calcul Code////////////////////////////////
     
    __kernel  void Image_Processing( __global const unsigned char* ImageInput,
    								 __global unsigned char* ImageOutput)
    							//__global const int nbr_val_image)
    {
    	int gti = get_global_id(0);
    	int ti  = get_local_id(0);
     
    	int n  = get_global_size(0);
    	int nt = get_local_size(0);
    	int nb = n/nt;
    	int i;
     
    	for(int j=0; j<=nt; j++)
    		{
    		i  = gti+j*ti;
    		ImageOutput[i] = ImageInput[i];
    		}
    //	barrier(CLK_GLOBAL_MEM_FENCE);
    	return;
    }

    Here you'll find the C code: (I use 2 functions to load and save the pictures taken from the SOIL library that you can find at: http://www.lonesock.net/soil.html)
    Code :
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <math.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <OpenCL/opencl.h>
    #include <time.h>
     
     
    #include "SOIL.h"
     
     
     
    ////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////////Main Code//////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////////////
     
    int main (int argc, const char * argv[])
    {
    	//Declar Functions
     
    	char * LoadFile2txt(const char *File);
     
    	//Declar Variables
     
    	int err;                            // error code returned from api calls
    	int gpu;
     
    	int width; 
    	int height;
    	int channels;
     
    	int TimeTotGPU;
    	int TimeKernGPU;
    	int TimeTotCPU;
    	int TimeKernCPU;
     
    	//	int RunLevel;
    	//GLuint *monImage;
     
    	const char* cSourceFile = "Image_Process.cl";
    	char filename[]= "Test3.bmp";
    	char *KernelSource;
     
        size_t local;                       // local domain size for our calculation
     
        cl_device_id device_id;             // compute device id 
        cl_context context;                 // compute context
        cl_command_queue commands;          // compute command queue
        cl_program program;                 // compute program
        cl_kernel kernel;                   // compute kernel
     
        cl_mem ImageInput;                  // device memory used for the input array
        cl_mem ImageOutput;                 // device memory used for the output array
    	//cl_mem nbrPixel;
     
    	unsigned char *monImage = SOIL_load_image(filename,&width, &height, &channels, SOIL_LOAD_L);
    	unsigned char *imageTraitee;
     
    	channels=1;
     
    	int nbr_val_image = width * height * channels;
     
    	printf("Image width: %d \n", width);
    	printf("Image height: %d \n", height);
    	printf("Image channels: %d \n", channels);
    	printf("nbr_val_image de: %d \n", nbr_val_image);
    	printf("Vals pix monImage:\n%d  %d  %d\n%d  %d  %d\n%d  %d  %d\n%d  %d  %d\n\n",
    		   monImage[0],  monImage[1], monImage[2], monImage[3], monImage[4], monImage[5],
    		   monImage[6],  monImage[7], monImage[8], monImage[9], monImage[10], monImage[11]);
     
     
    	// Ajuste le nombre de valeurs de l'image au multiple de 256 au-dessus pour la création de la mémoire tampon
        //
        size_t LocalWorkSize = 256;
    	size_t GlobalWorkzise = ceil((double)nbr_val_image/(double)LocalWorkSize)*LocalWorkSize;
     
    	monImage     = (void *)realloc(monImage,sizeof(cl_uchar)*GlobalWorkzise);
    	imageTraitee = (void *)malloc(sizeof(cl_uchar)*GlobalWorkzise);
     
     
    	for(gpu=0;gpu<2;gpu++)
    	{
    	    // Prise de temps début de résolution GPU
    		clock_t TimeStartSolve = clock ();
     
    		// Connect to a compute device
    		//
    		err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);	// if gpu=0 : solving on CPU, if gpu=1 : solving on GPU
    		if (err != CL_SUCCESS)
    		{
    			printf("Error: Failed to create a device group!\n");
    			return EXIT_FAILURE;
    		}
     
     
    		// Create a compute context 
    		//
    		context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    		if (!context)
    		{
    			printf("Error: Failed to create a compute context!\n");
    			return EXIT_FAILURE;
    		}
     
     
    		// Create a command commands
    		//
    		commands = clCreateCommandQueue(context, device_id, 0, &err);
    		if (!commands)
    		{
    			printf("Error: Failed to create a command commands!\n");
    			return EXIT_FAILURE;
    		}
     
     
    		// Create the input and output arrays in device memory for our calculation
    		//
    		ImageInput  = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(cl_uchar) * GlobalWorkzise, NULL, NULL);
    		//nbrPixel    = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(cl_int)  * GlobalWorkzise, NULL, NULL);
    		ImageOutput = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * GlobalWorkzise, NULL, NULL);
    		if (!ImageInput || !ImageOutput)
    		{
    			printf("Error: Failed to allocate device memory!\n");
    			exit(1);
    		}   
     
     
    		// Create the compute program from the source buffer
    		//
    		KernelSource = LoadFile2txt (cSourceFile);
     
    		program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);
    		if (!program)
    		{
    			printf("Error: Failed to create compute program!\n");
    			return EXIT_FAILURE;
    		}
     
     
    		// Build the program executable
    		//
    		err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    		if (err != CL_SUCCESS)
    		{
    			size_t len;
    			char buffer[2048];
     
    			printf("Error: Failed to build program executable!\n");
    			clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    			printf("%s\n", buffer);
    			exit(1);
    		}
     
     
    		// Create the compute kernel in the program we wish to run
    		//
    		kernel = clCreateKernel(program, "Image_Processing", &err);
    		if (!kernel || err != CL_SUCCESS)
    		{
    			printf("Error: Failed to create compute kernel!\n");
    			exit(1);
    		}
     
     
    		// Set the arguments to our compute kernel
    		//
    		err = 0;
    		err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &ImageInput);
    		err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &ImageOutput);
    		//err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &nbrPixel);
    		if (err != CL_SUCCESS)
    		{
    			printf("Error: Failed to set kernel arguments! %d\n", err);
    			exit(1);
    		}
     
     
    		// Write our data set into the input array in device memory 
    		//
    		err  = clEnqueueWriteBuffer(commands, ImageInput, CL_TRUE, 0, sizeof(cl_uchar) * GlobalWorkzise, monImage, 0, NULL, NULL);
    		//err |= clEnqueueWriteBuffer(commands, nbrPixel  , CL_TRUE, 0, sizeof(int)  * GlobalWorkzise, nbr_val_image, 0, NULL, NULL);
    		if (err != CL_SUCCESS)
    		{
    			printf("Error: Failed to write to source array!\n");
    			exit(1);
    		}
     
     
    		// Get the maximum work group size for executing the kernel on the device
    		//
    		err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    		if (err != CL_SUCCESS)
    		{
    			printf("Error: Failed to retrieve kernel work group info! %d\n", err);
    			exit(1);
    		}
     
    		//printf("local = %d\n", (int)local);
     
     
    		// Prise de temps début de résolution du kernel
    		clock_t TimeStartKernel = clock ();
     
     
    		// Execute the kernel over the entire range of our 1d input data set
    		// using the maximum number of work group items for this device
    		//
     
    		err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &GlobalWorkzise, &local, 0, NULL, NULL);
    		if (err)
    		{
    			printf("Error: Failed to execute kernel!\n");
    			return EXIT_FAILURE;
    		}
     
    		// Wait for the command commands to get serviced before reading back results
    		//
    		clFinish(commands);
     
    		clock_t TimeFinishKernel = clock ();
     
    		// Read back the results from the device to verify the output
    		//
    		err = clEnqueueReadBuffer(commands, ImageOutput, CL_TRUE, 0, sizeof(cl_uchar) * GlobalWorkzise, imageTraitee, 0, NULL, NULL );  
    		if (err != CL_SUCCESS)
    		{
    			printf("Error: Failed to read output array! %d\n", err);
    			exit(1);
    		}
     
    		// Prise de temps fin résolution du kernel
    		clock_t TimeFinishSolve = clock ();
     
    		int TimeGPU    = (((TimeFinishSolve  - TimeStartSolve) *1e6) / CLOCKS_PER_SEC);
    		int TimeKernel = (((TimeFinishKernel - TimeStartKernel)*1e6) / CLOCKS_PER_SEC);
     
    		printf("Vals pix imageTraitee:\n%d  %d  %d\n%d  %d  %d\n%d  %d  %d\n%d  %d  %d\n",
    			   imageTraitee[0],  imageTraitee[1], imageTraitee[2], imageTraitee[3], imageTraitee[4], imageTraitee[5],
    			   imageTraitee[6], imageTraitee[7], imageTraitee[8], imageTraitee[9], imageTraitee[10], imageTraitee[11]);
     
     
    		// Enregistrement de l'image traitée en BMP
     
    		if(gpu==1)
    		{
    			err = SOIL_save_image("GPUProcessedImage.bmp", SOIL_SAVE_TYPE_BMP, width, height, 1, imageTraitee);
    			TimeTotGPU  = TimeGPU;
    			TimeKernGPU = TimeKernel;
    		}
    		else
    		{
    			err = SOIL_save_image("CPUProcessedImage.bmp", SOIL_SAVE_TYPE_BMP, width, height, 1, imageTraitee);
    			TimeTotCPU  = TimeGPU;
    			TimeKernCPU = TimeKernel;
    		}
     
     
    		// Shutdown and cleanup
     
    		clReleaseMemObject(ImageInput);
    		clReleaseMemObject(ImageOutput);
    		clReleaseProgram(program);
    		clReleaseKernel(kernel);
    		clReleaseCommandQueue(commands);
    		clReleaseContext(context);
    	}
     
     
    	printf("Temps de réolution du programme sur GPU: %d [usec]\n", TimeTotGPU);
    	printf("Temps de réolution du programme sur CPU: %d [usec]\n\n", TimeTotCPU);
     
    	printf("La résulotion du programme sur GPU est environ %d fois plus rapide que sur CPU\n\n", TimeTotCPU / TimeTotGPU);
     
    	printf("Temps de réolution du noyau sur GPU: %d [usec]\n", TimeKernGPU);
    	printf("Temps de réolution du noyau sur CPU: %d [usec]\n\n", TimeKernCPU);
     
    	printf("La résulotion du noyau sur GPU est environ %d fois plus rapide que sur CPU\n\n", TimeKernCPU / TimeKernGPU);
     
    	free(monImage);
    	free(imageTraitee);
     
        return 0;
    }
     
     
    ////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////Annexe functions//////////////////////////////////
    ////////////////////////////////////////////////////////////////////////////////
     
    char * LoadFile2txt (const char *File)
    {
    	FILE * pFile;
    	long lSize;
    	size_t result;
    	char * TXTBuffer;
     
    	pFile = fopen (File, "r");
    	if (pFile==NULL)
    	{
    		printf("Fct LoadFile2txt: File error");
    	}
     
    	// obtain file size:
    	fseek (pFile , 0 , SEEK_END);
    	lSize = ftell (pFile);
    	rewind (pFile);
     
    	// allocate memory to contain the whole file:
    	TXTBuffer = (char*) malloc (sizeof(char)*lSize);
    	if (TXTBuffer == NULL)
    	{
    		printf("Fct LoadFile2txt: Memory error");
    	}
     
    	// copy the file into the buffer:
    	result = fread (TXTBuffer,1,lSize,pFile);
    	if (result != lSize)
    	{
    		printf("Fct LoadFile2txt: Reading error");
    	}
     
    	// terminate
    	fclose (pFile);
     
    	return TXTBuffer;	
    }

  2. #2
    Member
    Join Date
    Nov 2009
    Location
    Scotland
    Posts
    72

    Re: How to get correct access to all values in the global memory

    It seems like your global work size is as big as your image, i.e. you have one workitem per pixel, right? In this case you don't have to loop over the image in your kernel, because each workitem only processes one pixel:
    Code :
    ImageOutput[gti] = ImageInput[gti]

  3. #3
    Junior Member
    Join Date
    Apr 2010
    Posts
    4

    Re: How to get correct access to all values in the global memory

    Exactly, I'm supposed to have one workitem per pixel. But if I do:
    Code :
    ImageOutput[gti] = ImageInput[gti]
    When I solve it with GPU, I only get 1 value on 4:
    ImageOutput = 0 0 0 X 0 0 0 X 0 0 0 X ... (X are the same values as ImageInput in this situation.)

    But if I solve it on CPU, it works, I get all the correct values.

  4. #4
    Junior Member
    Join Date
    Apr 2010
    Posts
    4

    Re: How to get correct access to all values in the global memory

    here you can see an example of the image input and its result.
    Code :
    InputValues:
     
       92   99    1    8   15   67   74   51   58   40
       98   80    7   14   16   73   55   57   64   41
        4   81   88   20   22   54   56   63   70   47
       85   87   19   21    3   60   62   69   71   28
       86   93   25    2    9   61   68   75   52   34
       17   24   76   83   90   42   49   26   33   65
       23    5   82   89   91   48   30   32   39   66
       79    6   13   95   97   29   31   38   45   72
       10   12   94   96   78   35   37   44   46   53
       11   18  100   77   84   36   43   50   27   59
     
    OutputValues:
     
        0    0    0    8    0    0    0   51    0   40
        0    0    0   14    0    0    0    0    0   41
        0    0    0   20    0   54    0    0    0    0
        0   87    0   21    0    0    0    0    0   28
        0    0    0    2    0   61    0    0    0    0
        0   24    0    0    0   42    0    0    0   65
        0    0    0   89    0   48    0    0    0    0
        0    6    0    0    0   29    0    0    0   72
        0    0    0   96    0    0    0   44    0   53
        0    0    0   77    0    0    0    0    0   59

  5. #5
    Senior Member
    Join Date
    Nov 2009
    Posts
    118

    Re: How to get correct access to all values in the global memory

    that should work :

    Code :
    int gti = get_global_id(0);
    ImageOutput[gti] = ImageInput[gti];

  6. #6
    Junior Member
    Join Date
    Apr 2010
    Posts
    4

    Re: How to get correct access to all values in the global memory

    Problem solved: The current GPUs don't support arrays of char or unsigned char (next generations should apparently support). Values have to be minimum int.
    That's why I only got 1 values on 4: 4 * 8 (char) = 32 (int)
    I hope it will be helpful for others.

  7. #7
    Senior Member
    Join Date
    Jul 2009
    Location
    Northern Europe
    Posts
    311

    Re: How to get correct access to all values in the global memory

    In OpenCL 1.0 you need to check if the byte writes are supported. I know Nvidia GPUs do support this, but the 4xxx AMD ones do not, for example.

Similar Threads

  1. concurrent access to global memory
    By xMate23 in forum OpenCL
    Replies: 2
    Last Post: 10-24-2012, 05:48 AM
  2. Global memory access
    By Rui in forum OpenCL
    Replies: 1
    Last Post: 03-23-2010, 12:18 PM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •