
Thread: Poor bandwidth on matrix multiplication with local memory?

  1. #1
    Junior Member
    Join Date
    Mar 2010
    Posts
    22

    Poor bandwidth on matrix multiplication with local memory?

    Well, I'm trying my best to increase the bandwidth of my matrix multiplication, but my efforts aren't getting me where I want... On a Tesla C1060, multiplying two 2048x2048 matrices takes approximately 0.136 s. Today I was looking at the OpenCL Profiler and the Occupancy box shows a 0... The (shortened) code is below; the last time I touched it I was playing with clEnqueueMapBuffer and clEnqueueUnmapMemObject. Any idea how to improve performance? Thanks!

    Code :
    #include "CL/cl.h"
    #include <stdlib.h>
    #include <stdio.h>
    #include <math.h>
     
    #define m 2048
    #define n m
     
    void chkError(cl_int errCode,const char* file,cl_int line){
    	if(errCode != CL_SUCCESS){
    		printf("Error %i in file %s near line %i.\n",errCode,file,line-1);
    		exit(0);
    	}
    }
     
    int main(int argc,char*argv[]){
    	unsigned int szMem = m*n*sizeof(float);
     
    	cl_platform_id clPlatform;
    	cl_uint numPlatforms;
    	cl_int errCode;
    	errCode = clGetPlatformIDs(0,NULL,&numPlatforms);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clGetPlatformIDs(numPlatforms,&clPlatform,NULL);
    	chkError(errCode,__FILE__,__LINE__);
     
    	size_t szParam;
    	errCode = clGetPlatformInfo(clPlatform,CL_PLATFORM_PROFILE,0,NULL,&szParam);
    	chkError(errCode,__FILE__,__LINE__);
    	char* param = (char*) malloc (szParam);
    	errCode = clGetPlatformInfo(clPlatform,CL_PLATFORM_PROFILE,szParam,param,NULL);
    	chkError(errCode,__FILE__,__LINE__);
     
    	cl_device_id clDevices;
    	cl_uint numDevices;
    	errCode = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,0,NULL,&numDevices);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevices,NULL); /* clDevices holds a single device id */
    	chkError(errCode,__FILE__,__LINE__);
     
    	//---------------------------------------------------------------------------------
    	cl_uint maxComputeUnits;
    	errCode = clGetDeviceInfo(clDevices,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&maxComputeUnits,NULL);
    	chkError(errCode,__FILE__,__LINE__);
    	//---------------------------------------------------------------------------------
     
    	cl_context clContext;
    	clContext = clCreateContext(NULL,1,&clDevices,NULL,NULL,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
     
    	cl_command_queue clCommandQueue;
    	clCommandQueue = clCreateCommandQueue(clContext,clDevices,CL_QUEUE_PROFILING_ENABLE,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
     
      FILE *fp;
      fp = fopen("clNew.cl", "r");
      if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
      }
      fseek(fp,0,SEEK_END);
      const size_t kernelLength = ftell(fp);
      rewind(fp);
      char *clNew = (char *) malloc (kernelLength);
      fread(clNew,1,kernelLength,fp);
      fclose(fp);
     
    	cl_program clProgram;
    	clProgram = clCreateProgramWithSource(clContext,1,(const char**)&clNew,&kernelLength,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clBuildProgram(clProgram,0,NULL,NULL,NULL,NULL);
    	chkError(errCode,__FILE__,__LINE__);
     
    	cl_kernel clKernel; 
    	const char* kernelName = "matMult";
    	clKernel = clCreateKernel(clProgram,kernelName,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
     
    	cl_mem clDevA,clDevB,clDevC;
    	clDevA = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
    	clDevB = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
    	clDevC = clCreateBuffer(clContext,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,szMem,NULL,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
     
    	float *A,*B,*C;
     
    	/* Map A and B so the host can fill them; note that in this shortened version
    	   the mapped regions are never actually written before being unmapped.
    	   C is mapped for reading only after the kernel has run. */
    	A = (float *)clEnqueueMapBuffer(clCommandQueue,clDevA,CL_TRUE,CL_MAP_WRITE,0,szMem,0,NULL,NULL,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
    	B = (float *)clEnqueueMapBuffer(clCommandQueue,clDevB,CL_TRUE,CL_MAP_WRITE,0,szMem,0,NULL,NULL,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
     
    	errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevA,A,0,NULL,NULL);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevB,B,0,NULL,NULL);
    	chkError(errCode,__FILE__,__LINE__);
     
    	int row = n, col = m;
    	errCode = clSetKernelArg(clKernel,0,sizeof(clDevC),(const void*)&clDevC);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clSetKernelArg(clKernel,1,sizeof(clDevA),(const void*)&clDevA);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clSetKernelArg(clKernel,2,sizeof(clDevB),(const void*)&clDevB);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clSetKernelArg(clKernel,3,sizeof(cl_int),(const void*)&col);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clSetKernelArg(clKernel,4,256*sizeof(cl_float),NULL);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clSetKernelArg(clKernel,5,256*sizeof(cl_float),NULL);
    	chkError(errCode,__FILE__,__LINE__);
     
    	const size_t clLocalWorkSize[2] = {16,16}, clGlobalWorkSize[2] = {m,n};
    	cl_event clEvent;
    	errCode = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,0,clGlobalWorkSize,clLocalWorkSize,0,NULL,&clEvent);
    	chkError(errCode,__FILE__,__LINE__);
     
    	cl_ulong start,end;
    	errCode = clWaitForEvents(1,&clEvent);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clGetEventProfilingInfo(clEvent,CL_PROFILING_COMMAND_START,
    	                                  sizeof(cl_ulong),&start,NULL);
    	chkError(errCode,__FILE__,__LINE__);
    	errCode = clGetEventProfilingInfo(clEvent,CL_PROFILING_COMMAND_END,
    	                                  sizeof(cl_ulong),&end,NULL);
    	chkError(errCode,__FILE__,__LINE__);
    	cl_double total = (cl_double)(end - start) / 1e9;
    	printf("Profiling: Total kernel time was %f secs.\n", total);
     
    	C = (float*)clEnqueueMapBuffer(clCommandQueue,clDevC,CL_TRUE,CL_MAP_READ,0,szMem,0,NULL,NULL,&errCode);
    	chkError(errCode,__FILE__,__LINE__);
     
    	errCode = clEnqueueUnmapMemObject(clCommandQueue,clDevC,C,0,NULL,NULL);
    	chkError(errCode,__FILE__,__LINE__);
     
    	clReleaseMemObject(clDevA);
    	clReleaseMemObject(clDevB);
    	clReleaseMemObject(clDevC);
    	clReleaseEvent(clEvent);
    	clReleaseKernel(clKernel);
    	clReleaseProgram(clProgram);
    	clReleaseCommandQueue(clCommandQueue);
    	clReleaseContext(clContext);
    	return 0;
    }

    Code :
    #define bSize 16
     
    __kernel void
    matMult(__global float* C, 
    				__global float* A, 
    				__global float* B,
    				int N, 
    				__local float Asub [bSize][bSize],
    				__local float Bsub [bSize][bSize])
    {
    	int gidx = get_group_id(0);
    	int gidy = get_group_id(1);
    	float Csub = 0;
    	int lidx = get_local_id(0);
    	int lidy = get_local_id(1);
    	int aBegin = gidy*N*bSize;
    	int aStep = bSize;
    	int aEnd = aBegin+N-1;
    	int bBegin = gidx*bSize;
    	int bStep = N*bSize;
    	int base = lidy*N+lidx;
    	for (int i = aBegin,j = bBegin; i < aEnd; i+=2*aStep,j+=2*bStep) {
    		/*
    		Asub[lidy][lidx] = A[i+base];
    		Bsub[lidy][lidx] = B[j+base];
    		barrier(CLK_LOCAL_MEM_FENCE);
    		//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
    		Csub += Asub[lidy][0]*Bsub[0][lidx];
    		Csub += Asub[lidy][1]*Bsub[1][lidx];
    		Csub += Asub[lidy][2]*Bsub[2][lidx];
    		Csub += Asub[lidy][3]*Bsub[3][lidx];
    		Csub += Asub[lidy][4]*Bsub[4][lidx];
    		Csub += Asub[lidy][5]*Bsub[5][lidx];
    		Csub += Asub[lidy][6]*Bsub[6][lidx];
    		Csub += Asub[lidy][7]*Bsub[7][lidx];
    		Csub += Asub[lidy][8]*Bsub[8][lidx];
    		Csub += Asub[lidy][9]*Bsub[9][lidx];
    		Csub += Asub[lidy][10]*Bsub[10][lidx];
    		Csub += Asub[lidy][11]*Bsub[11][lidx];
    		Csub += Asub[lidy][12]*Bsub[12][lidx];
    		Csub += Asub[lidy][13]*Bsub[13][lidx];
    		Csub += Asub[lidy][14]*Bsub[14][lidx];
    		Csub += Asub[lidy][15]*Bsub[15][lidx];
    		barrier(CLK_LOCAL_MEM_FENCE);*/
    		Asub[lidy][lidx] = A[i+base];
    		Bsub[lidy][lidx] = B[j+base];
    		barrier(CLK_LOCAL_MEM_FENCE);
    		//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
    		Csub += Asub[lidy][0]*Bsub[0][lidx];
    		Csub += Asub[lidy][1]*Bsub[1][lidx];
    		Csub += Asub[lidy][2]*Bsub[2][lidx];
    		Csub += Asub[lidy][3]*Bsub[3][lidx];
    		Csub += Asub[lidy][4]*Bsub[4][lidx];
    		Csub += Asub[lidy][5]*Bsub[5][lidx];
    		Csub += Asub[lidy][6]*Bsub[6][lidx];
    		Csub += Asub[lidy][7]*Bsub[7][lidx];
    		Csub += Asub[lidy][8]*Bsub[8][lidx];
    		Csub += Asub[lidy][9]*Bsub[9][lidx];
    		Csub += Asub[lidy][10]*Bsub[10][lidx];
    		Csub += Asub[lidy][11]*Bsub[11][lidx];
    		Csub += Asub[lidy][12]*Bsub[12][lidx];
    		Csub += Asub[lidy][13]*Bsub[13][lidx];
    		Csub += Asub[lidy][14]*Bsub[14][lidx];
    		Csub += Asub[lidy][15]*Bsub[15][lidx];
    		barrier(CLK_LOCAL_MEM_FENCE);
    		Asub[lidy][lidx] = A[i+aStep+base];
    		Bsub[lidy][lidx] = B[j+bStep+base];
    		barrier(CLK_LOCAL_MEM_FENCE);
    		//for(int k=0;k<bSize;k++) Csub += Asub[lidy][k]*Bsub[k][lidx];
    		Csub += Asub[lidy][0]*Bsub[0][lidx];
    		Csub += Asub[lidy][1]*Bsub[1][lidx];
    		Csub += Asub[lidy][2]*Bsub[2][lidx];
    		Csub += Asub[lidy][3]*Bsub[3][lidx];
    		Csub += Asub[lidy][4]*Bsub[4][lidx];
    		Csub += Asub[lidy][5]*Bsub[5][lidx];
    		Csub += Asub[lidy][6]*Bsub[6][lidx];
    		Csub += Asub[lidy][7]*Bsub[7][lidx];
    		Csub += Asub[lidy][8]*Bsub[8][lidx];
    		Csub += Asub[lidy][9]*Bsub[9][lidx];
    		Csub += Asub[lidy][10]*Bsub[10][lidx];
    		Csub += Asub[lidy][11]*Bsub[11][lidx];
    		Csub += Asub[lidy][12]*Bsub[12][lidx];
    		Csub += Asub[lidy][13]*Bsub[13][lidx];
    		Csub += Asub[lidy][14]*Bsub[14][lidx];
    		Csub += Asub[lidy][15]*Bsub[15][lidx];
    		barrier(CLK_LOCAL_MEM_FENCE);
    	}
    	C[aBegin+bBegin+base] = Csub;
    }

  2. #2
    Senior Member
    Join Date
    Jul 2009
    Location
    Northern Europe
    Posts
    311

    Re: Poor bandwidth on matrix multiplication with local memory?

    Have you taken a look at some of the papers published on getting maximum matrix multiplication performance on Nvidia hardware? They use very specific tricks to get the best results. I've heard that Nvidia's OpenCL currently has some serious performance bugs (like 2x worse than CUDA), so you may have a hard time reaching their level of performance, though.

  3. #3
    Junior Member
    Join Date
    Mar 2010
    Posts
    22

    Re: Poor bandwidth on matrix multiplication with local memory?

    I'm reading the papers, but I still have no idea why the performance is poor... I'm trying to implement another method, but at the moment I'm confused...

  4. #4
    Senior Member
    Join Date
    Jul 2009
    Location
    Northern Europe
    Posts
    311

    Re: Poor bandwidth on matrix multiplication with local memory?

    Two comments after looking at your code:

    1) I'm not sure why you're using map instead of just writing to the buffer with your data or creating the buffer with COPY_HOST_PTR. (It looks like you're mapping and unmapping without actually changing the data.) See the sketch after this list.
    2) The first time you execute a kernel you may incur the deferred cost of compiling it, plus the cost of transferring the data. You should execute the kernel once to warm things up and then average your time over at least 10-100 runs of the kernel to avoid results skewed by system/device allocation and transfer overhead.
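
    Roughly what I mean by option 1, as a sketch only: it reuses the names from your first post (szMem, clContext, clCommandQueue, errCode, chkError) and assumes A, B and C have already been allocated and filled on the host.
    Code :
    /* Copy the host data into the buffers at creation time instead of
       mapping/unmapping them without touching the contents. */
    cl_mem clDevA = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,szMem,A,&errCode);
    chkError(errCode,__FILE__,__LINE__);
    cl_mem clDevB = clCreateBuffer(clContext,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,szMem,B,&errCode);
    chkError(errCode,__FILE__,__LINE__);
    /* C only needs to be written by the kernel and read back afterwards. */
    cl_mem clDevC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,szMem,NULL,&errCode);
    chkError(errCode,__FILE__,__LINE__);
     
    /* ... set the kernel arguments and enqueue the kernel as before ... */
     
    /* Blocking read of the result once the kernel has finished. */
    errCode = clEnqueueReadBuffer(clCommandQueue,clDevC,CL_TRUE,0,szMem,C,0,NULL,NULL);
    chkError(errCode,__FILE__,__LINE__);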

  5. #5
    Junior Member
    Join Date
    Mar 2010
    Posts
    22

    Re: Poor bandwidth on matrix multiplication with local memory?

    Thanks for your reply. The maps were only there to play with and learn how they work; in my original code I use CL_MEM_COPY_HOST_PTR with a valid pointer holding the data to pass to the device. Regarding your second point, you're suggesting that I should run the kernel multiple times and then average the results, right? That seems logical; so far I have just been running the program manually several times and taking the execution time from that. As for the code, the writes to global memory are expensive, and the inner for loop has been unrolled, which improves performance, but I'm still far from what I'd consider GOOD performance. When I run the OpenCL Profiler, Occupancy shows a 0 and I have no idea why! There are no uncoalesced accesses or anything strange...
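
    For reference, the hand-unrolled terms correspond to the inner-product loop still visible in the commented-out line of the kernel. On NVIDIA's OpenCL the compiler can also be asked to unroll that loop through the cl_nv_pragma_unroll extension; this is only a sketch and assumes the driver actually exposes that extension:
    Code :
    // top of the .cl file: enable NVIDIA's unrolling pragma
    #pragma OPENCL EXTENSION cl_nv_pragma_unroll : enable
     
    // inside the tile loop, between the two barriers, instead of the 16 hand-written terms:
    #pragma unroll
    for (int k = 0; k < bSize; k++)
        Csub += Asub[lidy][k]*Bsub[k][lidx];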

  6. #6
    Senior Member
    Join Date
    Jul 2009
    Location
    Northern Europe
    Posts
    311

    Re: Poor bandwidth on matrix multiplication with local memory?

    Running the whole program manually many times is not going to have the same effect, since the data still has to be transferred and all the setup and initialization still happens on every run.

    If you want to time a kernel's performance you should do something like:
    Code :
    buildProgram
    writeData
    enqueueKernel      // warm-up run: pays the deferred compile and first-transfer costs
    finish
     
    start = get_time
    for (i = 0; i < 20; i++)
      enqueueKernel
    finish
    stop = get_time
     
    total_time = (stop - start) / 20

    If you don't do this, you'll be measuring a huge amount of overhead on top of the kernel execution, which can skew the analysis.

    If you want to include the data transfer time then you should explicitly include it in the loop as well.
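
    In real code the same pattern looks roughly like this. This is only a sketch: it assumes the clCommandQueue, clKernel and work-size variables from your first post, and uses POSIX clock_gettime for the host timer.
    Code :
    #include <time.h>
     
    const int runs = 20;
     
    /* warm-up launch: absorbs deferred compilation and first-use costs */
    clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,clGlobalWorkSize,clLocalWorkSize,0,NULL,NULL);
    clFinish(clCommandQueue);
     
    struct timespec t0,t1;
    clock_gettime(CLOCK_MONOTONIC,&t0);
    for (int i = 0; i < runs; i++) {
        clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,clGlobalWorkSize,clLocalWorkSize,0,NULL,NULL);
    }
    clFinish(clCommandQueue);   /* wait for all enqueued launches to complete */
    clock_gettime(CLOCK_MONOTONIC,&t1);
     
    double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("Average kernel time: %f secs.\n", elapsed / runs);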
