Results 1 to 3 of 3

Thread: Segmentation fault while creating command queue

  1. #1
    Junior Member
    Join Date
    Nov 2011
    Posts
    11

    Segmentation fault while creating command queue

    My system has two ATI Radeon 5970 graphics cards and an Intel Core i7 processor, and runs 64-bit CentOS 5.3. I have installed the ATI Catalyst 12.2 (64-bit) drivers and AMD APP SDK 2.4 as well. The output of the SDK's CLInfo sample program is:

    Code :
    Number of platforms:				 1
      Platform Profile:				 FULL_PROFILE
      Platform Version:				 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
      Platform Name:					 AMD Accelerated Parallel Processing
      Platform Vendor:				 Advanced Micro Devices, Inc.
      Platform Extensions:			 cl_khr_icd cl_amd_event_callback cl_amd_offline_devices
     
     
      Platform Name:					 AMD Accelerated Parallel Processing
    Number of devices:				 3
      Device Type:					 CL_DEVICE_TYPE_GPU
      Device ID:					 4098
      Max compute units:				 20
      Max work items dimensions:			 3
        Max work items[0]:				 256
        Max work items[1]:				 256
        Max work items[2]:				 256
      Max work group size:				 256
      Preferred vector width char:			 16
      Preferred vector width short:			 8
      Preferred vector width int:			 4
      Preferred vector width long:			 2
      Preferred vector width float:			 4
      Preferred vector width double:		 0
      Max clock frequency:				 725Mhz
      Address bits:					 32
      Max memory allocation:			 134217728
      Image support:				 Yes
      Max number of images read arguments:	 128
      Max number of images write arguments:	 8
      Max image 2D width:			 8192
      Max image 2D height:			 8192
      Max image 3D width:			 2048
      Max image 3D height:	 2048
      Max image 3D depth:			 2048
      Max samplers within kernel:		 16
      Max size of kernel argument:			 1024
      Alignment (bits) of base address:		 32768
      Minimum alignment (bytes) for any datatype:	 128
      Single precision floating point capability
        Denorms:					 No
        Quiet NaNs:					 Yes
        Round to nearest even:			 Yes
        Round to zero:				 Yes
        Round to +ve and infinity:			 Yes
        IEEE754-2008 fused multiply-add:		 Yes
      Cache type:					 None
      Cache line size:				 0
      Cache size:					 0
      Global memory size:				 536870912
      Constant buffer size:				 65536
      Max number of constant args:			 8
      Local memory type:				 Scratchpad
      Local memory size:				 32768
      Profiling timer resolution:			 1
      Device endianess:				 Little
      Available:					 Yes
      Compiler available:				 Yes
      Execution capabilities:				 
        Execute OpenCL kernels:			 Yes
        Execute native function:			 No
      Queue properties:				 
        Out-of-Order:				 No
        Profiling :					 Yes
      Platform ID:					 0x2b1e1f8c3800
      Name:						 Cypress
      Vendor:					 Advanced Micro Devices, Inc.
      Driver version:				 CAL 1.4.1703
      Profile:					 FULL_PROFILE
      Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
      Extensions:					 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt 
      Device Type:					 CL_DEVICE_TYPE_GPU
      Device ID:					 4098
      Max compute units:				 20
      Max work items dimensions:			 3
        Max work items[0]:				 256
        Max work items[1]:				 256
        Max work items[2]:				 256
      Max work group size:				 256
      Preferred vector width char:			 16
      Preferred vector width short:			 8
      Preferred vector width int:			 4
      Preferred vector width long:			 2
      Preferred vector width float:			 4
      Preferred vector width double:		 0
      Max clock frequency:				 725Mhz
      Address bits:					 32
      Max memory allocation:			 134217728
      Image support:				 Yes
      Max number of images read arguments:	 128
      Max number of images write arguments:	 8
      Max image 2D width:			 8192
      Max image 2D height:			 8192
      Max image 3D width:			 2048
      Max image 3D height:	 2048
      Max image 3D depth:			 2048
      Max samplers within kernel:		 16
      Max size of kernel argument:			 1024
      Alignment (bits) of base address:		 32768
      Minimum alignment (bytes) for any datatype:	 128
      Single precision floating point capability
        Denorms:					 No
        Quiet NaNs:					 Yes
        Round to nearest even:			 Yes
        Round to zero:				 Yes
        Round to +ve and infinity:			 Yes
        IEEE754-2008 fused multiply-add:		 Yes
      Cache type:					 None
      Cache line size:				 0
      Cache size:					 0
      Global memory size:				 536870912
      Constant buffer size:				 65536
      Max number of constant args:			 8
      Local memory type:				 Scratchpad
      Local memory size:				 32768
      Profiling timer resolution:			 1
      Device endianess:				 Little
      Available:					 Yes
      Compiler available:				 Yes
      Execution capabilities:				 
        Execute OpenCL kernels:			 Yes
        Execute native function:			 No
      Queue properties:				 
        Out-of-Order:				 No
        Profiling :					 Yes
      Platform ID:					 0x2b1e1f8c3800
      Name:						 Cypress
      Vendor:					 Advanced Micro Devices, Inc.
      Driver version:				 CAL 1.4.1703
      Profile:					 FULL_PROFILE
      Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
      Extensions:					 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt 
      Device Type:					 CL_DEVICE_TYPE_CPU
      Device ID:					 4098
      Max compute units:				 12
      Max work items dimensions:			 3
        Max work items[0]:				 1024
        Max work items[1]:				 1024
        Max work items[2]:				 1024
      Max work group size:				 1024
      Preferred vector width char:			 16
      Preferred vector width short:			 8
      Preferred vector width int:			 4
      Preferred vector width long:			 2
      Preferred vector width float:			 4
      Preferred vector width double:		 0
      Max clock frequency:				 1596Mhz
      Address bits:					 64
      Max memory allocation:			 3145740288
      Image support:				 Yes
      Max number of images read arguments:	 128
      Max number of images write arguments:	 8
      Max image 2D width:			 8192
      Max image 2D height:			 8192
      Max image 3D width:			 2048
      Max image 3D height:	 2048
      Max image 3D depth:			 2048
      Max samplers within kernel:		 16
      Max size of kernel argument:			 4096
      Alignment (bits) of base address:		 1024
      Minimum alignment (bytes) for any datatype:	 128
      Single precision floating point capability
        Denorms:					 Yes
        Quiet NaNs:					 Yes
        Round to nearest even:			 Yes
        Round to zero:				 Yes
        Round to +ve and infinity:			 Yes
        IEEE754-2008 fused multiply-add:		 No
      Cache type:					 Read/Write
      Cache line size:				 0
      Cache size:					 0
      Global memory size:				 12582961152
      Constant buffer size:				 65536
      Max number of constant args:			 8
      Local memory type:				 Global
      Local memory size:				 32768
      Profiling timer resolution:			 999848
      Device endianess:				 Little
      Available:					 Yes
      Compiler available:				 Yes
      Execution capabilities:				 
        Execute OpenCL kernels:			 Yes
        Execute native function:			 Yes
      Queue properties:				 
        Out-of-Order:				 No
        Profiling :					 Yes
      Platform ID:					 0x2b1e1f8c3800
      Name:						 Intel(R) Core(TM) i7 CPU       X 980  @ 3.33GHz
      Vendor:					 GenuineIntel
      Driver version:				 2.0
      Profile:					 FULL_PROFILE
      Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
      Extensions:					 cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_device_fission cl_amd_device_attribute_query cl_amd_vec3 cl_amd_media_ops cl_amd_popcnt cl_amd_printf 
     
     
    Passed!

    My system gives a segmentation fault when I create a command queue on a GPU device, but it works fine on a CPU device. This happens for all programs (including the sample programs of the SDK). For example, the following program (chk_mod.c) works fine on the CPU but gives a segmentation fault on the GPU:

    Code :
    #include <stdio.h>
    #include <stdlib.h>
    #include <CL/cl.h>
    #include <time.h>
     
    #define MAX_SOURCE_SIZE (0x10000000)
    #define RNGE (100000)
     
    int main()
    {
    	cl_platform_id platform_id = NULL;
    	cl_device_id device_id = NULL;
    	cl_context context = NULL;
    	cl_command_queue command_queue = NULL;
    	cl_mem Amobj = NULL;
    	cl_mem Bmobj = NULL;
    	cl_mem Cmobj = NULL;
    	cl_program program = NULL;
    	cl_kernel kernel = NULL;
    	cl_uint ret_num_devices;
    	cl_uint ret_num_platforms;
    	cl_int ret;
     
    	clock_t time_i, time_f;
     
    	time_i = clock();
     
    	int i;
    	int j;
    	int *A;
    	int *B;
     
    	A = (int *) malloc( RNGE * sizeof(int) );
    	B = (int *) malloc( RNGE * sizeof(int) );
     
    	FILE *fp;
    	const char fileName[] = "chk_mod.cl";
    	size_t source_size;
    	char *source_str;
     
    	fp = fopen(fileName, "r");
    	if ( ! fp ) {
    		fprintf(stderr, "Failed to load kernel.\n");
    		exit(1);
    	}
    	source_str = (char *) malloc(MAX_SOURCE_SIZE);
    	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    	fclose(fp);
     
    	for ( i=0; i < RNGE; i++ ) {
    			A[ i ] = i;
    			B[ i ] = A[i];
    	}
     
    	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
     
    	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
     
    	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
     
    	Amobj = clCreateBuffer(context, CL_MEM_READ_ONLY, RNGE*sizeof(int), NULL, &ret);
    	Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, RNGE*sizeof(int), NULL, &ret);
     
    	ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, RNGE*sizeof(int), A, 0, NULL, NULL);
    	ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
     
    	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
    	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
     
    	kernel = clCreateKernel(program, "data_parallel", &ret);
     
    	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &Amobj);
    	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &Bmobj);
     
    	size_t global_item_size = RNGE;
    	size_t local_item_size = 1;
     
    	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, 0, &global_item_size, &local_item_size, 0, 0, 0 );
     
    	ret = clEnqueueReadBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
     
    	printf("Result: ");
    	for ( i=2; i < RNGE; i++ ) {
    		if ( B[i] ) {
    			printf( "%d ", B[i] );
    		}
    	}
    	printf("\n");
     
    	ret = clFlush(command_queue);
    	ret = clFinish(command_queue);
    	ret = clReleaseKernel(kernel);
    	ret = clReleaseProgram(program);
    	ret = clReleaseMemObject(Amobj);
    	ret = clReleaseMemObject(Bmobj);
    	ret = clReleaseCommandQueue(command_queue);
    	ret = clReleaseContext(context);
     
    	free(source_str);
     
    	free(A);
    	free(B);
     
    	time_f = clock();
    	printf("Time elapsed = %7.3fs\n", (float) (time_f - time_i)/CLOCKS_PER_SEC);
     
    	return 0;
    }

    Kernel file(chk_mod.cl) is:
    Code :
    /*
     * Trial-division test, one work-item per index.
     *
     * Work-item i clears B[i] to 0 when some t >= 2 with t < i divides i;
     * otherwise B[i] is left untouched.  A is not read by this kernel.
     * There is no bounds check, so the global work size must not exceed the
     * number of elements in B.
     */
    __kernel void data_parallel( __global int *A, __global int *B )
    {
    	int t;
    	int i = get_global_id(0);

    	/* Any composite i has a divisor t with t * t <= i, so candidates
    	 * beyond that cannot change the outcome.  The bound is written as
    	 * t <= i / t to avoid overflowing t * t. */
    	for ( t = 2; t <= i / t; t++ ) {
    		if ( i % t == 0 ) {
    			B[ i ] = 0;
    			break;	/* one divisor is enough; B[i] is already cleared */
    		}
    	}
    }

    Can someone please help me with the issue as soon as possible?

  2. #2
    Senior Member
    Join Date
    Aug 2011
    Posts
    271

    Re: Segmentation fault while creating command queue

    Works here, although it hard-locked my entire system for 60 seconds while it ran.
    (HD 5770)

    My system gives segmentation fault when I create a command queue on a GPU device but It works fine on a CPU device for all programs(including sample programs of SDK). For example, the following program(chk_mod.c) works fine with CPU but gives Segmentation fault with GPU
    Sigh, not code based on that poor example again; add checking to every call to help identify what is going wrong.

  3. #3
    Junior Member
    Join Date
    Nov 2011
    Posts
    11

    Re: Segmentation fault while creating command queue

    I have added checks, but the problem does not appear to be in the code: as soon as I run the program, it aborts with a segmentation fault. Even after installing compatible drivers, the system segfaults for any OpenCL program running on the GPU (even the sample programs of the AMD SDK).

    Code :
    #include <stdio.h>
    #include <stdlib.h>
    #include <CL/cl.h>
    #include <time.h>
     
    #define MAX_SOURCE_SIZE (0x10000000)
    #define RNGE (100000)
     
    int main()
    {
            cl_platform_id platform_id = NULL;
            cl_device_id device_id = NULL;
            cl_context context = NULL;
            cl_command_queue command_queue = NULL;
            cl_mem Amobj = NULL;
            cl_mem Bmobj = NULL;
            cl_mem Cmobj = NULL;
            cl_program program = NULL;
            cl_kernel kernel = NULL;
            cl_uint ret_num_devices;
            cl_uint ret_num_platforms;
            cl_int ret;
     
            clock_t time_i, time_f;
     
            time_i = clock();
     
            int i;
            int j;
            int *A;
            int *B;
     
            A = (int *) malloc( RNGE * sizeof(int) );
            B = (int *) malloc( RNGE * sizeof(int) );
     
            FILE *fp;
            const char fileName[] = "chk_mod.cl";
            size_t source_size;
            char *source_str;
     
            fp = fopen(fileName, "r");
            if ( ! fp ) {
                    fprintf(stderr, "Failed to load kernel.\n");
                    exit(1);
            }
            source_str = (char *) malloc(MAX_SOURCE_SIZE);
            source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
            fclose(fp);
     
            for ( i=0; i < RNGE; i++ ) {
                            A[ i ] = i;
                            B[ i ] = A[i];
            }
     
            ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot get platform id.\n" );
                    return 1;
            }
     
     
            ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot get device id.\n" );
                    return 1;
            }
     
     
            context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot create context\n" );
                    return 1;
            }
     
            command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot create command queue.\n" );
                    return 1;
            }
     
            Amobj = clCreateBuffer(context, CL_MEM_READ_ONLY, RNGE*sizeof(int), NULL, &ret);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot create buffer A\n" );
                    return 1;
            }
     
            Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, RNGE*sizeof(int), NULL, &ret);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot create buffer B\n" );
                    return 1;
            }
     
     
            ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, RNGE*sizeof(int), A, 0, NULL, NULL);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot enqueue in write buffer.\n" );
                    return 1;
            }
     
            ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot enqueue in write buffer.\n" );
                    return 1;
            }
     
     
            program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot create program with source\n" );
                    return 1;
            }
     
            ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
     
            kernel = clCreateKernel(program, "data_parallel", &ret);
            if( ret != CL_SUCCESS ) {
                    printf( "Error : Cannot create kernel\n" );
                    return 1;
            }
     
     ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &Amobj);
            ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &Bmobj);
     
            size_t global_item_size = RNGE;
            size_t local_item_size = 1;
     
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, 0, &global_item_size, &local_item_size, 0, 0, 0 );
     
            ret = clEnqueueReadBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
     
            printf("Result: ");
            for ( i=2; i < RNGE; i++ ) {
                    if ( B[i] ) {
                            printf( "%d ", B[i] );
                    }
            }
            printf("\n");
     
            ret = clFlush(command_queue);
            ret = clFinish(command_queue);
            ret = clReleaseKernel(kernel);
            ret = clReleaseProgram(program);
            ret = clReleaseMemObject(Amobj);
            ret = clReleaseMemObject(Bmobj);
            ret = clReleaseCommandQueue(command_queue);
            ret = clReleaseContext(context);
     
            free(source_str);
     
            free(A);
            free(B);
     
            time_f = clock();
            printf("Time elapsed = %7.3fs\n", (float) (time_f - time_i)/CLOCKS_PER_SEC);
     
            return 0;
    }

    I am not able to find where the real problem lies. CLInfo returns complete information about all 3 devices (2 GPUs + 1 CPU), yet running any code on the GPU results in a segmentation fault.

    Somebody please review the problem.

Similar Threads

  1. Replies: 3
    Last Post: 04-05-2013, 02:17 AM
  2. Segmentation fault (core dumped )
    By jainx224 in forum OpenCL
    Replies: 1
    Last Post: 03-18-2013, 08:32 AM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •