I have two ATI Radeon 5970 graphics cards and an Intel Core i7 processor in a system running 64-bit CentOS 5.3. I have installed the ATI Catalyst 12.2 (64-bit) drivers as well as AMD APP SDK 2.4. The output of the SDK's CLInfo sample program is:

Code :
Number of platforms:				 1
  Platform Profile:				 FULL_PROFILE
  Platform Version:				 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Platform Name:					 AMD Accelerated Parallel Processing
  Platform Vendor:				 Advanced Micro Devices, Inc.
  Platform Extensions:			 cl_khr_icd cl_amd_event_callback cl_amd_offline_devices
 
 
  Platform Name:					 AMD Accelerated Parallel Processing
Number of devices:				 3
  Device Type:					 CL_DEVICE_TYPE_GPU
  Device ID:					 4098
  Max compute units:				 20
  Max work items dimensions:			 3
    Max work items[0]:				 256
    Max work items[1]:				 256
    Max work items[2]:				 256
  Max work group size:				 256
  Preferred vector width char:			 16
  Preferred vector width short:			 8
  Preferred vector width int:			 4
  Preferred vector width long:			 2
  Preferred vector width float:			 4
  Preferred vector width double:		 0
  Max clock frequency:				 725Mhz
  Address bits:					 32
  Max memory allocation:			 134217728
  Image support:				 Yes
  Max number of images read arguments:	 128
  Max number of images write arguments:	 8
  Max image 2D width:			 8192
  Max image 2D height:			 8192
  Max image 3D width:			 2048
  Max image 3D height:	 2048
  Max image 3D depth:			 2048
  Max samplers within kernel:		 16
  Max size of kernel argument:			 1024
  Alignment (bits) of base address:		 32768
  Minimum alignment (bytes) for any datatype:	 128
  Single precision floating point capability
    Denorms:					 No
    Quiet NaNs:					 Yes
    Round to nearest even:			 Yes
    Round to zero:				 Yes
    Round to +ve and infinity:			 Yes
    IEEE754-2008 fused multiply-add:		 Yes
  Cache type:					 None
  Cache line size:				 0
  Cache size:					 0
  Global memory size:				 536870912
  Constant buffer size:				 65536
  Max number of constant args:			 8
  Local memory type:				 Scratchpad
  Local memory size:				 32768
  Profiling timer resolution:			 1
  Device endianess:				 Little
  Available:					 Yes
  Compiler available:				 Yes
  Execution capabilities:				 
    Execute OpenCL kernels:			 Yes
    Execute native function:			 No
  Queue properties:				 
    Out-of-Order:				 No
    Profiling :					 Yes
  Platform ID:					 0x2b1e1f8c3800
  Name:						 Cypress
  Vendor:					 Advanced Micro Devices, Inc.
  Driver version:				 CAL 1.4.1703
  Profile:					 FULL_PROFILE
  Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Extensions:					 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt 
  Device Type:					 CL_DEVICE_TYPE_GPU
  Device ID:					 4098
  Max compute units:				 20
  Max work items dimensions:			 3
    Max work items[0]:				 256
    Max work items[1]:				 256
    Max work items[2]:				 256
  Max work group size:				 256
  Preferred vector width char:			 16
  Preferred vector width short:			 8
  Preferred vector width int:			 4
  Preferred vector width long:			 2
  Preferred vector width float:			 4
  Preferred vector width double:		 0
  Max clock frequency:				 725Mhz
  Address bits:					 32
  Max memory allocation:			 134217728
  Image support:				 Yes
  Max number of images read arguments:	 128
  Max number of images write arguments:	 8
  Max image 2D width:			 8192
  Max image 2D height:			 8192
  Max image 3D width:			 2048
  Max image 3D height:	 2048
  Max image 3D depth:			 2048
  Max samplers within kernel:		 16
  Max size of kernel argument:			 1024
  Alignment (bits) of base address:		 32768
  Minimum alignment (bytes) for any datatype:	 128
  Single precision floating point capability
    Denorms:					 No
    Quiet NaNs:					 Yes
    Round to nearest even:			 Yes
    Round to zero:				 Yes
    Round to +ve and infinity:			 Yes
    IEEE754-2008 fused multiply-add:		 Yes
  Cache type:					 None
  Cache line size:				 0
  Cache size:					 0
  Global memory size:				 536870912
  Constant buffer size:				 65536
  Max number of constant args:			 8
  Local memory type:				 Scratchpad
  Local memory size:				 32768
  Profiling timer resolution:			 1
  Device endianess:				 Little
  Available:					 Yes
  Compiler available:				 Yes
  Execution capabilities:				 
    Execute OpenCL kernels:			 Yes
    Execute native function:			 No
  Queue properties:				 
    Out-of-Order:				 No
    Profiling :					 Yes
  Platform ID:					 0x2b1e1f8c3800
  Name:						 Cypress
  Vendor:					 Advanced Micro Devices, Inc.
  Driver version:				 CAL 1.4.1703
  Profile:					 FULL_PROFILE
  Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Extensions:					 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt 
  Device Type:					 CL_DEVICE_TYPE_CPU
  Device ID:					 4098
  Max compute units:				 12
  Max work items dimensions:			 3
    Max work items[0]:				 1024
    Max work items[1]:				 1024
    Max work items[2]:				 1024
  Max work group size:				 1024
  Preferred vector width char:			 16
  Preferred vector width short:			 8
  Preferred vector width int:			 4
  Preferred vector width long:			 2
  Preferred vector width float:			 4
  Preferred vector width double:		 0
  Max clock frequency:				 1596Mhz
  Address bits:					 64
  Max memory allocation:			 3145740288
  Image support:				 Yes
  Max number of images read arguments:	 128
  Max number of images write arguments:	 8
  Max image 2D width:			 8192
  Max image 2D height:			 8192
  Max image 3D width:			 2048
  Max image 3D height:	 2048
  Max image 3D depth:			 2048
  Max samplers within kernel:		 16
  Max size of kernel argument:			 4096
  Alignment (bits) of base address:		 1024
  Minimum alignment (bytes) for any datatype:	 128
  Single precision floating point capability
    Denorms:					 Yes
    Quiet NaNs:					 Yes
    Round to nearest even:			 Yes
    Round to zero:				 Yes
    Round to +ve and infinity:			 Yes
    IEEE754-2008 fused multiply-add:		 No
  Cache type:					 Read/Write
  Cache line size:				 0
  Cache size:					 0
  Global memory size:				 12582961152
  Constant buffer size:				 65536
  Max number of constant args:			 8
  Local memory type:				 Global
  Local memory size:				 32768
  Profiling timer resolution:			 999848
  Device endianess:				 Little
  Available:					 Yes
  Compiler available:				 Yes
  Execution capabilities:				 
    Execute OpenCL kernels:			 Yes
    Execute native function:			 Yes
  Queue properties:				 
    Out-of-Order:				 No
    Profiling :					 Yes
  Platform ID:					 0x2b1e1f8c3800
  Name:						 Intel(R) Core(TM) i7 CPU       X 980  @ 3.33GHz
  Vendor:					 GenuineIntel
  Driver version:				 2.0
  Profile:					 FULL_PROFILE
  Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Extensions:					 cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_device_fission cl_amd_device_attribute_query cl_amd_vec3 cl_amd_media_ops cl_amd_popcnt cl_amd_printf 
 
 
Passed!

My system gives a segmentation fault when I create a command queue on a GPU device, but it works fine on a CPU device. This happens for all programs (including the SDK's sample programs). For example, the following program (chk_mod.c) works fine with the CPU but gives a segmentation fault with the GPU:

Code :
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <time.h>
 
#define MAX_SOURCE_SIZE (0x10000000)
#define RNGE (100000)
 
int main()
{
	cl_platform_id platform_id = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem Amobj = NULL;
	cl_mem Bmobj = NULL;
	cl_mem Cmobj = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int ret;
 
	clock_t time_i, time_f;
 
	time_i = clock();
 
	int i;
	int j;
	int *A;
	int *B;
 
	A = (int *) malloc( RNGE * sizeof(int) );
	B = (int *) malloc( RNGE * sizeof(int) );
 
	FILE *fp;
	const char fileName[] = "chk_mod.cl";
	size_t source_size;
	char *source_str;
 
	fp = fopen(fileName, "r");
	if ( ! fp ) {
		fprintf(stderr, "Failed to load kernel.\n");
		exit(1);
	}
	source_str = (char *) malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);
 
	for ( i=0; i < RNGE; i++ ) {
			A[ i ] = i;
			B[ i ] = A[i];
	}
 
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
 
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
 
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
	Amobj = clCreateBuffer(context, CL_MEM_READ_ONLY, RNGE*sizeof(int), NULL, &ret);
	Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, RNGE*sizeof(int), NULL, &ret);
 
	ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, RNGE*sizeof(int), A, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
 
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
	kernel = clCreateKernel(program, "data_parallel", &ret);
 
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &Amobj);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &Bmobj);
 
	size_t global_item_size = RNGE;
	size_t local_item_size = 1;
 
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, 0, &global_item_size, &local_item_size, 0, 0, 0 );
 
	ret = clEnqueueReadBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
 
	printf("Result: ");
	for ( i=2; i < RNGE; i++ ) {
		if ( B[i] ) {
			printf( "%d ", B[i] );
		}
	}
	printf("\n");
 
	ret = clFlush(command_queue);
	ret = clFinish(command_queue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(Amobj);
	ret = clReleaseMemObject(Bmobj);
	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);
 
	free(source_str);
 
	free(A);
	free(B);
 
	time_f = clock();
	printf("Time elapsed = %7.3fs\n", (float) (time_f - time_i)/CLOCKS_PER_SEC);
 
	return 0;
}

The kernel file (chk_mod.cl) is:
Code :
/*
 * Trial-division kernel: work-item i clears B[i] to 0 when i has a divisor t
 * with 2 <= t < i, and leaves B[i] untouched otherwise.
 *
 * A is accepted for interface compatibility with the host program but is not
 * read here.
 */
__kernel void data_parallel( __global int *A, __global int *B )
{
	int t;
	int i = get_global_id(0);

	/* If i has any divisor in [2, i), it has one with t*t <= i, so the scan
	 * can stop there. The bound is written as t <= i / t so that no product
	 * is formed that could overflow int. */
	for ( t = 2; t <= i / t; t++ ) {
		if ( i % t == 0 ) {
			B[ i ] = 0;
			break;	/* one divisor is enough; further hits would store the same 0 */
		}
	}
}

Can someone please help me with the issue as soon as possible?