Hi!

I'm having some trouble with this code:

Code :
#include <oclUtils.h>
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <CL/cl.h>
#include <conio.h>
 
#define N 10
 
void write(int A[N][N])
{
	int s = 0;
	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
		{
			if (s > N-1)
			{
				printf("\n");
				s = 0;
			}
			printf(" %d", A[i][j]);
			s++;
		}
	}
	printf("\n \n");
}
 
const char* OpenCLSource1[] = {
"__kernel void matrixAdd(__global int* c, __global int* a, __global int* b)\n",
"{\n",
"       unsigned int x = get_global_id(0);\n",
"       unsigned int y = get_global_id(1);\n",
"       \n",
"       c[x] = a[x] + b[x];\n",
"}\n"
};
 
int main(int argc, const char** argv) 
{
	int host_vector1[N][N], host_vector2[N][N];
	int host_vector[N][N];
 
	for(int i = 0; i < N; i++)
    {
		for(int j = 0; j < N; j++)
		{
			host_vector1[i][j] = j;
			host_vector2[i][j] = j;
		}
    }
 
	write(host_vector1);
	write(host_vector2);
 
    cl_int error = 0;
 
    cl_uint numPlatforms;
    cl_platform_id* clSelectedPlatformID = NULL;
    clGetPlatformIDs(0, NULL, &numPlatforms);
    clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms);
    error = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL);
    if(error != CL_SUCCESS) 
        return 0;
 
    cl_uint ciDeviceCount;
    cl_device_id* clDevices =  NULL;
    error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
 
    clDevices = (cl_device_id*) malloc(sizeof(cl_device_id) * ciDeviceCount);
    error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, ciDeviceCount, clDevices, &ciDeviceCount);
 
	cl_context GPU_context = clCreateContext(0, 1, clDevices, NULL, NULL, &error);
 
    cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPU_context, clDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
	cl_event event1;
 
	cl_mem GPU_vector1 = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * N*N, host_vector1, NULL);
    cl_mem GPU_vector2 = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * N*N, host_vector2, NULL);
    cl_mem GPU_result_vector = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(int) * N*N, NULL, NULL);
 
	cl_program OpenCLProgram = clCreateProgramWithSource(GPU_context, 7, OpenCLSource1, NULL, &error);
 
    error = clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
 
	cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "matrixAdd", NULL); 
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&GPU_result_vector);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPU_vector1);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPU_vector2);
 
	size_t WorkSize[2] = {N, N};
    cl_int temp = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 2, NULL, WorkSize, NULL, 0, NULL, &event1);
 
    clEnqueueReadBuffer(GPUCommandQueue, GPU_result_vector, CL_TRUE, 0, N*N * sizeof(int), host_vector, 0, NULL, NULL);
 
	write(host_vector);
 
	_getch();
 
    return 0;
}

It runs now, but if I remove the
Code :
"       \n"
line form the kernel, the program crashes. Also my original goal is a simple matrix addition, but if I add this (instead of the current c[x] = a[x] + b[x]; line) to the kernel
Code :
c[x][y] = a[x][y] + b[x][y];
it just won't work. I get a matrix filled with zeros.

Any thoughts?