Hello again. I'm using OpenCL to find the nearest neighbour between two sets of 3D points.

This time I'm using kd-tree for the model.

First I build the kd-tree and then I pass it to the GPU.

I'm representing the tree as an implicit data structure (an array), so I don't need to use pointers (left and right children) during the search on the kd-tree.

I'm using these properties to represent the tree as an array:

root: 0
left: index*2+1
right: index*2+2


The main problem is that kd-tree nearest neighbour is a recursive algorithm, so I've converted it to a non-recursive form based on a stack.

Here my non-optimized kernel:
Code :
/*
 * Squared Euclidean distance between two 3D points packed into float4s.
 * The .w component carries kd-tree metadata (split axis or the -1
 * "null slot" sentinel), so it must NOT contribute to the distance.
 * Returns the SQUARED distance (no sqrt): callers only compare values,
 * and squared distances preserve ordering.
 */
float distanza(float4 a, float4 b)
{
	float4 d = a - b;	/* one vector subtract instead of three scalar ones */
	d.w = 0.0f;		/* exclude the metadata component */

	return dot(d, d);	/* dx*dx + dy*dy + dz*dz as a single built-in */
}
 
/*
 * A slot of the implicit tree array is "null" (unused padding) when its
 * .w component holds the sentinel value -1; real nodes store the split
 * axis (0, 1 or 2) there instead.
 */
bool isNull(float4 p)
{
	return p.w == -1.0f;
}
 
/*
 * Descend from node `cur` of the implicit kd-tree (left = 2i+1,
 * right = 2i+2), following qPoint's side of each split, until a node
 * with no usable children is reached.
 *
 * model      : kd-tree as an implicit array; .xyz is the point,
 *              .w is the split axis (0/1/2) or -1 for an empty slot.
 * qPoint     : query point.
 * model_size : number of slots in `model`.
 * cur        : index of the subtree root to start the descent from.
 *
 * Returns the index of the leaf reached, or -1 when `cur` is already
 * out of range.
 */
int findLeaf(__global float4 *model, float4 qPoint, int model_size, int cur)
{
	int best_id = -1;

	/* Was `cur <= model_size`, which allowed an out-of-bounds read of
	 * model[model_size] on entry. */
	while(cur < model_size)
	{
		float4 node = model[cur];
		int asse = (int)node.w;		/* explicit float->int: split axis */

		/* Standard OpenCL C does not allow subscripting a vector with
		 * a runtime variable (v[asse]); some compilers (e.g. ATI)
		 * reject or miscompile it, so select the component explicitly. */
		float q = (asse == 0) ? qPoint.x : (asse == 1) ? qPoint.y : qPoint.z;
		float m = (asse == 0) ? node.x   : (asse == 1) ? node.y   : node.z;

		int left  = cur*2 + 1;
		int right = cur*2 + 2;
		bool leftDead  = (left  >= model_size) || isNull(model[left]);
		bool rightDead = (right >= model_size) || isNull(model[right]);

		/* Preferred child: the side of the split qPoint falls on. */
		bool goLeft = (q < m);
		int  first      = goLeft ? left      : right;
		int  second     = goLeft ? right     : left;
		bool firstDead  = goLeft ? leftDead  : rightDead;
		bool secondDead = goLeft ? rightDead : leftDead;

		if(firstDead)
		{
			if(secondDead)
			{
				best_id = cur;	/* no usable child: this is the leaf */
				break;
			}
			cur = second;		/* fall back to the other side */
		}
		else
		{
			cur = first;
		}
	}

	return best_id;
}
 
/*
 * One work-item per query point: find the index of the nearest
 * neighbour of dataset[gid] inside the kd-tree `model`, and write it
 * to nearest[gid].
 *
 * Iterative backtracking replaces the textbook recursion: descend to a
 * leaf with findLeaf(), then climb towards the root; whenever the
 * splitting plane of an ancestor is closer than the current best
 * distance, re-descend into the ancestor's other subtree.  `stack`
 * remembers ancestors whose other side has already been searched so
 * they are not revisited while climbing back.
 */
__kernel void 
nearest_neighbour(__global float4 *model,
	__global float4 *dataset,
	__global int *nearest,
	const int model_size)
{
		int g_dataset_id = get_global_id(0);

		float4 qPoint = dataset[g_dataset_id];

		/* Backtracking stack: at most one entry per tree level (entries
		 * are ancestors on the current root path).  32 covers any model
		 * with fewer than 2^32 points.  The original size of 7 overflows
		 * for a 100,000-point model (depth ~17) and corrupts private
		 * memory -- a likely cause of the device crash. */
		int stack[32];
		int top = 0;

		stack[top] = -1;	/* sentinel: never equals a real node index */

		/* Start from the leaf the query point descends to. */
		int node = findLeaf(model, qPoint, model_size, 0);

		int nn = node;		/* best candidate found so far */

		int lastChild, asse, otherSide;

		float bestDist = distanza(qPoint, model[node]);

		/* Climb to the root, testing each ancestor and, when needed,
		 * its other subtree. */
		while(node != 0)
		{
			lastChild = node;

			node = (node - 1) / 2;	/* parent in the implicit layout */

			if(stack[top] == node)
			{
				/* Both sides of this ancestor already searched. */
				--top;
			}
			else
			{
				float parentDist = distanza(qPoint, model[node]);

				if(parentDist < bestDist)
				{
					bestDist = parentDist;
					nn = node;
				}

				asse = (int)model[node].w;	/* split axis: 0/1/2 */

				/* Squared distance from qPoint to the splitting plane.
				 * Variable vector subscripting (v[asse]) is not
				 * standard OpenCL C -- select the component explicitly. */
				float mc = (asse == 0) ? model[node].x
				         : (asse == 1) ? model[node].y : model[node].z;
				float qc = (asse == 0) ? qPoint.x
				         : (asse == 1) ? qPoint.y : qPoint.z;
				float testDist = mc - qc;
				testDist = testDist * testDist;

				if(testDist < bestDist)
				{
					/* The best-distance sphere crosses the plane:
					 * the sibling subtree may hold a closer point. */
					if (node*2+1 == lastChild)
					{
						otherSide = node*2+2;
					}
					else
					{
						otherSide = node*2+1;
					}

					if(otherSide < model_size && !isNull(model[otherSide]))
					{
						int testNode = findLeaf(model, qPoint, model_size, otherSide);

						testDist = distanza(qPoint, model[testNode]);

						if(testDist < bestDist)
						{
							bestDist = testDist;
							nn = testNode;
						}

						/* Mark this ancestor as fully explored, then
						 * resume the climb from the new leaf. */
						++top;
						stack[top] = node;

						node = testNode;
					}
				}
			}
		}

		nearest[g_dataset_id] = nn;
}

The code returns the correct nearest neighbour (except for a few points, but that is not a problem right now). The real problem is that it is slower on the GPU than on the CPU.

Results (model 100.000 points and 100.000 query points):


Also, it crashes on an ATI 5850.

What can I do to make it faster? I was thinking about storing the model in local memory, but I don't know whether I can fit 100,000 float4 values in it, and I'm also not sure it would improve performance.

Any suggestion?

Thanks

Stefano