Hello again, I'm using OpenCL to find the nearest neighbour between two sets of 3D points.

This time I'm using kd-tree for the model.

First I build the kd-tree and then I pass it to the GPU.

I'm representing the tree as an implicit data structure (an array) so I don't need to use pointers (left and right children) during the search on the kd-tree.

I'm using these properties to represent the tree as an array:

root: 0

left: index*2+1

right: index*2+2

http://i.imgur.com/lQQ1Q.png

The main problem is that kd-tree nearest neighbour is a recursive algorithm, so I've converted it to a non-recursive form based on a stack.

Here my non-optimized kernel:

Code :

float distanza(float4 a, float4 b) { float dx, dy, dz; dx = a.x-b.x; dx *= dx; dy = a.y-b.y; dy *= dy; dz = a.z-b.z; dz *= dz; return dx+dy+dz; } bool isNull(float4 p) { if(p.w==-1) return true; return false; } int findLeaf(__global float4 *model, float4 qPoint, int model_size, int cur) { int best_id = -1; int asse; while(cur <= model_size) { asse = model[cur].w; if(qPoint[asse] < model[cur][asse]) { if(cur*2+1>=model_size || isNull(model[cur*2+1])) { if(cur*2+2>=model_size || isNull(model[cur*2+2])) { best_id = cur; break; } else { cur = cur*2+2; } } else { cur = cur*2+1; } } else { if(cur*2+2>=model_size || isNull(model[cur*2+2])) { if(cur*2+1>=model_size || isNull(model[cur*2+1])) { best_id = cur; break; } else { cur = cur*2+1; } } else { cur = cur*2+2; } } } return best_id; } __kernel void nearest_neighbour(__global float4 *model, __global float4 *dataset, __global int *nearest, const int model_size) { int g_dataset_id = get_global_id(0); float4 qPoint = dataset[g_dataset_id]; int stack[7]; // 7 is enough for the number of points in my model int top = 0; stack[top] = -1; int node = findLeaf(model, qPoint, model_size, 0); int nn = node; int lastChild, asse, otherSide; float bestDist = distanza(qPoint, model[node]); while(node != 0) { lastChild = node; node = (node - 1) / 2; if(stack[top] == node) { --top; } else { float parentDist = distanza(qPoint, model[node]); if(parentDist < bestDist) { bestDist = parentDist; nn = node; } asse = model[node].w; float testDist = model[node][asse] - qPoint[asse]; testDist = testDist * testDist; if(testDist < bestDist) { if (node*2+1 == lastChild) { otherSide = node*2+2; } else { otherSide = node*2+1; } if(otherSide < model_size && !isNull(model[otherSide])) { int testNode = findLeaf(model, qPoint, model_size, otherSide); testDist = distanza(qPoint, model[testNode]); if(testDist < bestDist) { bestDist = testDist; nn = testNode; } ++top; stack[top] = node; node = testNode; } } } } nearest[g_dataset_id] = nn; }

The code returns the correct nearest neighbour (except for a few points, but this is not a problem right now). The real problem is that it is slower on the GPU than on the CPU.

Results (model 100.000 points and 100.000 query points):

http://i.imgur.com/dEUcA.png

Also, it crashes on an ATI 5850.

What can I do to make it faster? I was thinking about storing the model in local memory, but I don't know if 100.000 float4 values will fit in it, and I'm also not sure it would improve performance.

Any suggestion?

Thanks

Stefano