Hi all,

I've written a basic SHA1 hash brute-forcer for OpenCL. Unfortunately the performance is way below what I was anticipating. Tens of millions of hashes per second should be typical for a half-decent GPU, yet this is taking 14.7 seconds just to burn through 2.6 million.

Any advice would be much appreciated..

Code :
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
 
void nextkey();
 
#define NUM_BLOCKS 10240
#define BLOCK_SIZE 256
#define CHARSET "abcdefghijklmnopqrstuvwxyz1234567890"
 
const char *KernelSource = "\n" \
"#define K0	0x5A827999\n" \
"#define K1	0x6ED9EBA1\n" \
"#define K2	0x8F1BBCDC\n" \
"#define K3	0xCA62C1D6\n" \
"\n" \
"#define H1 0x67452301\n" \
"#define H2 0xEFCDAB89\n" \
"#define H3 0x98BADCFE\n" \
"#define H4 0x10325476\n" \
"#define H5 0xC3D2E1F0\n" \
"\n" \
"#define uchar unsigned char\n" \
"\n" \
"uint rotateLeft(uint x, int n)\n" \
"{\n" \
"		return	(x << n) | (x >> (32-n));\n" \
"}\n" \
"\n" \
"__kernel void sha1(__global char *msg, __global const unsigned int *len, __global char *digest)\n" \
"{\n" \
"		int t, i, j, gid, x;\n" \
"		uint W[80], A[5], temp, number;\n" \
"		char hexChars[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};\n" \
"		gid = get_global_id(0);\n" \
"		int item_pad = gid * 64;\n" \
"		uint ulen = (len[gid]*8) & 0xFFFFFFFF;\n" \
"\n" \
"		for (i=0;i<64-len[gid];i++) {\n" \
"			msg[item_pad+len[gid]+i] = 0;\n" \
"		}\n" \
"\n" \
"		msg[item_pad + len[gid]] = (char) 0x80;\n" \
"\n" \
"	 msg[item_pad + 60] = ulen >> 24;\n" \
"	 msg[item_pad + 61] = ulen >> 16;\n" \
"	 msg[item_pad + 62] = ulen >> 8;\n" \
"	 msg[item_pad + 63] = ulen;\n" \
"\n" \
"		A[0] = H1;\n" \
"		A[1] = H2;\n" \
"		A[2] = H3;\n" \
"		A[3] = H4;\n" \
"		A[4] = H5;\n" \
"\n" \
"		for (t = 0; t < 16; t++)\n" \
"		{\n" \
"				W[t] = ((uchar) msg[item_pad + (t * 4)]);\n" \
"				W[t] = W[t] << 24;\n" \
"				temp = ((uchar) msg[item_pad + (t * 4 + 1)]);\n" \
"				temp = temp << 16;\n" \
"				W[t] |= temp;\n" \
"				temp = ((uchar) msg[item_pad + (t * 4 + 2)]);\n" \
"				temp = temp << 8;\n" \
"				W[t] |= temp;\n" \
"				W[t] |= (uchar) msg[item_pad + (t * 4 + 3)];\n" \
"		}\n" \
"\n" \
"		for(i = 16; i < 80; i++)\n" \
"		{\n" \
"				W[i] = rotateLeft(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);\n" \
"		}\n" \
"\n" \
"		for(i = 0; i < 20; i++)\n" \
"		{\n" \
"				temp = rotateLeft(A[0],5) + ((A[1] & A[2]) | ((~ A[1]) & A[3])) + A[4] + W[i] + K0;\n" \
"				A[4] = A[3];\n" \
"				A[3] = A[2];\n" \
"				A[2] = rotateLeft(A[1], 30);\n" \
"				A[1] = A[0];\n" \
"				A[0] = temp;\n" \
"		}\n" \
"\n" \
"		for(i = 20; i < 40; i++)\n" \
"		{\n" \
"				temp = rotateLeft(A[0], 5) + (A[1] ^ A[2] ^ A[3]) + A[4] + W[i] + K1;\n" \
"				A[4] = A[3];\n" \
"				A[3] = A[2];\n" \
"				A[2] = rotateLeft(A[1], 30);\n" \
"				A[1] = A[0];\n" \
"				A[0] = temp;\n" \
"		}\n" \
"\n" \
"		for(i = 40; i < 60; i++)\n" \
"		{\n" \
"				temp = rotateLeft(A[0], 5) + ((A[1] & A[2]) | (A[1] & A[3]) | (A[2] & A[3])) + A[4] + W[i] + K2;\n" \
"				A[4] = A[3];\n" \
"				A[3] = A[2];\n" \
"				A[2] = rotateLeft(A[1], 30);\n" \
"				A[1] = A[0];\n" \
"				A[0] = temp;\n" \
"		}\n" \
"\n" \
"		for(i = 60; i < 80; i++)\n" \
"		{\n" \
"				temp = rotateLeft(A[0], 5) + (A[1] ^ A[2] ^ A[3])  + A[4] + W[i] + K3;\n" \
"				A[4] = A[3];\n" \
"				A[3] = A[2];\n" \
"				A[2] = rotateLeft(A[1], 30);\n" \
"				A[1] = A[0];\n" \
"				A[0] = temp;\n" \
"		}\n" \
"		A[0] += H1;\n" \
"		A[1] += H2;\n" \
"		A[2] += H3;\n" \
"		A[3] += H4;\n" \
"		A[4] += H5;\n" \
"\n" \
"		for(j = 0; j < 5; j++)\n" \
"		{\n" \
"				number = A[j];\n" \
"				for(i = 0; i < 8; i++)\n" \
"				{\n" \
"						digest[item_pad + (j*8 + 7-i)] = hexChars[number%16];\n" \
"						number /= 16;\n" \
"				}\n" \
"		}\n" \
"\n" \
"		digest[item_pad + 40] = '\\0';\n" \
"}\n" \
"\n";
 
char keybuf[64+1];
 
int main(int argc, char **argv) {
	char c;
	unsigned int i=0, j, x;
	char *textstring = (char *)malloc(64*BLOCK_SIZE);
	char *result = (char *)malloc(64*BLOCK_SIZE);
	time_t tt,tt2,tt3;
 
	memset(&keybuf,0,sizeof(keybuf));
	strncpy(keybuf,CHARSET,1);
 
	if (textstring == NULL || result == NULL) {
		printf("Couldn't allocate memory.\n");
		return 0;
	}
 
	int err;
	unsigned int length[BLOCK_SIZE];
 
	size_t global = BLOCK_SIZE;
	size_t local = BLOCK_SIZE;
 
	cl_device_id device_id;
	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;
 
	cl_mem input;
	cl_mem devlen;
	cl_mem output;
 
		int gpu = 1;
		err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
		if (err != CL_SUCCESS)
		{
				printf("Error: Failed to create a device group!\n");
				return EXIT_FAILURE;
		}
 
		context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
		if (!context)
		{
				printf("Error: Failed to create a compute context!\n");
				return EXIT_FAILURE;
		}
 
		commands = clCreateCommandQueue(context, device_id, 0, &err);
		if (!commands)
		{
				printf("Error: Failed to create a command commands!\n");
				return EXIT_FAILURE;
		}
 
		program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
		if (!program)
		{
				printf("Error: Failed to create compute program! (error %d)\n",err);
				switch (err) {
					case CL_INVALID_CONTEXT: printf("context is not a valid context.\n"); break;
					case CL_INVALID_VALUE: printf("count is zero or if strings or any entry in strings is NULL\n"); break;
					case CL_OUT_OF_HOST_MEMORY: printf("there is a failure to allocate resources required by the OpenCL implementation on the host\n"); break;
				}
				return EXIT_FAILURE;
		}
 
		err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
		if (err != CL_SUCCESS)
		{
				size_t len;
				char buffer[2048];
 
				printf("Error: Failed to build program executable!\n");
				clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
				printf("%s\n", buffer);
				exit(1);
		}
 
		kernel = clCreateKernel(program, "sha1", &err);
		if (!kernel || err != CL_SUCCESS)
		{
				printf("Error: Failed to create compute kernel!\n");
				switch (err) {
					case CL_INVALID_PROGRAM: printf("if program is not a valid program object\n"); break;
					case CL_INVALID_PROGRAM_EXECUTABLE: printf("if there is no successfully built executable for program\n"); break;
					case CL_INVALID_KERNEL_NAME: printf("if kernel_name is not found in program\n"); break;
					case CL_INVALID_KERNEL_DEFINITION: printf("if the function definition for __kernel function given by kernel_name such as the number of arguments, the argument types are not the same for all devices for which the program executable has been built\n"); break;
					case CL_INVALID_VALUE: printf("is kernel_name is NULL\n"); break;
					case CL_OUT_OF_HOST_MEMORY: printf("if there is a failure to allocate resources required by the OpenCL implementation on the host\n"); break;
				}
				exit(1);
		}
 
		input = clCreateBuffer(context,  CL_MEM_READ_ONLY,	64*BLOCK_SIZE, NULL, NULL);
		output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 64*BLOCK_SIZE, NULL, NULL);
		devlen = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*BLOCK_SIZE, NULL, NULL);
 
		if (!input || !output)
		{
				printf("Error: Failed to allocate device memory!\n");
				exit(1);
		}
 
		err = 0;
		err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
		err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &devlen);
		err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
		if (err != CL_SUCCESS)
		{
				printf("Error: Failed to set kernel arguments! %d\n", err);
				switch (err) {
					case CL_INVALID_KERNEL: printf("kernel is not a valid kernel object\n"); break;
					case CL_INVALID_ARG_INDEX: printf("arg_index is not a valid argument index\n"); break;
					case CL_INVALID_ARG_VALUE: printf("arg_value specified is NULL for an argument that is not declared with the __local qualifier or vice-versa\n"); break;
					case CL_INVALID_MEM_OBJECT: printf("an argument declared to be a memory object when the specified arg_value is not a valid memory object\n"); break;
					case CL_INVALID_SAMPLER: printf("an argument declared to be of type sampler_t when the specified arg_value is not a valid sampler object\n"); break;
					case CL_INVALID_ARG_SIZE: printf("arg_size does not match the size of the data type for an argument that is not a memory object or if the argument is a memory object and arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the __local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler)\n"); break;
				}
				exit(1);
		}
 
		for (x = 0; x<NUM_BLOCKS; x++) {
			for (i=0; i<BLOCK_SIZE; i++) {
				strcpy(&textstring[i*64],keybuf);
				length[i] = strlen(&textstring[i*64]);
				nextkey();
			}
 
			err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, 64*BLOCK_SIZE, textstring, 0, NULL, NULL);
			if (err != CL_SUCCESS)
			{
					printf("Error: Failed to write to source array!\n");
					exit(1);
			}
 
			err = clEnqueueWriteBuffer(commands, devlen, CL_TRUE, 0, sizeof(unsigned int)*BLOCK_SIZE, length, 0, NULL, NULL);
			if (err != CL_SUCCESS)
			{
					printf("Error: Failed to write to source array!\n");
					exit(1);
			}
 
			err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			if (err)
			{
					printf("Error: Failed to execute kernel!\n");
					return EXIT_FAILURE;
			}
 
			clFinish(commands);
 
			err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, 64*BLOCK_SIZE, result, 0, NULL, NULL );
			if (err != CL_SUCCESS)
			{
					printf("Error: Failed to read output array! %d\n", err);
					exit(1);
			}
		}
 
		printf("Done, last result was: %s - %s\n",&textstring[(i-1)*64],&result[(i-1)*64]);
		printf("Computed %d hashes.\n",NUM_BLOCKS*BLOCK_SIZE);
 
		clReleaseMemObject(input);
		clReleaseMemObject(output);
		clReleaseProgram(program);
		clReleaseKernel(kernel);
		clReleaseCommandQueue(commands);
		clReleaseContext(context);
 
		return 0;
	}
 
	void nextkey() {
		int i,j,k,found;
 
		if (keybuf[strlen(keybuf)-1] == CHARSET[strlen(CHARSET)-1])
		{
			found=0;
			for (i=strlen(keybuf)-1;i>=0;i--)
			{
				if (keybuf[i] != CHARSET[strlen(CHARSET)-1])
				{
					for (j=0;j<strlen(CHARSET);j++)
					{
						if (keybuf[i] == CHARSET[j])
						{
							keybuf[i] = CHARSET[j+1];
							for (k=i+1;k<strlen(keybuf);k++)
							{
								keybuf[k] = CHARSET[0];
							}
							found=1;
							break;
						}
					}
					i=-1;
				}
			}
			if (!found)
			{
				for (i=0;i<strlen(keybuf);i++)
				{
					keybuf[i] = CHARSET[0];
				}
				strncat(keybuf,CHARSET,1);
			}
		}
		else
		{
			for (i=0;i<strlen(CHARSET);i++)
			{
				if (keybuf[strlen(keybuf)-1] == CHARSET[i]) break;
			}
			keybuf[strlen(keybuf)-1] = CHARSET[i+1];
		}
	}