Commit 4d730577728b88351da6d05916c629d351a04d74

Con Kolivas 2011-06-30T10:36:19

Save each built kernel binary under a unique filename derived from the kernel's build parameters. On the next kernel instantiation, try to load the cached binary whose filename matches instead of rebuilding from source. This speeds up start-up dramatically and keeps a separate kernel binary for each kernel configuration.

diff --git a/ocl.c b/ocl.c
index 1c496cc..574af6e 100644
--- a/ocl.c
+++ b/ocl.c
@@ -10,6 +10,8 @@
 #include <time.h>
 #include <sys/time.h>
 #include <pthread.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
 #include "findnonce.h"
 #include "ocl.h"
@@ -307,40 +309,117 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 	if (clState->max_work_size > 512)
 		clState->max_work_size = 512;
 
-	/////////////////////////////////////////////////////////////////
-	// Load CL file, build CL program object, create CL kernel object
-	/////////////////////////////////////////////////////////////////
-
-	/* Load a different kernel depending on whether it supports
-	 * cl_amd_media_ops or not */
-	char filename[10];
-
-	if (clState->hasBitAlign)
-		strcpy(filename, "phatk.cl");
+	/* For some reason 2 vectors is still better even if the card says
+	 * otherwise */
+	if (clState->preferred_vwidth > 1)
+		clState->preferred_vwidth = 2;
+	if (opt_vectors)
+		clState->preferred_vwidth = opt_vectors;
+	if (opt_worksize && opt_worksize <= clState->max_work_size)
+		clState->work_size = opt_worksize;
 	else
-		strcpy(filename, "poclbm.cl");
+		clState->work_size = clState->max_work_size / clState->preferred_vwidth;
 
+	/* Create binary filename based on parameters passed to opencl
+	 * compiler to ensure we only load a binary that matches what would
+	 * have otherwise been created. The filename is:
+	 * kernelname [+ bitalign] + v + vectors + w + work_size + long + sizeof(long) + .bin
+	 */
+	char binaryfilename[255];
+	char numbuf[10];
+	char filename[10];
+	FILE *binaryfile;
+	size_t *binary_sizes;
+	char **binaries;
+	size_t nDevices = 1;
 	int pl;
-	char *source, *rawsource = file_contents(filename, &pl);
+	char *source, *rawsource;
 	size_t sourceSize[] = {(size_t)pl};
+
 	source = malloc(pl);
-retry:
 	if (!source) {
 		applog(LOG_ERR, "Unable to malloc source");
 		return NULL;
 	}
-	memcpy(source, rawsource, pl);
 
-	/* For some reason 2 vectors is still better even if the card says
-	 * otherwise */
-	if (clState->preferred_vwidth > 1)
-		clState->preferred_vwidth = 2;
-	if (opt_vectors)
-		clState->preferred_vwidth = opt_vectors;
-	if (opt_worksize && opt_worksize <= clState->max_work_size)
-		clState->work_size = opt_worksize;
+	if (clState->hasBitAlign)
+		strcpy(filename, "phatk.cl");
 	else
-		clState->work_size = clState->max_work_size / clState->preferred_vwidth;
+		strcpy(filename, "poclbm.cl");
+	rawsource = file_contents(filename, &pl);
+
+	binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices);
+	if (unlikely(!binary_sizes)) {
+		applog(LOG_ERR, "Unable to malloc binary_sizes");
+		return NULL;
+	}
+	binaries = (char **)malloc(sizeof(char *)*nDevices);
+	if (unlikely(!binaries)) {
+		applog(LOG_ERR, "Unable to malloc binaries");
+		return NULL;
+	}
+
+	if (clState->hasBitAlign) {
+		strcpy(binaryfilename, "phatk");
+		strcat(binaryfilename, "bitalign");
+	} else
+		strcpy(binaryfilename, "poclbm");
+	strcat(binaryfilename, "v");
+	sprintf(numbuf, "%d", clState->preferred_vwidth);
+	strcat(binaryfilename, numbuf);
+	strcat(binaryfilename, "w");
+	sprintf(numbuf, "%d", (int)clState->work_size);
+	strcat(binaryfilename, numbuf);
+	strcat(binaryfilename, "long");
+	sprintf(numbuf, "%d", (int)sizeof(long));
+	strcat(binaryfilename, numbuf);
+	strcat(binaryfilename, ".bin");
+
+	binaryfile = fopen(binaryfilename, "r");
+	if (!binaryfile) {
+		if (opt_debug)
+			applog(LOG_DEBUG, "No binary found, generating from source");
+	} else {
+		struct stat binary_stat;
+
+		if (unlikely(stat(binaryfilename, &binary_stat))) {
+			if (opt_debug)
+				applog(LOG_DEBUG, "Unable to stat binary, generating from source");
+			fclose(binaryfile);
+			goto build;
+		}
+		binary_sizes[gpu] = binary_stat.st_size;
+		binaries[gpu] = (char *)malloc(binary_sizes[gpu]);
+		if (unlikely(!binaries[gpu])) {
+			applog(LOG_ERR, "Unable to malloc binaries");
+			fclose(binaryfile);
+			return NULL;
+		}
+
+		if (fread(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu]) {
+			applog(LOG_ERR, "Unable to fread binaries[gpu]");
+			fclose(binaryfile);
+			return NULL;
+		}
+		fclose(binaryfile);
+
+		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL);
+		if (status != CL_SUCCESS)
+		{
+			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
+			return NULL;
+		}
+		if (opt_debug)
+			applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);
+		goto built;
+	}
+
+	/////////////////////////////////////////////////////////////////
+	// Load CL file, build CL program object, create CL kernel object
+	/////////////////////////////////////////////////////////////////
+
+build:
+	memcpy(source, rawsource, pl);
 
 	/* Patch the source file with the preferred_vwidth */
 	if (clState->preferred_vwidth > 1) {
@@ -411,22 +490,24 @@ retry:
 
 	/* Patch the kernel if the hardware supports BFI_INT */
 	if (patchbfi) {
-		size_t nDevices;
-		size_t * binary_sizes;
-		char ** binaries;
-		int err;
-
-		/* figure out number of devices and the sizes of the binary for each device. */
-		err = clGetProgramInfo( clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL );
-		binary_sizes = (size_t *)malloc( sizeof(size_t)*nDevices );
-		err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL );
+		/* figure out the size of the binary for each device. */
+		status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL );
+		if (unlikely(status != CL_SUCCESS))
+		{
+			applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)");
+			return NULL;
+		}
 
 		/* copy over all of the generated binaries. */
-		binaries = (char **)malloc( sizeof(char *)*nDevices );
 		if (opt_debug)
 			applog(LOG_DEBUG, "binary size %d : %d", gpu, binary_sizes[gpu]);
 		binaries[gpu] = (char *)malloc( sizeof(char)*binary_sizes[gpu] );
-		err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
+		status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
+		if (unlikely(status != CL_SUCCESS))
+		{
+			applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)");
+			return NULL;
+		}
 
 		unsigned remaining = binary_sizes[gpu];
 		char *w = binaries[gpu];
@@ -437,7 +518,7 @@ retry:
 		* back and find the 2nd incidence of \x7ELF (rewind by one
 		* from ELF) and then patch the opcocdes */
 		if (!advance(&w, &remaining, ".text"))
-			{patchbfi = 0; goto retry;}
+			{patchbfi = 0; goto build;}
 		w++; remaining--;
 		if (!advance(&w, &remaining, ".text")) {
 			/* 32 bit builds only one ELF */
@@ -447,7 +528,7 @@ retry:
 		memcpy(&length, w + 289, 4);
 		w = binaries[gpu]; remaining = binary_sizes[gpu];
 		if (!advance(&w, &remaining, "ELF"))
-			{patchbfi = 0; goto retry;}
+			{patchbfi = 0; goto build;}
 		w++; remaining--;
 		if (!advance(&w, &remaining, "ELF")) {
 			/* 32 bit builds only one ELF */
@@ -478,6 +559,23 @@ retry:
 	free(source);
 	free(rawsource);
 
+	/* Save the binary to be loaded next time */
+	binaryfile = fopen(binaryfilename, "w");
+	if (!binaryfile) {
+		/* Not a fatal problem, just means we build it again next time */
+		if (opt_debug)
+			applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
+	} else {
+		if (unlikely(fwrite(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu])) {
+			applog(LOG_ERR, "Unable to fwrite to binaryfile");
+			return NULL;
+		}
+		fclose(binaryfile);
+	}
+built:
+	free(binaries);
+	free(binary_sizes);
+
 	applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d",
 	       filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size);