Commit c548dea84850da67c2ce37292d68747162285ed2

Con Kolivas 2011-06-20T16:53:13

First BFI_INT patch changes.

diff --git a/ocl.c b/ocl.c
index 2a4a37e..bf2f659 100644
--- a/ocl.c
+++ b/ocl.c
@@ -1,3 +1,4 @@
+#define _GNU_SOURCE
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
@@ -93,6 +94,63 @@ int clDevicesNum() {
 	return numDevices;
+void advance(char **area, unsigned *remaining, const char *marker) /* Move *area forward to the first occurrence of marker within the next *remaining bytes and reduce *remaining by the distance skipped; terminates the process if the marker is not found. */
+	char *find = memmem(*area, *remaining, marker, strlen(marker)); /* length-bounded byte search, so embedded NULs in the area are fine; memmem is a GNU extension, presumably why this commit adds _GNU_SOURCE */
+	if (!find)
+		fprintf(stderr, "Marker \"%s\" not found\n", marker), exit(1); /* comma operator keeps the report and the exit inside the single unbraced if-statement */
+	*remaining -= find - *area; /* subtract the bytes skipped to reach the match */
+	*area = find; /* NOTE(review): left pointing AT the marker, not past it, so calling again with the same marker matches this same position unless the caller steps forward first */
+#define OP3_INST_BFE_UINT	4UL /* instruction codes compared against the 5-bit field at bits 45..49 of each 64-bit word in patch_opcodes */
+#define OP3_INST_BFE_INT	5UL
+#define OP3_INST_BFI_INT	6UL /* NOTE(review): this UL constant is shifted left by 45 below; where unsigned long is 32 bits that shift is undefined — consider ULL or a uint64_t cast */
+void patch_opcodes(char *w, unsigned remaining) /* Walk the buffer at w as consecutive 64-bit words (remaining = bytes left) and, for every word whose checked modifier bits are all zero and whose instruction field equals BYTE_ALIGN_INT, overwrite that field in place with BFI_INT; optionally prints counters when opt_debug is set. */
+	uint64_t *opcode = (uint64_t *)w; /* NOTE(review): assumes w is suitably aligned for uint64_t access; the caller derives w from a byte-wise marker search — confirm */
+	int patched = 0;
+	int count_bfe_int = 0;
+	int count_bfe_uint = 0;
+	int count_byte_align = 0;
+	while (42) /* any non-zero constant: the loop ends only through the break below */
+	{
+		int clamp = (*opcode >> (32 + 31)) & 0x1; /* bit 63 */
+		int dest_rel = (*opcode >> (32 + 28)) & 0x1; /* bit 60 */
+		int alu_inst = (*opcode >> (32 + 13)) & 0x1f; /* bits 45..49: the instruction code */
+		int s2_neg = (*opcode >> (32 + 12)) & 0x1; /* bit 44 */
+		int s2_rel = (*opcode >> (32 + 9)) & 0x1; /* bit 41 */
+		int pred_sel = (*opcode >> 29) & 0x3; /* bits 29..30 */
+		if (!clamp && !dest_rel && !s2_neg && !s2_rel && !pred_sel) { /* only words with all five of these fields zero are counted or patched; this is a bit-pattern match, the word is not otherwise validated */
+		if (alu_inst == OP3_INST_BFE_INT) {
+			count_bfe_int++;
+		} else if (alu_inst == OP3_INST_BFE_UINT) {
+			count_bfe_uint++;
+		} else if (alu_inst == OP3_INST_BYTE_ALIGN_INT) { /* NOTE(review): OP3_INST_BYTE_ALIGN_INT is not defined in the lines visible here — verify its definition exists */
+			count_byte_align++;
+			// patch this instruction to BFI_INT
+			*opcode &= 0xfffc1fffffffffffUL; /* clear the 5-bit instruction field (bits 45..49), keep everything else */
+			*opcode |= OP3_INST_BFI_INT << (32 + 13); /* write the BFI_INT code into the cleared field */
+			patched++;
+		}
+		}
+		if (remaining <= 8) { /* NOTE(review): the word above is read before this check, so a call with remaining < 8 reads past the data */
+		break;
+		}
+		opcode++;
+		remaining -= 8; /* one 64-bit word consumed */
+	}
+	if (opt_debug) { /* statistics are printed only in debug mode */
+		printf("Potential OP3 instructions identified: "
+			"%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN\n",
+			count_bfe_int, count_bfe_uint, count_byte_align);
+		printf("Patched a total of %i BFI_INT instructions\n", patched);
+	}
 _clState *initCl(int gpu, char *name, size_t nameSize) {
 	cl_int status = 0;
@@ -165,7 +223,7 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
 		printf("List of devices:\n");
-		int i;
+		unsigned int i;
 		for(i=0; i<numDevices; i++) {
 			char pbuff[100];
 			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
@@ -236,6 +294,82 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
 		return NULL; 
+	size_t nDevices; /* NOTE(review): filled below via CL_PROGRAM_NUM_DEVICES, which OpenCL documents as returning cl_uint; an uninitialized, wider size_t may keep garbage in its upper bytes — confirm */
+	size_t * binary_sizes;
+	char ** binaries;
+	unsigned int i;
+	int err; /* NOTE(review): assigned by each clGetProgramInfo call below but never tested in the lines shown */
+	/* figure out number of devices and the sizes of the binary for each device. */
+	err = clGetProgramInfo( clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL );
+	binary_sizes = (size_t *)malloc( sizeof(size_t)*nDevices ); /* NOTE(review): this and the later malloc results are used unchecked */
+	err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL );
+	/* copy over all of the generated binaries. */
+	binaries = (char **)malloc( sizeof(char *)*nDevices );
+	for( i = 0; i < nDevices; i++ ) {
+		printf("binary size %d : %d\n", i, binary_sizes[i]); /* NOTE(review): %d matches neither unsigned int i nor size_t binary_sizes[i] (%u / %zu would); also printed unconditionally, unlike the opt_debug-gated output below */
+		if( binary_sizes[i] != 0 )
+			binaries[i] = (char *)malloc( sizeof(char)*binary_sizes[i] );
+		else
+			binaries[i] = NULL; /* devices with no binary are skipped by the patch loop below */
+	}
+	err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
+	// all the code should be within the first 83000 bytes or so, but scan
+	// a bit more for headroom
+	unsigned bytes_to_scan = 93000;
+	for (i = 0; i < nDevices; i++) {
+		if (!binaries[i])
+			continue;
+		unsigned remaining = bytes_to_scan; /* NOTE(review): fixed scan length is not clamped to binary_sizes[i], so a shorter binary would be searched past its allocation */
+		char *w = binaries[i];
+		int j; /* NOTE(review): unused in the lines shown */
+		if (opt_debug)
+			printf("At %p (%u rem. bytes), searching outer elf marker\n", w, remaining);
+		advance(&w, &remaining, "ELF"); /* advance() exits the process if the marker is missing */
+		if (opt_debug)
+			printf("At %p (%u rem. bytes), searching inner elf marker\n", w, remaining);
+		advance(&w, &remaining, "ELF"); /* NOTE(review): advance() stops AT the match and nothing visible here steps w past the first marker, so as shown this re-finds the same "ELF"; confirm against the full file */
+		if (opt_debug)
+			printf("At %p (%u rem. bytes), searching first .text marker\n", w, remaining);
+		advance(&w, &remaining, ".text");
+		if (opt_debug)
+			printf("At %p (%u rem. bytes), searching second .text marker\n", w, remaining);
+		advance(&w, &remaining, ".text"); /* NOTE(review): same concern as the repeated "ELF" search above */
+		// now we are pointing to the first opcode
+		patch_opcodes(w, remaining); /* rewrites matching opcodes inside binaries[i] in place */
+	}
+	status = clReleaseProgram(clState->program); /* release the existing program; it is recreated from the patched binaries below */
+	if(status != CL_SUCCESS)
+	{
+		printf("Error: Releasing program. (clReleaseProgram)\n");
+		return NULL; /* NOTE(review): binaries[] and binary_sizes are not freed on this or any later path shown */
+	}
+	clState->program = clCreateProgramWithBinary(clState->context, numDevices, &devices[gpu], binary_sizes, binaries, &status, NULL); /* NOTE(review): passes numDevices as the count but &devices[gpu] as the list alongside nDevices-sized arrays, and &status sits in the binary_status slot with errcode_ret NULL — check against the OpenCL signature (the build below uses a count of 1) */
+	if(status != CL_SUCCESS) 
+	{   
+		printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
+		return NULL;
+	}
+	/* create a cl program executable for all the devices specified */
+	status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
+	if(status != CL_SUCCESS) 
+	{   
+		printf("Error: Building Program (clBuildProgram)\n");
+		size_t logSize;
+		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); /* first call only asks for the log length */
+		char *log = malloc(logSize); /* NOTE(review): unchecked and never freed in the lines shown */
+		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); /* second call fetches the log text; status now holds this query's result, not the build error */
+		printf("%s\n", log);
+		return NULL; 
+	}
 	/* get a kernel object handle for a kernel with the given name */
 	clState->kernel = clCreateKernel(clState->program, "oclminer", &status);
 	if(status != CL_SUCCESS)
diff --git a/ b/
index b706c92..c6e3766 100644
--- a/
+++ b/
@@ -1,4 +1,16 @@
-#define rotr(x, n) rotate(x, (uint)(32 - n))
+typedef uint z; /* short alias for uint, used by the casts in the macros below */
+#define BITALIGN /* defined unconditionally here, so the #ifdef on the next line is always taken */
+#ifdef BITALIGN /* NOTE(review): no matching #else / #endif is visible in this hunk; as shown, rotr, Ch and Ma are each defined twice — confirm against the full file */
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#define rotr(a, b) amd_bitalign((z)a, (z)a, (z)b) /* bit-align of a with itself by b; presumably equivalent to the rotate() form of rotr below — confirm */
+#define Ch(a, b, c) amd_bytealign(a, b, c) /* presumably a placeholder: patch_opcodes in ocl.c rewrites BYTE_ALIGN_INT opcodes to BFI_INT after the build; the plain form of Ch below shows the intended result */
+#define Ma(a, b, c) amd_bytealign((b), (a | c), (c & a)) /* same placeholder pattern with rearranged operands; intended result is the plain form of Ma below */
+#define rotr(a, b) rotate((z)a, (z)(32 - b)) /* rotate right by b written as rotate left by 32 - b; NOTE(review): a and b are unparenthesized, so compound arguments may bind differently than intended */
+#define Ch(a, b, c) (c ^ (a & (b ^ c))) /* plain bitwise form: selects b where a has 1-bits and c elsewhere; arguments are unparenthesized */
+#define Ma(a, b, c) ((b & c) | (a & (b | c))) /* plain bitwise form: bitwise majority of a, b, c; arguments are unparenthesized */
 #define WGS __attribute__((reqd_work_group_size(128, 1, 1)))