First BFI_INT patch: after building the OpenCL program, fetch the compiled device binary, rewrite its BYTE_ALIGN_INT instructions to BFI_INT, and load the patched binary back.
diff --git a/ocl.c b/ocl.c
index 2a4a37e..bf2f659 100644
--- a/ocl.c
+++ b/ocl.c
@@ -1,3 +1,4 @@
+#define _GNU_SOURCE
#include <signal.h>
#include <stdlib.h>
#include <string.h>
@@ -93,6 +94,63 @@ int clDevicesNum() {
return numDevices;
}
+/*
+ * Move a scan cursor forward to the next occurrence of `marker`.
+ *
+ * area      - in/out: current position in the buffer. On return it points at
+ *             the first byte of the match; the marker itself is NOT skipped.
+ * remaining - in/out: number of bytes available from *area. Reduced by the
+ *             distance the cursor travelled.
+ * marker    - NUL-terminated byte sequence to search for.
+ *
+ * If the marker does not occur within the remaining bytes, a message is
+ * written to stderr and the process terminates with exit(1).
+ *
+ * NOTE: because the cursor is left on the match, calling this again with the
+ * same marker and no intervening adjustment finds the same occurrence.
+ */
+void advance(char **area, unsigned *remaining, const char *marker)
+{
+    char *start = *area;
+    char *hit = memmem(start, *remaining, marker, strlen(marker));
+
+    if (hit == NULL) {
+        fprintf(stderr, "Marker \"%s\" not found\n", marker);
+        exit(1);
+    }
+
+    *area = hit;
+    *remaining -= hit - start;
+}
+
+/*
+ * 5-bit ALU opcode values for OP3 instructions, as found in bits 45..49 of
+ * the 64-bit instruction word examined by patch_opcodes().
+ *
+ * Typed unsigned long long (at least 64 bits) so that shifting a value into
+ * the upper half of the word is well defined even where long is 32 bits.
+ */
+#define OP3_INST_BFE_UINT 4ULL
+#define OP3_INST_BFE_INT 5ULL
+#define OP3_INST_BFI_INT 6ULL
+#define OP3_INST_BIT_ALIGN_INT 12ULL
+#define OP3_INST_BYTE_ALIGN_INT 13ULL
+
+/*
+ * Rewrite BYTE_ALIGN_INT instructions in a compiled kernel image to BFI_INT.
+ *
+ * The buffer is walked as consecutive 64-bit words (host byte order) starting
+ * at `w`. A word is treated as a candidate OP3 instruction only when its
+ * clamp, dest_rel, s2_neg, s2_rel and pred_sel fields are all zero; the 5-bit
+ * ALU opcode in bits 45..49 then decides what happens:
+ *   - BFE_INT / BFE_UINT: counted only (debug statistics);
+ *   - BYTE_ALIGN_INT:     counted and patched in place to BFI_INT.
+ *
+ * w         - first byte to scan; the buffer is modified in place.
+ * remaining - number of bytes available at `w`. Only whole 8-byte words are
+ *             examined; a trailing partial word is left untouched.
+ *
+ * When opt_debug is set, a summary of the counters is printed to stdout.
+ */
+void patch_opcodes(char *w, unsigned remaining)
+{
+    /* Bit positions of the inspected fields within the 64-bit word. */
+    enum {
+        CLAMP_SHIFT    = 32 + 31,
+        DEST_REL_SHIFT = 32 + 28,
+        ALU_INST_SHIFT = 32 + 13,
+        S2_NEG_SHIFT   = 32 + 12,
+        S2_REL_SHIFT   = 32 + 9,
+        PRED_SEL_SHIFT = 29
+    };
+    const uint64_t alu_inst_mask = (uint64_t)0x1f << ALU_INST_SHIFT;
+
+    int patched = 0;
+    int count_bfe_int = 0;
+    int count_bfe_uint = 0;
+    int count_byte_align = 0;
+
+    /*
+     * Check the length before touching a word so that a short, empty, or
+     * non-multiple-of-8 buffer is never read or written past its end.
+     */
+    while (remaining >= sizeof(uint64_t)) {
+        uint64_t opcode;
+
+        /*
+         * `w` carries no alignment guarantee, so copy the word out instead
+         * of dereferencing a casted pointer.
+         */
+        memcpy(&opcode, w, sizeof(opcode));
+
+        int clamp    = (int)((opcode >> CLAMP_SHIFT) & 0x1);
+        int dest_rel = (int)((opcode >> DEST_REL_SHIFT) & 0x1);
+        int alu_inst = (int)((opcode >> ALU_INST_SHIFT) & 0x1f);
+        int s2_neg   = (int)((opcode >> S2_NEG_SHIFT) & 0x1);
+        int s2_rel   = (int)((opcode >> S2_REL_SHIFT) & 0x1);
+        int pred_sel = (int)((opcode >> PRED_SEL_SHIFT) & 0x3);
+
+        if (!clamp && !dest_rel && !s2_neg && !s2_rel && !pred_sel) {
+            if (alu_inst == OP3_INST_BFE_INT) {
+                count_bfe_int++;
+            } else if (alu_inst == OP3_INST_BFE_UINT) {
+                count_bfe_uint++;
+            } else if (alu_inst == OP3_INST_BYTE_ALIGN_INT) {
+                count_byte_align++;
+                /*
+                 * Patch this instruction to BFI_INT: clear the opcode field
+                 * and insert the new value. The shift is done in 64 bits so
+                 * it stays defined where long is only 32 bits wide.
+                 */
+                opcode &= ~alu_inst_mask;
+                opcode |= (uint64_t)OP3_INST_BFI_INT << ALU_INST_SHIFT;
+                memcpy(w, &opcode, sizeof(opcode));
+                patched++;
+            }
+        }
+
+        w += sizeof(uint64_t);
+        remaining -= (unsigned)sizeof(uint64_t);
+    }
+
+    if (opt_debug) {
+        printf("Potential OP3 instructions identified: "
+            "%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN\n",
+            count_bfe_int, count_bfe_uint, count_byte_align);
+        printf("Patched a total of %i BFI_INT instructions\n", patched);
+    }
+}
+
_clState *initCl(int gpu, char *name, size_t nameSize) {
cl_int status = 0;
@@ -165,7 +223,7 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
printf("List of devices:\n");
- int i;
+ unsigned int i;
for(i=0; i<numDevices; i++) {
char pbuff[100];
status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
@@ -236,6 +294,82 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
return NULL;
}
+ size_t nDevices;
+ size_t * binary_sizes;
+ char ** binaries;
+ unsigned int i;
+ int err;
+
+ /* figure out number of devices and the sizes of the binary for each device. */
+ err = clGetProgramInfo( clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL );
+ binary_sizes = (size_t *)malloc( sizeof(size_t)*nDevices );
+ err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL );
+
+ /* copy over all of the generated binaries. */
+ binaries = (char **)malloc( sizeof(char *)*nDevices );
+ for( i = 0; i < nDevices; i++ ) {
+ printf("binary size %d : %d\n", i, binary_sizes[i]);
+ if( binary_sizes[i] != 0 )
+ binaries[i] = (char *)malloc( sizeof(char)*binary_sizes[i] );
+ else
+ binaries[i] = NULL;
+ }
+ err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
+ // all the code should be within the first 83000 bytes or so, but scan
+ // a bit more for headroom
+ unsigned bytes_to_scan = 93000;
+ for (i = 0; i < nDevices; i++) {
+ if (!binaries[i])
+ continue;
+
+ unsigned remaining = bytes_to_scan;
+ char *w = binaries[i];
+ int j;
+
+ if (opt_debug)
+ printf("At %p (%u rem. bytes), searching outer elf marker\n", w, remaining);
+ advance(&w, &remaining, "ELF");
+ if (opt_debug)
+ printf("At %p (%u rem. bytes), searching inner elf marker\n", w, remaining);
+ advance(&w, &remaining, "ELF");
+ if (opt_debug)
+ printf("At %p (%u rem. bytes), searching first .text marker\n", w, remaining);
+ advance(&w, &remaining, ".text");
+ if (opt_debug)
+ printf("At %p (%u rem. bytes), searching second .text marker\n", w, remaining);
+ advance(&w, &remaining, ".text");
+ // now we are pointing to the first opcode
+ patch_opcodes(w, remaining);
+ }
+
+ status = clReleaseProgram(clState->program);
+ if(status != CL_SUCCESS)
+ {
+ printf("Error: Releasing program. (clReleaseProgram)\n");
+ return NULL;
+ }
+
+ clState->program = clCreateProgramWithBinary(clState->context, numDevices, &devices[gpu], binary_sizes, binaries, &status, NULL);
+ if(status != CL_SUCCESS)
+ {
+ printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
+ return NULL;
+ }
+
+ /* create a cl program executable for all the devices specified */
+ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
+ if(status != CL_SUCCESS)
+ {
+ printf("Error: Building Program (clBuildProgram)\n");
+ size_t logSize;
+ status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+
+ char *log = malloc(logSize);
+ status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
+ printf("%s\n", log);
+ return NULL;
+ }
+
/* get a kernel object handle for a kernel with the given name */
clState->kernel = clCreateKernel(clState->program, "oclminer", &status);
if(status != CL_SUCCESS)
diff --git a/oclminer.cl b/oclminer.cl
index b706c92..c6e3766 100644
--- a/oclminer.cl
+++ b/oclminer.cl
@@ -1,4 +1,16 @@
-#define rotr(x, n) rotate(x, (uint)(32 - n))
+/* Scalar type every rotr() operand is cast to. */
+typedef uint z;
+/* Selects the cl_amd_media_ops implementations below; remove to fall back to
+ * the portable OpenCL expressions. */
+#define BITALIGN
+
+/*
+ * rotr(a, b)  - rotate `a` right by `b` bits.
+ * Ch(a, b, c) - (c ^ (a & (b ^ c))) in the portable form.
+ * Ma(a, b, c) - ((b & c) | (a & (b | c))) in the portable form.
+ *
+ * Every macro parameter is parenthesized so that expression arguments
+ * (e.g. rotr(x + y, n + 1)) are cast and combined as a whole.
+ *
+ * NOTE(review): the host side (patch_opcodes() in ocl.c) rewrites
+ * BYTE_ALIGN_INT opcodes to BFI_INT in the built binary; presumably the
+ * amd_bytealign() calls in Ch/Ma are the placeholders it targets - confirm.
+ */
+#ifdef BITALIGN
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#define rotr(a, b) amd_bitalign((z)(a), (z)(a), (z)(b))
+#define Ch(a, b, c) amd_bytealign((a), (b), (c))
+#define Ma(a, b, c) amd_bytealign((b), ((a) | (c)), ((c) & (a)))
+#else
+#define rotr(a, b) rotate((z)(a), (z)(32 - (b)))
+#define Ch(a, b, c) ((c) ^ ((a) & ((b) ^ (c))))
+#define Ma(a, b, c) (((b) & (c)) | ((a) & ((b) | (c))))
+#endif
#define WGS __attribute__((reqd_work_group_size(128, 1, 1)))