Actually get first BFI_INT patch working.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
diff --git a/cpu-miner.c b/cpu-miner.c
index 4e8c8cd..16c6a9e 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -814,7 +814,6 @@ static void *gpuminer_thread(void *userdata)
BUFFERSIZE, res, 0, NULL, NULL);
if (unlikely(status != CL_SUCCESS))
{ applog(LOG_ERR, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)"); goto out;}
-
for (i = 0; i < 128; i++) {
int found = false;
diff --git a/ocl.c b/ocl.c
index 264adc4..02e0d60 100644
--- a/ocl.c
+++ b/ocl.c
@@ -125,29 +125,29 @@ void patch_opcodes(char *w, unsigned remaining)
int s2_rel = (*opcode >> (32 + 9)) & 0x1;
int pred_sel = (*opcode >> 29) & 0x3;
if (!clamp && !dest_rel && !s2_neg && !s2_rel && !pred_sel) {
- if (alu_inst == OP3_INST_BFE_INT) {
- count_bfe_int++;
- } else if (alu_inst == OP3_INST_BFE_UINT) {
- count_bfe_uint++;
- } else if (alu_inst == OP3_INST_BYTE_ALIGN_INT) {
- count_byte_align++;
- // patch this instruction to BFI_INT
- *opcode &= 0xfffc1fffffffffffUL;
- *opcode |= OP3_INST_BFI_INT << (32 + 13);
- patched++;
- }
+ if (alu_inst == OP3_INST_BFE_INT) {
+ count_bfe_int++;
+ } else if (alu_inst == OP3_INST_BFE_UINT) {
+ count_bfe_uint++;
+ } else if (alu_inst == OP3_INST_BYTE_ALIGN_INT) {
+ count_byte_align++;
+ // patch this instruction to BFI_INT
+ *opcode &= 0xfffc1fffffffffffUL;
+ *opcode |= OP3_INST_BFI_INT << (32 + 13);
+ patched++;
+ }
}
if (remaining <= 8) {
- break;
+ break;
}
opcode++;
remaining -= 8;
}
if (opt_debug) {
- printf("Potential OP3 instructions identified: "
- "%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN\n",
+ applog(LOG_DEBUG, "Potential OP3 instructions identified: "
+ "%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN",
count_bfe_int, count_bfe_uint, count_byte_align);
- printf("Patched a total of %i BFI_INT instructions\n", patched);
+ applog(LOG_DEBUG, "Patched a total of %i BFI_INT instructions", patched);
}
}
@@ -316,31 +316,34 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
}
err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
-#if 0
for (i = 0; i < nDevices; i++) {
if (!binaries[i])
continue;
unsigned remaining = binary_sizes[i];
char *w = binaries[i];
- const int ati_cal_markers = 17;
- int j;
- for (j = 0; j < ati_cal_markers; j++) {
- if (opt_debug)
- printf("At %p (%u rem. bytes), searching ATI CAL marker %i\n",
- w, remaining, j);
- advance(&w, &remaining, "ATI CAL");
- if (remaining < 1)
- fprintf(stderr, "Only %u rem. bytes\n", remaining), exit(1);
- w++; remaining--;
- }
- if (remaining < 11)
- fprintf(stderr, "Only %u rem. bytes\n", remaining), exit(1);
- w += 11; remaining -= 11;
- patch_opcodes(w, remaining);
- exit (0);
+ unsigned int start, length;
+
+ /* Find 2nd incidence of .text, and copy the program's
+ * position and length at a fixed offset from that. Then go
+ * back and find the 2nd incidence of \x7fELF (rewind by one
+ * from ELF) and then patch the opcodes */
+ advance(&w, &remaining, ".text");
+ w++; remaining--;
+ advance(&w, &remaining, ".text");
+ memcpy(&start, w + 285, 4);
+ memcpy(&length, w + 289, 4);
+ w = binaries[i]; remaining = binary_sizes[i];
+ advance(&w, &remaining, "ELF");
+ w++; remaining--;
+ advance(&w, &remaining, "ELF");
+ w--; remaining++;
+ w += start; remaining -= start;
+ if (opt_debug)
+ printf("At %p (%u rem. bytes), to begin patching\n",
+ w, remaining);
+ patch_opcodes(w, length);
}
-#endif
status = clReleaseProgram(clState->program);
if(status != CL_SUCCESS)
{
diff --git a/oclminer.cl b/oclminer.cl
index d0650ac..40550ca 100644
--- a/oclminer.cl
+++ b/oclminer.cl
@@ -1,5 +1,7 @@
typedef uint z;
+#define BITALIGN
+
#ifdef BITALIGN
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define rotr(a, b) amd_bitalign((z)a, (z)a, (z)b)