KnC: Recover automatically if core state and cgminer state get out of sync. Handles communication errors, reset cores, and other situations where the cgminer core state and the actual core state get out of sync.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
diff --git a/driver-knc.c b/driver-knc.c
index bab800d..22c9efd 100644
--- a/driver-knc.c
+++ b/driver-knc.c
@@ -62,6 +62,7 @@ struct knc_core_state {
int slot;
struct work *work;
} workslot[WORKS_PER_CORE]; /* active, next */
+ int transfer_stamp;
struct knc_report report;
struct {
int slot;
@@ -140,6 +141,8 @@ struct knc_state {
} spi_buffer[KNC_SPI_BUFFERS];
int send_buffer;
int read_buffer;
+ int send_buffer_count;
+ int read_buffer_count;
/* end SPI thread */
/* Do not add anything below here!! core[] must be last */
@@ -191,6 +194,7 @@ static void knc_flush(struct thr_info *thr)
buffer->state = KNC_SPI_PENDING;
pthread_cond_signal(&knc->spi_qcond);
knc->send_buffer += 1;
+ knc->send_buffer_count += 1;
if (knc->send_buffer >= KNC_SPI_BUFFERS)
knc->send_buffer = 0;
buffer = &knc->spi_buffer[knc->send_buffer];
@@ -224,6 +228,17 @@ static void knc_transfer(struct thr_info *thr, struct knc_core_state *core, int
buffer->size = knc_prepare_transfer(buffer->txbuf, buffer->size, MAX_SPI_SIZE, core->die->channel, request_length, request, response_length);
}
+/*
+ * Return the current value of knc->send_buffer_count for use as a
+ * transfer stamp. The counter is bumped when a send buffer is marked
+ * KNC_SPI_PENDING; the stamp can later be handed to
+ * knc_transfer_completed() to compare against the read-side counter.
+ */
+static int knc_transfer_stamp(struct knc_state *knc)
+{
+ return knc->send_buffer_count;
+}
+
+/*
+ * Report whether the read side has caught up with a transfer stamp.
+ *
+ * knc:   driver state holding read_buffer_count.
+ * stamp: value previously obtained from knc_transfer_stamp().
+ *
+ * Returns non-zero once read_buffer_count has reached or passed stamp,
+ * zero otherwise. The comparison is wrap-safe as long as the two
+ * counters are less than half the unsigned int range apart.
+ */
+static int knc_transfer_completed(struct knc_state *knc, int stamp)
+{
+ /*
+  * Subtract as unsigned: wrap-around is well defined there, whereas
+  * overflowing a signed subtraction is undefined behaviour and may be
+  * optimised into a plain (non-wrapping) comparison.
+  */
+ unsigned int delta = (unsigned int)knc->read_buffer_count - (unsigned int)stamp;
+
+ /* Lower half of the range == non-negative signed delta */
+ return delta <= (~0u >> 1);
+}
+
static bool knc_detect_one(void *ctx)
{
/* Scan device for ASICs */
@@ -404,7 +419,7 @@ static int knc_core_process_report(struct thr_info *thr, struct knc_core_state *
knc_core_handle_nonce(thr, core, report->nonce[n].slot, report->nonce[n].nonce);
}
- if (report->active_slot && core->workslot[1].slot == report->active_slot) {
+ if (report->active_slot && core->workslot[0].slot != report->active_slot) {
/* Core switched to next work */
if (core->workslot[0].work) {
core->die->knc->completed++;
@@ -414,7 +429,30 @@ static int knc_core_process_report(struct thr_info *thr, struct knc_core_state *
}
core->workslot[0] = core->workslot[1];
core->workslot[1].work = NULL;
- core->workslot[1].slot = 0;
+ core->workslot[1].slot = -1;
+
+ /* or did it switch directly to pending work? */
+ if (report->active_slot == core->workslot[2].slot) {
+ if (core->workslot[0].work)
+ free_work(core->workslot[0].work);
+ core->workslot[0] = core->workslot[2];
+ core->workslot[2].work = NULL;
+ core->workslot[2].slot = -1;
+ }
+ }
+
+ if (report->next_state && core->workslot[2].slot > 0 && (core->workslot[2].slot == report->next_slot || report->next_slot == -1)) {
+ /* core accepted next work */
+ if (core->workslot[1].work)
+ free_work(core->workslot[1].work);
+ core->workslot[1] = core->workslot[2];
+ core->workslot[2].work = NULL;
+ }
+
+ if (core->workslot[2].work && knc_transfer_completed(core->die->knc, core->transfer_stamp)) {
+ applog(LOG_INFO, "KnC: Setwork failed on core %d.%d.%d?", core->die->channel, core->die->die, core->core);
+ free_work(core->workslot[2].work);
+ core->workslot[2].slot = -1;
}
return 0;
@@ -430,18 +468,19 @@ static void knc_process_responses(struct thr_info *thr)
for (i = 0; i < buffer->responses; i++) {
struct knc_spi_response *response_info = &buffer->response_info[i];
uint8_t *rxbuf = &buffer->rxbuf[response_info->offset];
+ struct knc_core_state *core = response_info->core;
int status = knc_decode_response(rxbuf, response_info->request_length, &rxbuf, response_info->response_length);
if (response_info->type == KNC_SETWORK)
status ^= KNC_ACCEPTED;
- if (response_info->core->die->version != KNC_VERSION_JUPITER && status != 0) {
- applog(LOG_ERR, "KnC: Communication error (%x)", status);
- knc_core_failure(response_info->core);
+ if (core->die->version != KNC_VERSION_JUPITER && status != 0) {
+ applog(LOG_ERR, "KnC %d.%d.%d: Communication error (%x / %d)", core->die->channel, core->die->die, core->core, status, i);
+ knc_core_failure(core);
}
switch(response_info->type) {
case KNC_REPORT:
case KNC_SETWORK:
/* Should we care about failed SETWORK explicit? Or simply handle it by next state not loaded indication in reports? */
- knc_core_process_report(thr, response_info->core, rxbuf);
+ knc_core_process_report(thr, core, rxbuf);
break;
}
}
@@ -450,6 +489,7 @@ static void knc_process_responses(struct thr_info *thr)
buffer->responses = 0;
buffer->size = 0;
knc->read_buffer += 1;
+ knc->read_buffer_count += 1;
if (knc->read_buffer >= KNC_SPI_BUFFERS)
knc->read_buffer = 0;
buffer = &knc->spi_buffer[knc->read_buffer];
@@ -498,6 +538,7 @@ static int knc_core_send_work(struct thr_info *thr, struct knc_core_state *core,
core->generation = knc->generation;
core->works++;
core->die->knc->works++;
+ core->transfer_stamp = knc_transfer_stamp(knc);
timeradd(&now, &core_submit_interval, &core->hold_work_until);
timeradd(&now, &core_timeout_interval, &core->timeout);
@@ -583,16 +624,14 @@ static int64_t knc_scanwork(struct thr_info *thr)
for (slot = 0; slot < WORKS_PER_CORE; slot ++) {
if (core->workslot[slot].work)
free_work(core->workslot[slot].work);
+ core->workslot[slot].slot = -1;
}
core->hold_work_until = now;
}
if (knc_core_disabled(core))
continue;
- if (i == knc->scan_adjust_core) {
- /* TODO: Do a forced submit to even out work generation over time.
- * but don't forget scheduled works until the new one gets active
- */
- }
+ if (i == knc->scan_adjust_core)
+ clean = true;
if (knc_core_need_work(core)) {
struct work *work = get_work(thr, thr->id);
knc_core_send_work(thr, core, work, clean);