Re-enable cores after 15 minutes in the disabled state
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
diff --git a/driver-knc-spi-fpga.c b/driver-knc-spi-fpga.c
index 2a291bc..332c723 100644
--- a/driver-knc-spi-fpga.c
+++ b/driver-knc-spi-fpga.c
@@ -37,6 +37,9 @@
#define WORK_STALE_US 60000000
+/* Keep core disabled for no longer than 15 minutes */
+#define CORE_DISA_PERIOD_US (15 * 60 * 1000000)
+
struct spidev_context {
int fd;
uint32_t speed;
@@ -124,6 +127,12 @@ struct active_work {
struct timeval begin;
};
+struct core_disa_data {
+ struct timeval disa_begin;
+ uint8_t asic;
+ uint8_t core;
+};
+
struct knc_state {
struct spidev_context *ctx;
int devices;
@@ -152,6 +161,9 @@ struct knc_state {
struct timeval jupiter_work_start[KNC_ACTIVE_BUFFER_SIZE];
#endif
uint8_t hwerrs[MAX_ASICS * 256];
+ int read_d, write_d;
+#define KNC_DISA_CORES_SIZE (MAX_ASICS * 256)
+ struct core_disa_data disa_cores_fifo[KNC_DISA_CORES_SIZE];
};
static inline bool knc_queued_fifo_full(struct knc_state *knc)
@@ -180,6 +192,14 @@ static inline void knc_active_fifo_inc_idx(int *idx)
++(*idx);
}
+static inline void knc_disa_cores_fifo_inc_idx(int *idx)
+{
+ if (unlikely(*idx >= (KNC_DISA_CORES_SIZE - 1)))
+ *idx = 0;
+ else
+ ++(*idx);
+}
+
/* Find SPI device with index idx, init it */
static struct spidev_context * spi_new(int idx)
{
@@ -288,6 +308,14 @@ static void disable_core(uint8_t asic, uint8_t core)
applog(LOG_ERR, "KnC: system call failed");
}
+static void enable_core(uint8_t asic, uint8_t core)
+{
+ char str[256];
+ snprintf(str, sizeof(str), "i2cset -y 2 0x2%hhu %hhu 1", asic, core);
+ if (0 != WEXITSTATUS(system(str)))
+ applog(LOG_ERR, "KnC: system call failed");
+}
+
static int64_t timediff(const struct timeval *a, const struct timeval *b)
{
struct timeval diff;
@@ -295,6 +323,32 @@ static int64_t timediff(const struct timeval *a, const struct timeval *b)
return diff.tv_sec * 1000000 + diff.tv_usec;
}
+static void knc_check_disabled_cores(struct knc_state *knc)
+{
+ int next_read_d;
+ struct timeval now;
+ int64_t us;
+ struct core_disa_data *core;
+ int cidx;
+
+ next_read_d = knc->read_d;
+ knc_disa_cores_fifo_inc_idx(&next_read_d);
+ if (next_read_d == knc->write_d)
+ return; /* queue empty */
+ core = &knc->disa_cores_fifo[next_read_d];
+ gettimeofday(&now, NULL);
+ us = timediff(&now, &core->disa_begin);
+ if ((us >= 0) && (us < CORE_DISA_PERIOD_US))
+ return; /* oldest disabled core has not expired yet */
+ cidx = core->asic * 256 + core->core;
+ enable_core(core->asic, core->core);
+ knc->hwerrs[cidx] = 0;
+ applog(LOG_NOTICE,
+ "KnC: core %u-%u was enabled back from disabled state",
+ core->asic, core->core);
+ knc->read_d = next_read_d;
+}
+
static void knc_work_from_queue_to_spi(struct knc_state *knc,
struct active_work *q_work,
struct spi_request *spi_req)
@@ -482,10 +536,16 @@ static int64_t knc_process_response(struct thr_info *thr, struct cgpu_info *cgpu
} else {
if (cidx < sizeof(knc->hwerrs)) {
if (++(knc->hwerrs[cidx]) >= HW_ERR_LIMIT) {
- disable_core(rxbuf->responses[i].asic, rxbuf->responses[i].core);
- applog(LOG_WARNING,
- "KnC: core %u-%u was disabled due to %u HW errors in a row",
- rxbuf->responses[i].asic,rxbuf->responses[i].core,HW_ERR_LIMIT);
+ struct core_disa_data *core;
+ core = &knc->disa_cores_fifo[knc->write_d];
+ core->disa_begin = now;
+ core->asic = rxbuf->responses[i].asic;
+ core->core = rxbuf->responses[i].core;
+ disable_core(core->asic, core->core);
+ applog(LOG_WARNING,
+ "KnC: core %u-%u was disabled due to %u HW errors in a row",
+ core->asic, core->core, HW_ERR_LIMIT);
+ knc_disa_cores_fifo_inc_idx(&knc->write_d);
}
}
};
@@ -585,6 +645,8 @@ static bool knc_detect_one(struct spidev_context *ctx)
knc->write_q = 1;
knc->read_a = 0;
knc->write_a = 1;
+ knc->read_d = 0;
+ knc->write_d = 1;
knc->salt = rand();
#ifdef ENABLE_BENCHMARK
gettimeofday(&knc->lastscan, NULL);
@@ -648,6 +710,8 @@ static int64_t knc_scanwork(struct thr_info *thr)
applog(LOG_DEBUG, "KnC running scanwork");
+ knc_check_disabled_cores(knc);
+
/* Prepare tx buffer */
memset(spi_txbuf, 0, sizeof(spi_txbuf));
num = 0;