Merge pull request #369 from kanoi/mmq MMQ overheat: remove clockdown (doesn't help) + ensure no lost shares + allow partial work replies and count them + count work check timeout failures
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
diff --git a/driver-modminer.c b/driver-modminer.c
index ae182c7..832b854 100644
--- a/driver-modminer.c
+++ b/driver-modminer.c
@@ -45,9 +45,11 @@
#define MODMINER_DEF_CLOCK 200
#define MODMINER_MIN_CLOCK 160
-#define MODMINER_CLOCK_DOWN -2
-#define MODMINER_CLOCK_SET 0
#define MODMINER_CLOCK_UP 2
+#define MODMINER_CLOCK_SET 0
+#define MODMINER_CLOCK_DOWN -2
+// = 0 means OVERHEAT doesn't affect the clock
+#define MODMINER_CLOCK_OVERHEAT 0
#define MODMINER_CLOCK_DEAD -6
#define MODMINER_CLOCK_CUTOFF -10
@@ -578,12 +580,14 @@ static bool modminer_fpga_prepare(struct thr_info *thr)
* If device exceeds cutoff or overheat temp - stop sending work until it cools
* decrease the clock by MODMINER_CLOCK_CUTOFF/MODMINER_CLOCK_OVERHEAT
* for when it restarts
+ * with MODMINER_CLOCK_OVERHEAT=0 basically says that temp shouldn't
+ * affect the clock unless we reach CUTOFF
*
- * When to clock down:
* If device overheats
- * also halve shares_to_good
- * (so multiple temp drops can recover faster)
- * or
+ * set shares_to_good back to MODMINER_MIN_BACK
+ * to speed up clock recovery if temp drop doesn't help
+ *
+ * When to clock down:
* If device gets MODMINER_HW_ERROR_PERCENT errors since last clock up or down
* if clock is <= default it requires 2 HW to do this test
* if clock is > default it only requires 1 HW to do this test
@@ -603,7 +607,6 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
int err, amount;
// Only do once if multiple shares per work or multiple reasons
- // Since the temperature down clock test is first in the code this is OK
if (!state->new_work)
return false;
@@ -613,18 +616,17 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
state->shares_last_hw = 0;
state->hw_errors = 0;
- // If drop requested due to temperature, clock drop is always allowed
- if (!temp && delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK)
+ // FYI clock drop has little effect on temp
+ if (delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK)
return false;
if (delta > 0 && modminer->clock >= MODMINER_MAX_CLOCK)
return false;
if (delta < 0) {
- if (temp) {
- if (state->shares_to_good > MODMINER_MIN_BACK)
- state->shares_to_good /= 2;
- } else {
+ if (temp)
+ state->shares_to_good = MODMINER_MIN_BACK;
+ else {
if ((state->shares_to_good * 2) < MODMINER_TRY_UP)
state->shares_to_good *= 2;
else
@@ -759,7 +761,8 @@ static bool modminer_start_work(struct thr_info *thr)
mutex_lock(modminer->modminer_mutex);
if ((err = usb_write(modminer, (char *)(state->next_work_cmd), 46, &amount, C_SENDWORK)) < 0 || amount != 46) {
-// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?)
+// TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
+// - need to delete it and rescan for it? (after a delay?)
// but check all (4) disappeared
mutex_unlock(modminer->modminer_mutex);
@@ -802,8 +805,7 @@ static void check_temperature(struct thr_info *thr)
mutex_lock(modminer->modminer_mutex);
if (usb_write(modminer, (char *)cmd, 2, &amount, C_REQUESTTEMPERATURE) == 0 && amount == 2 &&
- usb_read(modminer, (char *)(&temperature), tbytes, &tamount, C_GETTEMPERATURE) == 0 && tamount == tbytes)
- {
+ usb_read(modminer, (char *)(&temperature), tbytes, &tamount, C_GETTEMPERATURE) == 0 && tamount == tbytes) {
mutex_unlock(modminer->modminer_mutex);
if (state->one_byte_temp)
modminer->temp = temperature[0];
@@ -837,7 +839,9 @@ static void check_temperature(struct thr_info *thr)
modminer->api->name, modminer->device_id,
MODMINER_OVERHEAT_TEMP, modminer->temp);
- modminer_delta_clock(thr, MODMINER_CLOCK_DOWN, true);
+ // If it's defined to be 0 then don't call modminer_delta_clock()
+ if (MODMINER_CLOCK_OVERHEAT != 0)
+ modminer_delta_clock(thr, MODMINER_CLOCK_OVERHEAT, true);
state->overheated = true;
dev_error(modminer, REASON_DEV_OVER_HEAT);
}
@@ -854,6 +858,11 @@ static void check_temperature(struct thr_info *thr)
#define work_restart(thr) thr->work_restart
+// 250MHz is 17.17s - ensure we don't go idle
+static const double processtime = 17.0;
+// 160MHz is 26.84s - when overheated ensure we don't throw away shares
+static const double overheattime = 26.9;
+
static uint64_t modminer_process_results(struct thr_info *thr)
{
struct cgpu_info *modminer = thr->cgpu;
@@ -863,9 +872,9 @@ static uint64_t modminer_process_results(struct thr_info *thr)
char cmd[2];
uint32_t nonce;
uint32_t curr_hw_errors;
- int err, amount;
+ int err, amount, amount2;
int timeoutloop;
- double processtime;
+ double timeout;
int temploop;
// If we are overheated it will just keep checking for results
@@ -876,21 +885,22 @@ static uint64_t modminer_process_results(struct thr_info *thr)
cmd[0] = MODMINER_CHECK_WORK;
cmd[1] = modminer->fpgaid;
- // 250Mhz is 17.17s
- processtime = 17.0;
timeoutloop = 0;
temploop = 0;
while (1) {
mutex_lock(modminer->modminer_mutex);
if ((err = usb_write(modminer, cmd, 2, &amount, C_REQUESTWORKSTATUS)) < 0 || amount != 2) {
-// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?)
+// TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
+// - need to delete it and rescan for it? (after a delay?)
// but check all (4) disappeared
mutex_unlock(modminer->modminer_mutex);
// timeoutloop never resets so the timeouts can't
// accumulate much during a single item of work
- if (err == -7 && ++timeoutloop < 10)
+ if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10) {
+ state->timeout_fail++;
goto tryagain;
+ }
applog(LOG_ERR, "%s%u: Error sending (get nonce) (%d:%d)",
modminer->api->name, modminer->device_id, amount, err);
@@ -899,17 +909,28 @@ static uint64_t modminer_process_results(struct thr_info *thr)
}
err = usb_read(modminer, (char *)(&nonce), 4, &amount, C_GETWORKSTATUS);
- mutex_unlock(modminer->modminer_mutex);
+ while (err == LIBUSB_SUCCESS && amount < 4) {
+ size_t remain = 4 - amount;
+ char *pos = ((char *)(&nonce)) + amount;
+
+ state->success_more++;
- if (err < 0 || amount != 4) {
+ err = usb_read(modminer, pos, remain, &amount2, C_GETWORKSTATUS);
+ amount += amount2;
+ }
+ mutex_unlock(modminer->modminer_mutex);
+
+ if (err < 0 || amount < 4) {
// timeoutloop never resets so the timeouts can't
// accumulate much during a single item of work
- if (err == -7 && ++timeoutloop < 10)
+ if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10) {
+ state->timeout_fail++;
goto tryagain;
+ }
applog(LOG_ERR, "%s%u: Error reading (get nonce) (%d:%d)",
- modminer->api->name, modminer->device_id, amount, err);
+ modminer->api->name, modminer->device_id, amount+amount2, err);
}
if (memcmp(&nonce, "\xff\xff\xff\xff", 4)) {
@@ -970,16 +991,24 @@ tryagain:
if (work_restart(thr))
break;
- gettimeofday(&now, NULL);
- if (tdiff(&now, &state->tv_workstart) > processtime)
- break;
+ if (state->overheated == true) {
+ // don't check every time
+ if (++temploop > 30) {
+ check_temperature(thr);
+ temploop = 0;
+ }
- // don't check every time
- if (state->overheated == true && ++temploop > 30) {
- check_temperature(thr);
- temploop = 0;
}
+ if (state->overheated == true)
+ timeout = overheattime;
+ else
+ timeout = processtime;
+
+ gettimeofday(&now, NULL);
+ if (tdiff(&now, &state->tv_workstart) > timeout)
+ break;
+
nmsleep(10);
if (work_restart(thr))
break;
@@ -991,6 +1020,7 @@ tryagain:
// Not exact since the clock may have changed ... but close enough I guess
uint64_t hashes = (uint64_t)modminer->clock * (((uint64_t)elapsed.tv_sec * 1000000) + elapsed.tv_usec);
+ // Overheat will complete the nonce range
if (hashes > 0xffffffff)
hashes = 0xffffffff;
else
diff --git a/miner.h b/miner.h
index 7f5b6c8..b0659d9 100644
--- a/miner.h
+++ b/miner.h
@@ -1033,6 +1033,8 @@ struct modminer_fpga_state {
uint32_t shares_last_hw;
uint32_t hw_errors;
uint32_t shares_to_good;
+ uint32_t timeout_fail;
+ uint32_t success_more;
struct timeval last_changed;
struct timeval last_nonce;
struct timeval first_work;