klondike - downgrade 'late update' but add an idle detect - and correct error levels
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
diff --git a/driver-klondike.c b/driver-klondike.c
index deb87f1..bcd3899 100644
--- a/driver-klondike.c
+++ b/driver-klondike.c
@@ -208,6 +208,7 @@ typedef struct jobque {
int workqc;
struct timeval last_update;
bool overheat;
+ bool flushed;
int late_update_count;
int late_update_sequential;
} JOBQUE;
@@ -315,7 +316,7 @@ static KLIST *allocate_kitem(struct cgpu_info *klncgpu)
cg_wunlock(&klninfo->klist_lock);
if (ran_out > 0)
- applog(LOG_ERR, "%s", errbuf);
+ applog(LOG_WARNING, "%s", errbuf);
return kitem;
}
@@ -985,7 +986,9 @@ static void *klondike_get_replies(void *userdata)
}
if (!err && recd == REPLY_SIZE) {
cgtime(&(kitem->tv_when));
+ rd_lock(&(klninfo->stat_lock));
kitem->block_seq = klninfo->block_seq;
+ rd_unlock(&(klninfo->stat_lock));
if (opt_log_level <= READ_DEBUG) {
hexdata = bin2hex((unsigned char *)&(kitem->kline.hd.dev), recd-1);
applog(READ_DEBUG, "%s%i:%d reply [%c:%s]",
@@ -1022,8 +1025,27 @@ static void *klondike_get_replies(void *userdata)
klondike_check_nonce(klncgpu, kitem);
display_kline(klncgpu, &kitem->kline, msg_reply);
break;
- case KLN_CMD_STATUS:
case KLN_CMD_WORK:
+ // We can't do/check this until it's initialised
+ if (klninfo->initialised) {
+ if (kitem->kline.ws.workqc == 0) { // reply reports workqc of 0
+ bool idle = false;
+ rd_lock(&(klninfo->stat_lock));
+ if (klninfo->jobque[dev].flushed == false) // a workqc of 0 right after a flush is not reported
+ idle = true;
+ slaves = klninfo->status[0].kline.ws.slavecount;
+ rd_unlock(&(klninfo->stat_lock)); // release the read lock taken above before logging
+ if (idle)
+ applog(LOG_WARNING, "%s%i:%d went idle before work was sent",
+ klncgpu->drv->name,
+ klncgpu->device_id,
+ dev);
+ }
+ wr_lock(&(klninfo->stat_lock));
+ klninfo->jobque[dev].flushed = false; // clear the flag that klondike_flush_work() sets
+ wr_unlock(&(klninfo->stat_lock)); // must pair with the wr_lock above or the lock is never released
+ } // falls through to the shared KLN_CMD_STATUS / KLN_CMD_ABORT handling
+ case KLN_CMD_STATUS:
case KLN_CMD_ABORT:
// We can't do/check this until it's initialised
if (klninfo->initialised) {
@@ -1067,9 +1089,10 @@ static void *klondike_get_replies(void *userdata)
klninfo->jobque[dev].overheat = true;
wr_unlock(&(klninfo->stat_lock));
- applog(LOG_ERR, "%s%i:%d Critical overheat (%.0fC)",
- klncgpu->drv->name, klncgpu->device_id,
- dev, temp);
+ applog(LOG_WARNING, "%s%i:%d Critical overheat (%.0fC)",
+ klncgpu->drv->name,
+ klncgpu->device_id,
+ dev, temp);
zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ABORT;
@@ -1122,13 +1145,13 @@ static void klondike_flush_work(struct cgpu_info *klncgpu)
KLINE kline;
int slaves, dev;
+ wr_lock(&(klninfo->stat_lock));
klninfo->block_seq++;
+ slaves = klninfo->status[0].kline.ws.slavecount;
+ wr_unlock(&(klninfo->stat_lock));
applog(LOG_DEBUG, "%s%i: flushing work",
klncgpu->drv->name, klncgpu->device_id);
- rd_lock(&(klninfo->stat_lock));
- slaves = klninfo->status[0].kline.ws.slavecount;
- rd_unlock(&(klninfo->stat_lock));
zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ABORT;
for (dev = 0; dev <= slaves; dev++) {
@@ -1139,6 +1162,7 @@ static void klondike_flush_work(struct cgpu_info *klncgpu)
memcpy((void *)&(klninfo->status[dev]),
kitem,
sizeof(klninfo->status[dev]));
+ klninfo->jobque[dev].flushed = true;
wr_unlock(&(klninfo->stat_lock));
kitem = release_kitem(klncgpu, kitem);
}
@@ -1286,14 +1310,14 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
seq = ++klninfo->jobque[dev].late_update_sequential;
rd_unlock(&(klninfo->stat_lock));
if (seq < LATE_UPDATE_LIMIT) {
- applog(LOG_ERR, "%s%i:%d late update",
+ applog(LOG_DEBUG, "%s%i:%d late update",
klncgpu->drv->name, klncgpu->device_id, dev);
klondike_get_stats(klncgpu);
goto que;
} else {
- applog(LOG_ERR, "%s%i:%d late update (%d) reached - attempting reset",
- klncgpu->drv->name, klncgpu->device_id,
- dev, LATE_UPDATE_LIMIT);
+ applog(LOG_WARNING, "%s%i:%d late update (%d) reached - attempting reset",
+ klncgpu->drv->name, klncgpu->device_id,
+ dev, LATE_UPDATE_LIMIT);
control_init(klncgpu);
kln_enable(klncgpu);
klondike_get_stats(klncgpu);
@@ -1333,9 +1357,9 @@ tryagain:
if (temp <= KLN_COOLED_DOWN) {
klninfo->jobque[dev].overheat = false;
rd_unlock(&(klninfo->stat_lock));
- applog(LOG_ERR, "%s%i:%d Overheat recovered (%.0fC)",
- klncgpu->drv->name, klncgpu->device_id,
- dev, temp);
+ applog(LOG_WARNING, "%s%i:%d Overheat recovered (%.0fC)",
+ klncgpu->drv->name, klncgpu->device_id,
+ dev, temp);
kln_enable(klncgpu);
goto tryagain;
} else {