Commit d3f33888fd0e61956364092c044458e61a6b954a

Con Kolivas 2013-10-28T15:26:58

Merge branch 'master' into kncminer

diff --git a/driver-klondike.c b/driver-klondike.c
index 0d6d157..8648dde 100644
--- a/driver-klondike.c
+++ b/driver-klondike.c
@@ -63,9 +63,6 @@ static const char *msg_reply = "Reply";
 #define KLN_KILLWORK_TEMP	53.5
 #define KLN_COOLED_DOWN		45.5
 
-// If 5 late updates in a row, try to reset the device
-#define KLN_LATE_UPDATE_LIMIT	5
-
 /*
  *  Work older than 5s will already be completed
  *  FYI it must not be possible to complete 256 work
@@ -75,11 +72,28 @@ static const char *msg_reply = "Reply";
 #define OLD_WORK_MS ((int)(5 * 1000))
 
 /*
+ * How many incorrect slave counts to ignore in a row
+ * 2 means it allows random garbage returned twice
+ * Until slaves are implemented, this should never occur
+ * so allowing 2 in a row should ignore random errors
+ */
+#define KLN_ISS_IGNORE 2
+
+/*
  * If the queue status hasn't been updated for this long then do it now
  * 5GH/s = 859ms per full nonce range
  */
 #define LATE_UPDATE_MS ((int)(2.5 * 1000))
 
+// If 5 late updates in a row, try to reset the device
+#define LATE_UPDATE_LIMIT	5
+
+// If the reset fails sleep for 1s
+#define LATE_UPDATE_SLEEP_MS 1000
+
+// However give up after 8s
+#define LATE_UPDATE_NODEV_MS ((int)(8.0 * 1000))
+
 struct device_drv klondike_drv;
 
 typedef struct klondike_header {
@@ -199,7 +213,6 @@ typedef struct jobque {
 } JOBQUE;
 
 struct klondike_info {
-	bool shutdown;
 	pthread_rwlock_t stat_lock;
 	struct thr_info replies_thr;
 	cglock_t klist_lock;
@@ -216,6 +229,7 @@ struct klondike_info {
 	uint64_t hashcount;
 	uint64_t errorcount;
 	uint64_t noisecount;
+	int incorrect_slave_sequential;
 
 	// us Delay from USB reply to being processed
 	double delay_count;
@@ -540,7 +554,7 @@ static KLIST *GetReply(struct cgpu_info *klncgpu, uint8_t cmd, uint8_t dev)
 	KLIST *kitem;
 	int retries = CMD_REPLY_RETRIES;
 
-	while (retries-- > 0 && klninfo->shutdown == false) {
+	while (retries-- > 0 && klncgpu->shutdown == false) {
 		cgsleep_ms(REPLY_WAIT_TIME);
 		cg_rlock(&klninfo->klist_lock);
 		kitem = klninfo->used;
@@ -947,13 +961,13 @@ static void *klondike_get_replies(void *userdata)
 	struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
 	KLIST *kitem = NULL;
 	char *hexdata;
-	int err, recd, slaves, dev;
-	bool overheat;
+	int err, recd, slaves, dev, isc;
+	bool overheat, sent;
 
 	applog(LOG_DEBUG, "%s%i: listening for replies",
 			  klncgpu->drv->name, klncgpu->device_id);
 
-	while (klninfo->shutdown == false) {
+	while (klncgpu->shutdown == false) {
 		if (klncgpu->usbinfo.nodev)
 			return NULL;
 
@@ -1019,16 +1033,27 @@ static void *klondike_get_replies(void *userdata)
 						cgtime(&(klninfo->jobque[dev].last_update));
 						slaves = klninfo->status[0].kline.ws.slavecount;
 						overheat = klninfo->jobque[dev].overheat;
+						if (dev == 0) {
+							if (kitem->kline.ws.slavecount != slaves)
+								isc = ++klninfo->incorrect_slave_sequential;
+							else
+								isc = klninfo->incorrect_slave_sequential = 0;
+						}
 						wr_unlock(&(klninfo->stat_lock));
 
-						if (kitem->kline.ws.slavecount != slaves) {
-							applog(LOG_ERR, "%s%i:%d reply [%c] has a diff # of slaves=%d"
-									" (curr=%d) dropping device to hotplug",
-									klncgpu->drv->name, klncgpu->device_id,
-									dev, (char)(kitem->kline.ws.cmd),
+						if (isc) {
+							applog(LOG_ERR, "%s%i:%d reply [%c] has a diff"
+									" # of slaves=%d (curr=%d)%s",
+									klncgpu->drv->name,
+									klncgpu->device_id,
+									dev,
+									(char)(kitem->kline.ws.cmd),
 									(int)(kitem->kline.ws.slavecount),
-									slaves);
-							klninfo->shutdown = true;
+									slaves,
+									isc <= KLN_ISS_IGNORE ? "" :
+									 " disabling device");
+							if (isc > KLN_ISS_IGNORE)
+								usb_nodev(klncgpu);
 							break;
 						}
 
@@ -1048,15 +1073,16 @@ static void *klondike_get_replies(void *userdata)
 								zero_kline(&kline);
 								kline.hd.cmd = KLN_CMD_ABORT;
 								kline.hd.dev = dev;
-								if (!SendCmd(klncgpu, &kline, KSENDHD(0))) {
-									applog(LOG_ERR, "%s%i:%d failed to abort work"
-											" - dropping device to hotplug",
+								sent = SendCmd(klncgpu, &kline, KSENDHD(0));
+								kln_disable(klncgpu, dev, false);
+								if (!sent) {
+									applog(LOG_ERR, "%s%i:%d overheat failed to"
+											" abort work - disabling device",
 											klncgpu->drv->name,
 											klncgpu->device_id,
 											dev);
-									klninfo->shutdown = true;
+									usb_nodev(klncgpu);
 								}
-								kln_disable(klncgpu, dev, false);
 							}
 						}
 					}
@@ -1157,7 +1183,7 @@ static void klondike_shutdown(struct thr_info *thr)
 
 	kln_disable(klncgpu, klninfo->status[0].kline.ws.slavecount, true);
 
-	klncgpu->shutdown = klninfo->shutdown = true;
+	klncgpu->shutdown = true;
 }
 
 static void klondike_thread_enable(struct thr_info *thr)
@@ -1243,10 +1269,13 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
 {
 	struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
 	struct work *work = NULL;
-	int dev, queued, slaves, seq;
+	int dev, queued, slaves, seq, howlong;
 	struct timeval now;
 	bool nowork;
 
+	if (klncgpu->shutdown == true)
+		return true;
+
 	cgtime(&now);
 	rd_lock(&(klninfo->stat_lock));
 	slaves = klninfo->status[0].kline.ws.slavecount;
@@ -1255,7 +1284,7 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
 			klninfo->jobque[dev].late_update_count++;
 			seq = ++klninfo->jobque[dev].late_update_sequential;
 			rd_unlock(&(klninfo->stat_lock));
-			if (seq < KLN_LATE_UPDATE_LIMIT) {
+			if (seq < LATE_UPDATE_LIMIT) {
 				applog(LOG_ERR, "%s%i:%d late update",
 						klncgpu->drv->name, klncgpu->device_id, dev);
 				klondike_get_stats(klncgpu);
@@ -1263,17 +1292,22 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
 			} else {
 				applog(LOG_ERR, "%s%i:%d late update (%d) reached - attempting reset",
 						klncgpu->drv->name, klncgpu->device_id,
-						dev, KLN_LATE_UPDATE_LIMIT);
+						dev, LATE_UPDATE_LIMIT);
 				control_init(klncgpu);
 				kln_enable(klncgpu);
 				klondike_get_stats(klncgpu);
 				rd_lock(&(klninfo->stat_lock));
-				if (ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > LATE_UPDATE_MS) {
+				howlong = ms_tdiff(&now, &(klninfo->jobque[dev].last_update));
+				if (howlong > LATE_UPDATE_MS) {
 					rd_unlock(&(klninfo->stat_lock));
-					applog(LOG_ERR, "%s%i:%d reset failed - dropping device",
-							klncgpu->drv->name, klncgpu->device_id, dev);
-					klninfo->shutdown = true;
-					return false;
+					if (howlong > LATE_UPDATE_NODEV_MS) {
+						applog(LOG_ERR, "%s%i:%d reset failed - dropping device",
+								klncgpu->drv->name, klncgpu->device_id, dev);
+						usb_nodev(klncgpu);
+					} else
+						cgsleep_ms(LATE_UPDATE_SLEEP_MS);
+
+					return true;
 				}
 				break;
 			}
@@ -1360,6 +1394,7 @@ static int64_t klondike_scanwork(struct thr_info *thr)
 		klninfo->noncecount = 0;
 		rd_unlock(&(klninfo->stat_lock));
 	}
+
 	return newhashcount;
 }
 
diff --git a/usbutils.c b/usbutils.c
index 4c21e36..82dc312 100644
--- a/usbutils.c
+++ b/usbutils.c
@@ -1356,6 +1356,20 @@ static void release_cgpu(struct cgpu_info *cgpu)
 }
 
 /*
+ * Force a NODEV on a device so it goes back to hotplug
+ */
+void usb_nodev(struct cgpu_info *cgpu)
+{
+	int pstate;	// scratch passed to both DEVWLOCK and DEVWUNLOCK below
+
+	DEVWLOCK(cgpu, pstate);	// NOTE(review): presumably takes the device write lock - confirm against the macro
+
+	release_cgpu(cgpu);	// done between the lock/unlock pair; this is the call that forces the NODEV
+
+	DEVWUNLOCK(cgpu, pstate);	// paired with the DEVWLOCK above, same cgpu/pstate
+}
+
+/*
  * Use the same usbdev thus locking is across all related devices
  */
 struct cgpu_info *usb_copy_cgpu(struct cgpu_info *orig)
diff --git a/usbutils.h b/usbutils.h
index 2c9e5ad..4f0dfed 100644
--- a/usbutils.h
+++ b/usbutils.h
@@ -358,7 +358,8 @@ bool async_usb_transfers(void);
 void cancel_usb_transfers(void);
 void usb_all(int level);
 const char *usb_cmdname(enum usb_cmds cmd);
-void usb_applog(struct cgpu_info *bflsc, enum usb_cmds cmd, char *msg, int amount, int err);
+void usb_applog(struct cgpu_info *cgpu, enum usb_cmds cmd, char *msg, int amount, int err);
+void usb_nodev(struct cgpu_info *cgpu);
 struct cgpu_info *usb_copy_cgpu(struct cgpu_info *orig);
 struct cgpu_info *usb_alloc_cgpu(struct device_drv *drv, int threads);
 struct cgpu_info *usb_free_cgpu(struct cgpu_info *cgpu);