Commit aaaa8a52fdc6ee0b7b894cfc0a417502d60dedbc

Kano 2012-09-30T17:37:01

Icarus: catch more USB errors and close/reopen the port

diff --git a/driver-icarus.c b/driver-icarus.c
index 4214c31..cc74df2 100644
--- a/driver-icarus.c
+++ b/driver-icarus.c
@@ -223,6 +223,11 @@ static void rev(unsigned char *s, size_t l)
 #define icarus_open2(devpath, baud, purge)  serial_open(devpath, baud, ICARUS_READ_FAULT_DECISECONDS, purge)
 #define icarus_open(devpath, baud)  icarus_open2(devpath, baud, false)
 
+#define ICA_GETS_ERROR -1
+#define ICA_GETS_OK 0
+#define ICA_GETS_RESTART 1
+#define ICA_GETS_TIMEOUT 2
+
 static int icarus_gets(unsigned char *buf, int fd, struct timeval *tv_finish, struct thr_info *thr, int read_count)
 {
 	ssize_t ret = 0;
@@ -233,12 +238,14 @@ static int icarus_gets(unsigned char *buf, int fd, struct timeval *tv_finish, st
 	// Read reply 1 byte at a time to get earliest tv_finish
 	while (true) {
 		ret = read(fd, buf, 1);
+		if (ret < 0)
+			return ICA_GETS_ERROR;
 
 		if (first)
 			gettimeofday(tv_finish, NULL);
 
 		if (ret >= read_amount)
-			return 0;
+			return ICA_GETS_OK;
 
 		if (ret > 0) {
 			buf += ret;
@@ -254,16 +261,16 @@ static int icarus_gets(unsigned char *buf, int fd, struct timeval *tv_finish, st
 					"Icarus Read: No data in %.2f seconds",
 					(float)rc/(float)TIME_FACTOR);
 			}
-			return 1;
+			return ICA_GETS_TIMEOUT;
 		}
 
-		if (thr->work_restart) {
+		if (thr && thr->work_restart) {
 			if (opt_debug) {
 				applog(LOG_DEBUG,
 					"Icarus Read: Work restart at %.2f seconds",
 					(float)(rc)/(float)TIME_FACTOR);
 			}
-			return 1;
+			return ICA_GETS_RESTART;
 		}
 	}
 }
@@ -281,6 +288,13 @@ static int icarus_write(int fd, const void *buf, size_t bufLen)
 
 #define icarus_close(fd) close(fd)
 
+static void do_icarus_close(struct thr_info *thr)
+{
+	struct cgpu_info *icarus = thr->cgpu;
+	icarus_close(icarus->device_fd);
+	icarus->device_fd = -1;
+}
+
 static const char *timing_mode_str(enum timing_mode timing_mode)
 {
 	switch(timing_mode) {
@@ -533,10 +547,7 @@ static bool icarus_detect_one(const char *devpath)
 	gettimeofday(&tv_start, NULL);
 
 	memset(nonce_bin, 0, sizeof(nonce_bin));
-	struct thr_info dummy = {
-		.work_restart = false,
-	};
-	icarus_gets(nonce_bin, fd, &tv_finish, &dummy, 1);
+	icarus_gets(nonce_bin, fd, &tv_finish, NULL, 1);
 
 	icarus_close(fd);
 
@@ -563,6 +574,7 @@ static bool icarus_detect_one(const char *devpath)
 	icarus = calloc(1, sizeof(struct cgpu_info));
 	icarus->api = &icarus_api;
 	icarus->device_path = strdup(devpath);
+	icarus->device_fd = -1;
 	icarus->threads = 1;
 	add_cgpu(icarus);
 	icarus_info = realloc(icarus_info, sizeof(struct ICARUS_INFO *) * (total_devices + 1));
@@ -607,6 +619,8 @@ static bool icarus_prepare(struct thr_info *thr)
 
 	struct timeval now;
 
+	icarus->device_fd = -1;
+
 	int fd = icarus_open(icarus->device_path, icarus_info[icarus->device_id]->baud);
 	if (unlikely(-1 == fd)) {
 		applog(LOG_ERR, "Failed to open Icarus on %s",
@@ -653,6 +667,17 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 	elapsed.tv_sec = elapsed.tv_usec = 0;
 
 	icarus = thr->cgpu;
+	if (icarus->device_fd == -1)
+		if (!icarus_prepare(thr)) {
+			applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
+			icarus->device_last_not_well = time(NULL);
+			icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
+			icarus->dev_comms_error_count++;
+
+			// fail the device if the reopen attempt fails
+			return -1;
+		}
+
 	fd = icarus->device_fd;
 
 	memset(ob_bin, 0, sizeof(ob_bin));
@@ -664,8 +689,10 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 	tcflush(fd, TCOFLUSH);
 #endif
 	ret = icarus_write(fd, ob_bin, sizeof(ob_bin));
-	if (ret)
-		return -1;	/* This should never happen */
+	if (ret) {
+		do_icarus_close(thr);
+		return 0;	/* This should never happen */
+	}
 
 	gettimeofday(&tv_start, NULL);
 
@@ -682,12 +709,19 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 	memset(nonce_bin, 0, sizeof(nonce_bin));
 	info = icarus_info[icarus->device_id];
 	ret = icarus_gets(nonce_bin, fd, &tv_finish, thr, info->read_count);
+	if (ret == ICA_GETS_ERROR) {
+		do_icarus_close(thr);
+		applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
+		icarus->device_last_not_well = time(NULL);
+		icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
+		icarus->dev_comms_error_count++;
+		return 0;
+	}
 
 	work->blk.nonce = 0xffffffff;
-	memcpy((char *)&nonce, nonce_bin, sizeof(nonce_bin));
 
 	// aborted before becoming idle, get new work
-	if (nonce == 0 && ret) {
+	if (ret == ICA_GETS_TIMEOUT || ret == ICA_GETS_RESTART) {
 		timersub(&tv_finish, &tv_start, &elapsed);
 
 		// ONLY up to just when it aborted
@@ -709,6 +743,8 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 		return estimate_hashes;
 	}
 
+	memcpy((char *)&nonce, nonce_bin, sizeof(nonce_bin));
+
 #if !defined (__BIG_ENDIAN__) && !defined(MIPSEB)
 	nonce = swab32(nonce);
 #endif
@@ -717,6 +753,10 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 	submit_nonce(thr, work, nonce);
 	was_hw_error = (curr_hw_errors > icarus->hw_errors);
 
+	// Force a USB close/reopen on any hw error
+	if (was_hw_error)
+		do_icarus_close(thr);
+
 	hash_count = (nonce & info->nonce_mask);
 	hash_count++;
 	hash_count *= info->fpga_count;
@@ -862,8 +902,7 @@ static struct api_data *icarus_api_stats(struct cgpu_info *cgpu)
 
 static void icarus_shutdown(struct thr_info *thr)
 {
-	struct cgpu_info *icarus = thr->cgpu;
-	icarus_close(icarus->device_fd);
+	do_icarus_close(thr);
 }
 
 struct device_api icarus_api = {