Commit 209e82611519fbf1cdc800133a1cb9aefa00e58f

kanoi 2014-02-04T17:32:33

Merge pull request #545 from kanoi/master bab - more speed maangement and dead stats

diff --git a/driver-bab.c b/driver-bab.c
index 68c5d01..1a3a176 100644
--- a/driver-bab.c
+++ b/driver-bab.c
@@ -144,8 +144,9 @@ static const uint32_t bab_test_data[BAB_TEST_DATA] = {
  */
 #define BAB_MAXSPEED 57
 #define BAB_DEFMAXSPEED 55
-#define BAB_DEFSPEED 54
+#define BAB_DEFSPEED 53
 #define BAB_MINSPEED 52
+#define BAB_ABSMINSPEED 32
 
 /*
  * % of errors to tune the speed up or down
@@ -345,6 +346,8 @@ struct bab_info {
 	int chips;
 	int chips_per_bank[BAB_MAXBANKS+1];
 	int missing_chips_per_bank[BAB_MAXBANKS+1];
+	int bank_first_chip[BAB_MAXBANKS+1];
+	int bank_last_chip[BAB_MAXBANKS+1];
 	int boards;
 	int banks;
 	uint32_t chip_spis[BAB_MAXCHIPS+1];
@@ -667,7 +670,7 @@ static void _bab_reset(__maybe_unused struct cgpu_info *babcgpu, struct bab_info
 // TODO: handle a false return where this is called?
 static bool _bab_txrx(struct cgpu_info *babcgpu, struct bab_info *babinfo, K_ITEM *item, bool detect_ignore, const char *file, const char *func, const int line)
 {
-	int bank, i, count;
+	int bank, i, count, chip1, chip2;
 	uint32_t siz, pos;
 	struct spi_ioc_transfer tran;
 	uintptr_t rbuf, wbuf;
@@ -751,9 +754,31 @@ static bool _bab_txrx(struct cgpu_info *babcgpu, struct bab_info *babinfo, K_ITE
 		count++;
 		if (ioctl(babinfo->spifd, SPI_IOC_MESSAGE(1), (void *)&tran) < 0) {
 			if (!detect_ignore || errno != 110) {
-				applog(LOG_ERR, "%s%d: ioctl (%d) failed err=%d" BAB_FFL,
-						babcgpu->drv->name, babcgpu->device_id,
-						count, errno, BAB_FFL_PASS);
+				for (bank = BAB_MAXBANKS; bank >= 0; bank--) {
+					if (DATAS(item)->bank_off[bank] &&
+					    pos >= DATAS(item)->bank_off[bank]) {
+						break;
+					}
+				}
+				for (chip1 = babinfo->chips-1; chip1 >= 0; chip1--) {
+					if (DATAS(item)->chip_off[chip1] &&
+					    pos >= DATAS(item)->chip_off[chip1]) {
+						break;
+					}
+				}
+				for (chip2 = babinfo->chips-1; chip2 >= 0; chip2--) {
+					if (DATAS(item)->chip_off[chip2] &&
+					    (pos + tran.len) >= DATAS(item)->chip_off[chip2]) {
+						break;
+					}
+				}
+				applog(LOG_ERR, "%s%d: ioctl (%d) siz=%d bank=%d chip=%d-%d"
+						" failed err=%d" BAB_FFL,
+						babcgpu->drv->name,
+						babcgpu->device_id,
+						count, (int)(tran.len),
+						bank, chip1, chip2,
+						errno, BAB_FFL_PASS);
 			}
 			return false;
 		}
@@ -1202,6 +1227,8 @@ static void bab_init_chips(struct cgpu_info *babcgpu, struct bab_info *babinfo)
 			bab_detect_chips(babcgpu, babinfo, 0, BAB_V1_CHIP_TEST, BAB_MAXCHIPS);
 		}
 		babinfo->chips_per_bank[BAB_V1_BANK] = babinfo->chips;
+		babinfo->bank_first_chip[BAB_V1_BANK] = 0;
+		babinfo->bank_last_chip[BAB_V1_BANK] = babinfo->chips - 1;
 		babinfo->boards = (int)((float)(babinfo->chips - 1) / BAB_BOARDCHIPS) + 1;
 		babinfo->reply_wait = BAB_REPLY_WAIT_mS * 2;
 
@@ -1229,6 +1256,10 @@ static void bab_init_chips(struct cgpu_info *babcgpu, struct bab_info *babinfo)
 			bab_detect_chips(babcgpu, babinfo, bank, babinfo->chips, babinfo->chips + BAB_BANKCHIPS);
 			new_chips = babinfo->chips - chips;
 			babinfo->chips_per_bank[bank] = new_chips;
+			if (new_chips > 0) {
+				babinfo->bank_first_chip[bank] = babinfo->chips - new_chips;
+				babinfo->bank_last_chip[bank] = babinfo->chips - 1;
+			}
 			chips = babinfo->chips;
 			if (new_chips == 0)
 				boards = 0;
@@ -1291,39 +1322,47 @@ static void bab_get_options(struct cgpu_info *babcgpu, struct bab_info *babinfo)
 			case 0:
 				if (*ptr && tolower(*ptr) != 'd') {
 					val = atoi(ptr);
-					if (!isdigit(*ptr) || val < BAB_MINSPEED || val > BAB_MAXSPEED) {
+					if (!isdigit(*ptr) || val < BAB_ABSMINSPEED || val > BAB_MAXSPEED) {
 						quit(1, "%s"INVOP"%s '%s' must be %d <= %s <= %d",
 							babcgpu->drv->dname,
 							bab_options[which],
-							ptr, BAB_MINSPEED,
+							ptr, BAB_ABSMINSPEED,
 							bab_options[which],
 							BAB_MAXSPEED);
 					}
 					babinfo->max_speed = (uint8_t)val;
+					// Adjust def,min down if they are above max specified
+					if (babinfo->def_speed > babinfo->max_speed)
+						babinfo->def_speed = babinfo->max_speed;
+					if (babinfo->min_speed > babinfo->max_speed)
+						babinfo->min_speed = babinfo->max_speed;
 				}
 				break;
 			case 1:
 				if (*ptr && tolower(*ptr) != 'd') {
 					val = atoi(ptr);
-					if (!isdigit(*ptr) || val < BAB_MINSPEED || val > babinfo->max_speed) {
+					if (!isdigit(*ptr) || val < BAB_ABSMINSPEED || val > babinfo->max_speed) {
 						quit(1, "%s"INVOP"%s '%s' must be %d <= %s <= %d",
 							babcgpu->drv->dname,
 							bab_options[which],
-							ptr, BAB_MINSPEED,
+							ptr, BAB_ABSMINSPEED,
 							bab_options[which],
 							babinfo->max_speed);
 					}
 					babinfo->def_speed = (uint8_t)val;
+					// Adjust min down if is is above def specified
+					if (babinfo->min_speed > babinfo->def_speed)
+						babinfo->min_speed = babinfo->def_speed;
 				}
 				break;
 			case 2:
 				if (*ptr && tolower(*ptr) != 'd') {
 					val = atoi(ptr);
-					if (!isdigit(*ptr) || val < BAB_MINSPEED || val > babinfo->def_speed) {
+					if (!isdigit(*ptr) || val < BAB_ABSMINSPEED || val > babinfo->def_speed) {
 						quit(1, "%s"INVOP"%s '%s' must be %d <= %s <= %d",
 							babcgpu->drv->dname,
 							bab_options[which],
-							ptr, BAB_MINSPEED,
+							ptr, BAB_ABSMINSPEED,
 							bab_options[which],
 							babinfo->def_speed);
 					}
@@ -1423,7 +1462,7 @@ static void bab_detect(bool hotplug)
 
 	babinfo->max_speed = BAB_DEFMAXSPEED;
 	babinfo->def_speed = BAB_DEFSPEED;
-	babinfo->min_speed = BAB_MINSPEED;
+	babinfo->min_speed = BAB_ABSMINSPEED;
 
 	babinfo->tune_up = BAB_TUNEUP;
 	babinfo->tune_down = BAB_TUNEDOWN;
@@ -1450,6 +1489,11 @@ static void bab_detect(bool hotplug)
 	babinfo->spi_list = k_new_store(babinfo->sfree_list);
 	babinfo->spi_sent = k_new_store(babinfo->sfree_list);
 
+	for (i = 0; i <= BAB_MAXBANKS; i++) {
+		babinfo->bank_first_chip[i] = -1;
+		babinfo->bank_last_chip[i] = -1;
+	}
+
 	bab_init_chips(babcgpu, babinfo);
 
 	if (babinfo->boards) {
@@ -1697,12 +1741,12 @@ static void process_history(struct cgpu_info *babcgpu, int chip, struct timeval 
 			item = babinfo->bad_nonces[i]->tail;
 		}
 	}
-	K_WUNLOCK(babinfo->nfree_list);
-
-	// Tuning ...
 	good_nonces = babinfo->good_nonces[chip]->count;
 	bad_nonces = babinfo->bad_nonces[chip]->count;
 
+	K_WUNLOCK(babinfo->nfree_list);
+
+	// Tuning ...
 	if (tdiff(now, &(babinfo->first_work[chip])) >= HISTORY_TIME_S &&
 	    tdiff(now, &(babinfo->last_tune[chip])) >= HISTORY_TIME_S &&
 	    (good_nonces + bad_nonces) > 0) {
@@ -1759,8 +1803,16 @@ static void process_history(struct cgpu_info *babcgpu, int chip, struct timeval 
 
 		tune = (double)bad_nonces / (double)(good_nonces + bad_nonces) * 100.0;
 
-		// Tune it down if error rate is too high
-		if (tune >= babinfo->tune_down) {
+		/*
+		 * TODO: it appears some chips just get a % bad at low speeds
+		 * so we should handle them by weighting the speed reduction vs
+		 * the HW% gained from the reduction (i.e. GH/s)
+		 * Maybe handle that when they hit min_speed, then do a gradual speed
+		 * up verifying if it is really making GH/s worse or better
+		 */
+
+		// Tune it down if error rate is too high (and it's above min)
+		if (tune >= babinfo->tune_down && chip_fast > babinfo->min_speed) {
 			babinfo->chip_fast[chip]--;
 
 			applog(LOG_WARNING, "%s%d: Chip %d High errors %.2f%% - speed down %d to %d",
@@ -2271,7 +2323,9 @@ static struct api_data *bab_api_stats(struct cgpu_info *babcgpu)
 	struct api_data *root = NULL;
 	char data[2048];
 	char buf[32];
-	int spi_work, chip_work, i, to, j, sp, bank, chip_off;
+	int spi_work, chip_work, sp, chip, bank, chip_off, board, last_board;
+	int i, to, j, k;
+	bool dead;
 	struct timeval now;
 	double elapsed, ghs;
 	float ghs_sum, his_ghs_tot;
@@ -2641,6 +2695,106 @@ static struct api_data *bab_api_stats(struct cgpu_info *babcgpu)
 	free(tmp);
 	tmp = NULL;
 
+	switch (babinfo->version) {
+		case 1:
+			i = j = BAB_V1_BANK;
+			break;
+		case 2:
+			i = 1;
+			j = BAB_MAXBANKS;
+			break;
+	}
+	data[0] = '\0';
+	for (bank = i; bank <= j; bank++) {
+		if (babinfo->bank_first_chip[bank] >= 0) {
+			chip = babinfo->bank_first_chip[bank];
+			to = babinfo->bank_last_chip[bank];
+			for (; chip <= to; chip += BAB_BOARDCHIPS) {
+				dead = true;
+				for (k = chip; (k <= to) && (k < (chip+BAB_BOARDCHIPS)); k++) {
+					if (history_elapsed[k] > 0) {
+						double num = history_good[k];
+						// exclude the first nonce?
+						if (elapsed_is_good[k])
+							num--;
+						ghs = num * 0xffffffffull /
+							history_elapsed[k] / 1000000000.0;
+					} else
+						ghs = 0;
+
+					if (ghs > 0.0) {
+						dead = false;
+						break;
+					}
+				}
+				if (dead) {
+					board = (int)((float)(chip - babinfo->bank_first_chip[bank]) /
+							BAB_BOARDCHIPS) + 1;
+					snprintf(buf, sizeof(buf),
+							"%s%d/%d%s",
+							data[0] ? " " : "",
+							bank, board,
+							babinfo->missing_chips_per_bank[bank] ?
+							"?" : "");
+					strcat(data, buf);
+				}
+			}
+		}
+	}
+	root = api_add_string(root, "History Dead Boards", data[0] ? data : "None", true);
+
+	data[0] = '\0';
+	for (bank = i; bank <= j; bank++) {
+		if (babinfo->bank_first_chip[bank] >= 0) {
+			to = babinfo->bank_first_chip[bank];
+			chip = babinfo->bank_last_chip[bank];
+			for (; chip >= to; chip--) {
+				dead = true;
+				if (history_elapsed[chip] > 0) {
+					double num = history_good[chip];
+					// exclude the first nonce?
+					if (elapsed_is_good[chip])
+						num--;
+					ghs = num * 0xffffffffull /
+						history_elapsed[chip] / 1000000000.0;
+				} else
+					ghs = 0;
+
+				if (ghs > 0.0)
+					break;
+			}
+			/*
+			 * The output here is: a/b+c/d
+			 * a/b is the SPI/board that starts the Dead Chain
+			 * c is the number of boards after a
+			 * d is the total number of chips in the Dead Chain
+			 * A Dead Chain is a continous set of dead chips that
+			 * finish at the end of an SPI chain of boards
+			 * This might be caused by the first board, or the cables attached
+			 * to the first board, in the Dead Chain i.e. a/b
+			 * If c is zero, it's just the last board, so it's the same as any
+			 * other board having dead chips
+			 */
+			if (chip < babinfo->bank_last_chip[bank]) {
+				board = (int)((float)(chip - babinfo->bank_first_chip[bank]) /
+						BAB_BOARDCHIPS) + 1;
+				last_board = (int)((float)(babinfo->bank_last_chip[bank] -
+						babinfo->bank_first_chip[bank]) /
+						BAB_BOARDCHIPS) + 1;
+				snprintf(buf, sizeof(buf),
+						"%s%d/%d%s+%d/%d",
+						data[0] ? " " : "",
+						bank, board,
+						babinfo->missing_chips_per_bank[bank] ?
+						"?" : "",
+						last_board - board,
+						babinfo->bank_last_chip[bank] - chip);
+				strcat(data, buf);
+			}
+		}
+	}
+	root = api_add_string(root, "History Dead Chains", data[0] ? data : "None", true);
+
 	for (i = 0; i < BAB_NONCE_OFFSETS; i++) {
 		snprintf(buf, sizeof(buf), "Nonce Offset 0x%08x", bab_nonce_offsets[i]);
 		root = api_add_uint64(root, buf, &(babinfo->nonce_offset_count[i]), true);