Commit 4e05c30bfa12d658253ea670995156e069fdb5f0

kanoi 2014-03-22T17:24:49

Merge pull request #569 from kanoi/master Ant S1 - overheat handling + API-README versions fix

diff --git a/API-README b/API-README
index 303a62b..a10017c 100644
--- a/API-README
+++ b/API-README
@@ -503,7 +503,7 @@ miner.php - an example web page to access the API
 Feature Changelog for external applications using the API:
 
 
-API V3.3 (cgminer v4.1.1)
+API V3.3 (cgminer v4.2.0)
 
 Added API commands:
  'edevs' - Only enabled devices, for 'devs'
@@ -1674,7 +1674,8 @@ With cgminer 2.10.2 and later, miner.php includes an extension to
 the custom pages that allows you to apply SQL style commands to
 the data: where, group, and having
 cgminer 3.4.2 and later also includes another option 'gen'
-cgminer 4.1.1 and later also includes 2 another options 'fmt' and 'bgen'
+cgminer 4.2.0 and later also includes another option 'fmt'
+cgminer 4.2.1 and later also includes another option 'bgen'
 
 An example of an 'ext' section in a more complex custom summary page:
 
diff --git a/driver-bitmain.c b/driver-bitmain.c
index 9dc88b7..6ea9930 100644
--- a/driver-bitmain.c
+++ b/driver-bitmain.c
@@ -748,6 +748,7 @@ static inline void record_temp_fan(struct bitmain_info *info, struct bitmain_rxs
 		info->fan[i] = bm->fan[i] * BITMAIN_FAN_FACTOR;
 	}
 	info->temp_num = bm->temp_num;
+	info->temp_hi = 0;
 	for (i = 0; i < bm->temp_num; i++) {
 		info->temp[i] = bm->temp[i];
 		/*
@@ -757,9 +758,10 @@ static inline void record_temp_fan(struct bitmain_info *info, struct bitmain_rxs
 		}*/
 		*temp_avg += info->temp[i];
 
-		if (info->temp[i] > info->temp_max) {
+		if (info->temp[i] > info->temp_max)
 			info->temp_max = info->temp[i];
-		}
+		if (info->temp[i] > info->temp_hi)
+			info->temp_hi = info->temp[i];
 	}
 
 	if (bm->temp_num > 0) {
@@ -804,14 +806,21 @@ static void bitmain_update_temps(struct cgpu_info *bitmain, struct bitmain_info 
 		info->temp_history_index = 0;
 		info->temp_sum = 0;
 	}
-	if (unlikely(info->temp_max >= opt_bitmain_overheat)) {
-		applog(LOG_WARNING, "%s%d: overheat! Idling",
-				    bitmain->drv->name, bitmain->device_id);
-		info->overheat = true;
-	} else if (info->overheat && info->temp_max <= opt_bitmain_temp) {
+	if (unlikely(info->temp_hi >= opt_bitmain_overheat)) {
+		if (!info->overheat) {
+			applog(LOG_WARNING, "%s%d: overheat! hi %dC limit %dC idling",
+					    bitmain->drv->name, bitmain->device_id,
+					    info->temp_hi, opt_bitmain_overheat);
+			info->overheat = true;
+			info->overheat_temp = info->temp_hi;
+			info->overheat_count++;
+			info->overheat_slept = 0;
+		}
+	} else if (info->overheat && info->temp_hi <= opt_bitmain_temp) {
 		applog(LOG_WARNING, "%s%d: cooled, restarting",
 				    bitmain->drv->name, bitmain->device_id);
 		info->overheat = false;
+		info->overheat_recovers++;
 	}
 }
 
@@ -1100,13 +1109,9 @@ static void *bitmain_get_results(void *userdata)
 			offset = 0;
 		}
 
-		/* As the usb read returns after just 1ms, sleep long enough
-		 * to leave the interface idle for writes to occur, but do not
-		 * sleep if we have been receiving data as more may be coming. */
-		//if (offset == 0)
-		//	cgsleep_ms_r(&ts_start, BITMAIN_READ_TIMEOUT);
+		// 2ms shouldn't be too much
+		cgsleep_ms(2);
 
-		//cgsleep_prepare_r(&ts_start);
 		applog(LOG_DEBUG, "%s%d: %s() read",
 				  bitmain->drv->name, bitmain->device_id, __func__);
 		ret = bitmain_read(bitmain, buf, rsize, BITMAIN_READ_TIMEOUT, C_BITMAIN_READ);
@@ -1496,6 +1501,48 @@ static bool bitmain_fill(struct cgpu_info *bitmain)
 	int timediff = 0;
 	K_ITEM *witem;
 
+	/*
+	 * Overheat just means we delay the next work.
+	 * Since the temperature reply is only found with a work reply,
+	 * we can only sleep and hope the device will cool down.
+	 * TODO: of course it may be possible to read the temperature
+	 * without sending work ...
+	 */
+	if (info->overheat == true) {
+		if (info->overheat_sleep_ms == 0)
+			info->overheat_sleep_ms = BITMAIN_OVERHEAT_SLEEP_MS_DEF;
+
+		/*
+		 * If we already slept, are still overheated, and the temp
+		 * didn't drop, lengthen the sleep time (up to the maximum) to
+		 * find a sleep time that causes a temperature drop
+		 */
+		if (info->overheat_slept) {
+			if (info->overheat_temp > info->temp_hi)
+				info->overheat_temp = info->temp_hi;
+			else {
+				if (info->overheat_sleep_ms < BITMAIN_OVERHEAT_SLEEP_MS_MAX)
+					info->overheat_sleep_ms += BITMAIN_OVERHEAT_SLEEP_MS_STEP;
+			}
+		}
+
+		applog(LOG_DEBUG, "%s%d: %s() sleeping %"PRIu32" - overheated",
+				  bitmain->drv->name, bitmain->device_id,
+				  __func__, info->overheat_sleep_ms);
+		cgsleep_ms(info->overheat_sleep_ms);
+		info->overheat_sleeps++;
+		info->overheat_slept = info->overheat_sleep_ms;
+		info->overheat_total_sleep += info->overheat_sleep_ms;
+	} else {
+		// If we slept and it cooled, then try a shorter sleep next time
+		if (info->overheat_slept) {
+			if (info->overheat_sleep_ms > BITMAIN_OVERHEAT_SLEEP_MS_MIN)
+				info->overheat_sleep_ms -= BITMAIN_OVERHEAT_SLEEP_MS_STEP;
+			info->overheat_slept = 0;
+		}
+
+	}
+
 	applog(LOG_DEBUG, "%s%d: %s() start",
 			  bitmain->drv->name, bitmain->device_id,
 			  __func__);
@@ -1787,6 +1834,24 @@ static struct api_data *bitmain_api_stats(struct cgpu_info *cgpu)
 					(float)(info->failed_search) : 0;
 	root = api_add_avg(root, "avg_failed", &avg, true);
 
+	root = api_add_int(root, "temp_hi", &(info->temp_hi), false);
+	root = api_add_bool(root, "overheat", &(info->overheat), true);
+	root = api_add_int(root, "overheat_temp", &(info->overheat_temp), true);
+	root = api_add_uint32(root, "overheat_count", &(info->overheat_count), true);
+	root = api_add_uint32(root, "overheat_sleep_ms", &(info->overheat_sleep_ms), true);
+	root = api_add_uint32(root, "overheat_sleeps", &(info->overheat_sleeps), true);
+	root = api_add_uint32(root, "overheat_slept", &(info->overheat_slept), true);
+	root = api_add_uint64(root, "overheat_total_sleep", &(info->overheat_total_sleep), true);
+	root = api_add_uint32(root, "overheat_recovers", &(info->overheat_recovers), true);
+
+	root = api_add_int(root, "opt_bitmain_temp", &opt_bitmain_temp, false);
+	root = api_add_int(root, "opt_bitmain_overheat", &opt_bitmain_overheat, false);
+	root = api_add_int(root, "opt_bitmain_fan_min", &opt_bitmain_fan_min, false);
+	root = api_add_int(root, "opt_bitmain_fan_max", &opt_bitmain_fan_max, false);
+	root = api_add_int(root, "opt_bitmain_freq_min", &opt_bitmain_freq_min, false);
+	root = api_add_int(root, "opt_bitmain_freq_max", &opt_bitmain_freq_max, false);
+	root = api_add_bool(root, "opt_bitmain_auto", &opt_bitmain_auto, false);
+
 	return root;
 }
 
diff --git a/driver-bitmain.h b/driver-bitmain.h
index 605fdd9..be066c1 100644
--- a/driver-bitmain.h
+++ b/driver-bitmain.h
@@ -73,6 +73,11 @@
 #define BITMAIN_SEND_STATUS_TIME   10 //s
 #define BITMAIN_SEND_FULL_SPACE    128
 
+#define BITMAIN_OVERHEAT_SLEEP_MS_MAX 10000
+#define BITMAIN_OVERHEAT_SLEEP_MS_MIN 200
+#define BITMAIN_OVERHEAT_SLEEP_MS_DEF 600
+#define BITMAIN_OVERHEAT_SLEEP_MS_STEP 200
+
 struct bitmain_txconfig_token {
 	uint8_t token_type;
 	uint8_t length;
@@ -187,6 +192,7 @@ struct bitmain_info {
 	int temp[BITMAIN_MAX_TEMP_NUM];
 
 	int temp_max;
+	int temp_hi;
 	int temp_avg;
 	int temp_history_count;
 	int temp_history_index;
@@ -219,6 +225,13 @@ struct bitmain_info {
 	bool reset;
 	bool overheat;
 	bool optimal;
+	int overheat_temp;
+	uint32_t overheat_count;
+	uint32_t overheat_sleep_ms;
+	uint32_t overheat_sleeps;
+	uint32_t overheat_slept;
+	uint64_t overheat_total_sleep;
+	uint32_t overheat_recovers;
 
 	// Work
 	K_LIST *work_list;