ants1 - slow down mining if overheat occurs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
diff --git a/driver-bitmain.c b/driver-bitmain.c
index 9dc88b7..20d3598 100644
--- a/driver-bitmain.c
+++ b/driver-bitmain.c
@@ -748,6 +748,7 @@ static inline void record_temp_fan(struct bitmain_info *info, struct bitmain_rxs
info->fan[i] = bm->fan[i] * BITMAIN_FAN_FACTOR;
}
info->temp_num = bm->temp_num;
+ info->temp_hi = 0;
for (i = 0; i < bm->temp_num; i++) {
info->temp[i] = bm->temp[i];
/*
@@ -757,9 +758,10 @@ static inline void record_temp_fan(struct bitmain_info *info, struct bitmain_rxs
}*/
*temp_avg += info->temp[i];
- if (info->temp[i] > info->temp_max) {
+ if (info->temp[i] > info->temp_max)
info->temp_max = info->temp[i];
- }
+ if (info->temp[i] > info->temp_hi)
+ info->temp_hi = info->temp[i];
}
if (bm->temp_num > 0) {
@@ -804,14 +806,21 @@ static void bitmain_update_temps(struct cgpu_info *bitmain, struct bitmain_info
info->temp_history_index = 0;
info->temp_sum = 0;
}
- if (unlikely(info->temp_max >= opt_bitmain_overheat)) {
- applog(LOG_WARNING, "%s%d: overheat! Idling",
- bitmain->drv->name, bitmain->device_id);
- info->overheat = true;
- } else if (info->overheat && info->temp_max <= opt_bitmain_temp) {
+ if (unlikely(info->temp_hi >= opt_bitmain_overheat)) {
+ if (!info->overheat) {
+ applog(LOG_WARNING, "%s%d: overheat! hi %dC limit %dC idling",
+ bitmain->drv->name, bitmain->device_id,
+ info->temp_hi, opt_bitmain_overheat);
+ info->overheat = true;
+ info->overheat_temp = info->temp_hi;
+ info->overheat_count++;
+ info->overheat_slept = 0;
+ }
+ } else if (info->overheat && info->temp_hi <= opt_bitmain_temp) {
applog(LOG_WARNING, "%s%d: cooled, restarting",
bitmain->drv->name, bitmain->device_id);
info->overheat = false;
+ info->overheat_recovers++;
}
}
@@ -1107,6 +1116,7 @@ static void *bitmain_get_results(void *userdata)
// cgsleep_ms_r(&ts_start, BITMAIN_READ_TIMEOUT);
//cgsleep_prepare_r(&ts_start);
+
applog(LOG_DEBUG, "%s%d: %s() read",
bitmain->drv->name, bitmain->device_id, __func__);
ret = bitmain_read(bitmain, buf, rsize, BITMAIN_READ_TIMEOUT, C_BITMAIN_READ);
@@ -1496,6 +1506,48 @@ static bool bitmain_fill(struct cgpu_info *bitmain)
int timediff = 0;
K_ITEM *witem;
+ /*
+ * Overheat just means delaying the next work.
+ * Since the temperature is only reported along with a work reply,
+ * we can only sleep and hope the device cools down.
+ * TODO: of course it may be possible to read the temperature
+ * without sending work ...
+ */
+ if (info->overheat == true) {
+ if (info->overheat_sleep_ms == 0)
+ info->overheat_sleep_ms = BITMAIN_OVERHEAT_SLEEP_MS_DEF;
+
+ /*
+ * If we slept and we are still here, and the temp didn't drop,
+ * increment the sleep time to find a sleep time that causes a
+ * temperature drop
+ */
+ if (info->overheat_slept) {
+ if (info->overheat_temp > info->temp_hi)
+ info->overheat_temp = info->temp_hi;
+ else {
+ if (info->overheat_sleep_ms < BITMAIN_OVERHEAT_SLEEP_MS_MAX)
+ info->overheat_sleep_ms += BITMAIN_OVERHEAT_SLEEP_MS_STEP;
+ }
+ }
+
+ applog(LOG_DEBUG, "%s%d: %s() sleeping %"PRIu32" - overheated",
+ bitmain->drv->name, bitmain->device_id,
+ __func__, info->overheat_sleep_ms);
+ cgsleep_ms(info->overheat_sleep_ms);
+ info->overheat_sleeps++;
+ info->overheat_slept = info->overheat_sleep_ms;
+ info->overheat_total_sleep += info->overheat_sleep_ms;
+ } else {
+ // If we slept and it cooled down, try a shorter sleep next time
+ if (info->overheat_slept) {
+ if (info->overheat_sleep_ms > BITMAIN_OVERHEAT_SLEEP_MS_MIN)
+ info->overheat_sleep_ms -= BITMAIN_OVERHEAT_SLEEP_MS_STEP;
+ info->overheat_slept = 0;
+ }
+
+ }
+
applog(LOG_DEBUG, "%s%d: %s() start",
bitmain->drv->name, bitmain->device_id,
__func__);
@@ -1787,6 +1839,24 @@ static struct api_data *bitmain_api_stats(struct cgpu_info *cgpu)
(float)(info->failed_search) : 0;
root = api_add_avg(root, "avg_failed", &avg, true);
+ root = api_add_int(root, "temp_hi", &(info->temp_hi), false);
+ root = api_add_bool(root, "overheat", &(info->overheat), true);
+ root = api_add_int(root, "overheat_temp", &(info->overheat_temp), true);
+ root = api_add_uint32(root, "overheat_count", &(info->overheat_count), true);
+ root = api_add_uint32(root, "overheat_sleep_ms", &(info->overheat_sleep_ms), true);
+ root = api_add_uint32(root, "overheat_sleeps", &(info->overheat_sleeps), true);
+ root = api_add_uint32(root, "overheat_slept", &(info->overheat_slept), true);
+ root = api_add_uint64(root, "overheat_total_sleep", &(info->overheat_total_sleep), true);
+ root = api_add_uint32(root, "overheat_recovers", &(info->overheat_recovers), true);
+
+ root = api_add_int(root, "opt_bitmain_temp", &opt_bitmain_temp, false);
+ root = api_add_int(root, "opt_bitmain_overheat", &opt_bitmain_overheat, false);
+ root = api_add_int(root, "opt_bitmain_fan_min", &opt_bitmain_fan_min, false);
+ root = api_add_int(root, "opt_bitmain_fan_max", &opt_bitmain_fan_max, false);
+ root = api_add_int(root, "opt_bitmain_freq_min", &opt_bitmain_freq_min, false);
+ root = api_add_int(root, "opt_bitmain_freq_max", &opt_bitmain_freq_max, false);
+ root = api_add_bool(root, "opt_bitmain_auto", &opt_bitmain_auto, false);
+
return root;
}
diff --git a/driver-bitmain.h b/driver-bitmain.h
index 605fdd9..be066c1 100644
--- a/driver-bitmain.h
+++ b/driver-bitmain.h
@@ -73,6 +73,11 @@
#define BITMAIN_SEND_STATUS_TIME 10 //s
#define BITMAIN_SEND_FULL_SPACE 128
+#define BITMAIN_OVERHEAT_SLEEP_MS_MAX 10000
+#define BITMAIN_OVERHEAT_SLEEP_MS_MIN 200
+#define BITMAIN_OVERHEAT_SLEEP_MS_DEF 600
+#define BITMAIN_OVERHEAT_SLEEP_MS_STEP 200
+
struct bitmain_txconfig_token {
uint8_t token_type;
uint8_t length;
@@ -187,6 +192,7 @@ struct bitmain_info {
int temp[BITMAIN_MAX_TEMP_NUM];
int temp_max;
+ int temp_hi;
int temp_avg;
int temp_history_count;
int temp_history_index;
@@ -219,6 +225,13 @@ struct bitmain_info {
bool reset;
bool overheat;
bool optimal;
+ int overheat_temp;
+ uint32_t overheat_count;
+ uint32_t overheat_sleep_ms;
+ uint32_t overheat_sleeps;
+ uint32_t overheat_slept;
+ uint64_t overheat_total_sleep;
+ uint32_t overheat_recovers;
// Work
K_LIST *work_list;