Add per-die throttling control to the hfa driver: each die's clock is adjusted according to that die's own temperature by issuing a work restart with the new clock speed, so that the die is kept at or below a configurable target temperature (--hfa-temp-target).
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311
diff --git a/cgminer.c b/cgminer.c
index 7781b0c..663977f 100644
--- a/cgminer.c
+++ b/cgminer.c
@@ -1236,6 +1236,9 @@ static struct opt_table opt_config_table[] = {
OPT_WITH_ARG("--hfa-temp-overheat",
set_int_0_to_200, opt_show_intval, &opt_hfa_overheat,
"Set the hashfast overheat throttling temperature"),
+ OPT_WITH_ARG("--hfa-temp-target",
+ set_int_0_to_200, opt_show_intval, &opt_hfa_target,
+ "Set the hashfast target temperature"),
#endif
#ifdef USE_KLONDIKE
OPT_WITH_ARG("--klondike-options",
diff --git a/driver-hashfast.c b/driver-hashfast.c
index 0ff2f41..3eb69c6 100644
--- a/driver-hashfast.c
+++ b/driver-hashfast.c
@@ -20,7 +20,8 @@
int opt_hfa_ntime_roll = 1;
int opt_hfa_hash_clock = HFA_CLOCK_DEFAULT;
-int opt_hfa_overheat = HFA_OVERHEAT_DEFAULT;
+int opt_hfa_overheat = HFA_TEMP_OVERHEAT;
+int opt_hfa_target = HFA_TEMP_TARGET;
bool opt_hfa_pll_bypass;
bool opt_hfa_dfu_boot;
@@ -433,6 +434,7 @@ static bool hfa_detect_common(struct cgpu_info *hashfast)
{
struct hashfast_info *info;
bool ret;
+ int i;
info = calloc(sizeof(struct hashfast_info), 1);
if (!info)
@@ -456,6 +458,12 @@ static bool hfa_detect_common(struct cgpu_info *hashfast)
if (unlikely(!(info->die_status)))
quit(1, "Failed to calloc die_status");
+ info->die_data = calloc(info->asic_count, sizeof(struct hf_die_data));
+ if (unlikely(!(info->die_data)))
+ quit(1, "Failed to calloc die_data");
+ for (i = 0; i < info->asic_count; i++)
+ info->die_data[i].hash_clock = info->hash_clock_rate;
+
// The per-die statistics array
info->die_statistics = calloc(info->asic_count, sizeof(struct hf_long_statistics));
if (unlikely(!(info->die_statistics)))
@@ -659,24 +667,35 @@ static void hfa_update_die_status(struct cgpu_info *hashfast, struct hashfast_in
float die_temperature;
float core_voltage[6];
- if (info->device_type == HFD_G1) {
- // Copy in the data. They're numbered sequentially from the starting point
- ds = info->die_status + h->chip_address;
- for (i = 0; i < num_included; i++)
- memcpy(ds++, d++, sizeof(struct hf_g1_die_data));
-
- for (i = 0, d = &info->die_status[h->chip_address]; i < num_included; i++, d++) {
- die_temperature = GN_DIE_TEMPERATURE(d->die.die_temperature);
- for (j = 0; j < 6; j++)
- core_voltage[j] = GN_CORE_VOLTAGE(d->die.core_voltage[j]);
-
- applog(LOG_DEBUG, "%s %d: die %2d: OP_DIE_STATUS Temps die %.1fC board %.1fC vdd's %.2f %.2f %.2f %.2f %.2f %.2f",
- hashfast->drv->name, hashfast->device_id, h->chip_address + i, die_temperature, board_temperature(d->temperature),
- core_voltage[0], core_voltage[1], core_voltage[2],
- core_voltage[3], core_voltage[4], core_voltage[5]);
- // XXX Convert board phase currents, voltage, temperature
- }
+ // Copy in the data. They're numbered sequentially from the starting point
+ ds = info->die_status + h->chip_address;
+ for (i = 0; i < num_included; i++)
+ memcpy(ds++, d++, sizeof(struct hf_g1_die_data));
+
+ info->max_temp = 0;
+ for (i = 0, d = &info->die_status[h->chip_address]; i < num_included; i++, d++) {
+ int die = h->chip_address + i;
+
+ die_temperature = GN_DIE_TEMPERATURE(d->die.die_temperature);
+ info->die_data[die].temp = die_temperature;
+ if (die_temperature > info->max_temp)
+ info->max_temp = die_temperature;
+ for (j = 0; j < 6; j++)
+ core_voltage[j] = GN_CORE_VOLTAGE(d->die.core_voltage[j]);
+
+ applog(LOG_DEBUG, "%s %d: die %2d: OP_DIE_STATUS Temps die %.1fC board %.1fC vdd's %.2f %.2f %.2f %.2f %.2f %.2f",
+ hashfast->drv->name, hashfast->device_id, die, die_temperature, board_temperature(d->temperature),
+ core_voltage[0], core_voltage[1], core_voltage[2],
+ core_voltage[3], core_voltage[4], core_voltage[5]);
+ // XXX Convert board phase currents, voltage, temperature
}
+
+ if (unlikely(info->max_temp >= opt_hfa_overheat)) {
+ /* -1 means new overheat condition */
+ if (!info->overheat)
+ info->overheat = -1;
+ } else if (unlikely(info->overheat && info->max_temp < opt_hfa_overheat - HFA_TEMP_HYSTERESIS))
+ info->overheat = 0;
}
static void hfa_parse_nonce(struct thr_info *thr, struct cgpu_info *hashfast,
@@ -899,8 +918,8 @@ static int hfa_jobs(struct cgpu_info *hashfast, struct hashfast_info *info)
if (unlikely(info->overheat)) {
/* Acknowledge and notify of new condition.*/
if (info->overheat < 0) {
- applog(LOG_WARNING, "%s %d: Hit overheat temp, throttling!",
- hashfast->drv->name, hashfast->device_id);
+ applog(LOG_WARNING, "%s %d: Hit overheat temp %.1f, throttling!",
+ hashfast->drv->name, hashfast->device_id, info->max_temp);
/* Value of 1 means acknowledged overheat */
info->overheat = 1;
}
@@ -919,6 +938,79 @@ out:
return ret;
}
+/* Speed one die back up: raise its hash clock by up to 5, clamped so it does
+ * not pass info->hash_clock_rate, then send an OP_WORK_RESTART frame whose
+ * payload is a bitmask selecting only that die. */
+static void hfa_increase_clock(struct cgpu_info *hashfast, struct hashfast_info *info,
+			       int die)
+{
+	struct hf_die_data *die_data = &info->die_data[die];
+	const int headroom = info->hash_clock_rate - die_data->hash_clock;
+	uint32_t die_mask = 0x00000001ul << die;
+	uint16_t step = 5;
+	uint16_t hdata;
+
+	/* Shrink the final step so the die lands exactly on the device rate. */
+	if (headroom < step)
+		step = headroom;
+	die_data->hash_clock += step;
+
+	applog(LOG_INFO, "%s %d: Die temp below range %.1f, increasing die %d clock to %d",
+	       hashfast->drv->name, hashfast->device_id, die_data->temp, die, die_data->hash_clock);
+
+	/* Operation code sits above the low 12 bits, the clock delta below. */
+	hdata = (WR_MHZ_INCREASE << 12) | step;
+	hfa_send_frame(hashfast, HF_USB_CMD(OP_WORK_RESTART), hdata, (uint8_t *)&die_mask, 4);
+}
+
+/* Lower one die's hash clock by up to 10, never going below HFA_CLOCK_MIN,
+ * and send an OP_WORK_RESTART frame carrying the decrease to that die only.
+ *
+ * hashfast: device the die belongs to (used for logging and the USB frame).
+ * info:     driver state holding the per-die die_data array.
+ * die:      index of the die to slow down; also selects its bit in the mask
+ *           sent as the frame payload.
+ *
+ * Does nothing when the die is already at or below HFA_CLOCK_MIN. */
+static void hfa_decrease_clock(struct cgpu_info *hashfast, struct hashfast_info *info,
+			       int die)
+{
+	struct hf_die_data *hdd = &info->die_data[die];
+	uint32_t diebit = 0x00000001ul << die;
+	uint16_t hdata, decrease = 10;
+
+	/* A clock at or below the floor leaves no room to decrease. Without
+	 * this guard the clamp below would compute a negative amount, which
+	 * wraps when stored in the unsigned 16 bit decrease and corrupts both
+	 * the tracked clock and the command word. */
+	if (hdd->hash_clock <= HFA_CLOCK_MIN)
+		return;
+
+	/* Shrink the final step so the die lands exactly on the floor. */
+	if (hdd->hash_clock - decrease < HFA_CLOCK_MIN)
+		decrease = hdd->hash_clock - HFA_CLOCK_MIN;
+	hdd->hash_clock -= decrease;
+	applog(LOG_INFO, "%s %d: Die temp above range %.1f, decreasing die %d clock to %d",
+	       hashfast->drv->name, hashfast->device_id, info->die_data[die].temp, die, hdd->hash_clock);
+	/* Operation code sits above the low 12 bits, the clock delta below. */
+	hdata = (WR_MHZ_DECREASE << 12) | decrease;
+	hfa_send_frame(hashfast, HF_USB_CMD(OP_WORK_RESTART), hdata, (uint8_t *)&diebit, 4);
+}
+
+/* Adjust each die's clock according to its temperature if need be, by
+ * changing the clock setting and issuing a work restart with the new clock
+ * speed.
+ *
+ * A die is left alone while its temperature lies between
+ * opt_hfa_target - HFA_TEMP_HYSTERESIS and opt_hfa_target inclusive. Hotter
+ * dies are slowed down (not below HFA_CLOCK_MIN) and cooler dies are sped
+ * back up (not above info->hash_clock_rate).
+ *
+ * hashfast: device being managed (passed through for logging and USB I/O).
+ * info:     driver state holding asic_count and the per-die die_data array. */
+static void hfa_temp_clock(struct cgpu_info *hashfast, struct hashfast_info *info)
+{
+	const int target_low = opt_hfa_target - HFA_TEMP_HYSTERESIS;
+	time_t now_t = time(NULL);
+	int i;
+
+	for (i = 0; i < info->asic_count; i++) {
+		struct hf_die_data *hdd = &info->die_data[i];
+
+		/* Send a restart to any one die at most once every 30 seconds. */
+		if (now_t - hdd->last_restart < 30)
+			continue;
+
+		/* Sanity check: a zero temp is what the zeroed allocation
+		 * holds before any reading is stored, and anything above 255
+		 * is treated as a bogus reading. */
+		if (unlikely(hdd->temp == 0.0 || hdd->temp > 255))
+			continue;
+
+		/* In target temperature range */
+		if (hdd->temp >= target_low && hdd->temp <= opt_hfa_target)
+			continue;
+
+		if (hdd->temp > opt_hfa_target) {
+			/* Temp above target range */
+
+			/* Already at min speed. Compare with <= rather than
+			 * == so a die whose clock is below the floor is not
+			 * pushed even lower. */
+			if (hdd->hash_clock <= HFA_CLOCK_MIN)
+				continue;
+			hfa_decrease_clock(hashfast, info, i);
+		} else {
+			/* Temp below target range */
+
+			/* Already at max speed. Compare with >= rather than
+			 * == so a die is never raised past the device rate. */
+			if (hdd->hash_clock >= info->hash_clock_rate)
+				continue;
+			hfa_increase_clock(hashfast, info, i);
+		}
+		hdd->last_restart = now_t;
+	}
+}
+
static int64_t hfa_scanwork(struct thr_info *thr)
{
struct cgpu_info *hashfast = thr->cgpu;
@@ -952,6 +1044,8 @@ static int64_t hfa_scanwork(struct thr_info *thr)
hashfast->device_id);
}
+ hfa_temp_clock(hashfast, info);
+
if (unlikely(thr->work_restart)) {
restart:
thr->work_restart = false;
@@ -1091,6 +1185,7 @@ static struct api_data *hfa_api_stats(struct cgpu_info *cgpu)
int j;
root = api_add_int(root, "Core", &i, true);
+ root = api_add_int(root, "hash clockrate", &(info->die_data[i].hash_clock), false);
val = GN_DIE_TEMPERATURE(d->die.die_temperature);
root = api_add_double(root, "die temperature", &val, true);
val = board_temperature(d->temperature);
@@ -1119,22 +1214,16 @@ static struct api_data *hfa_api_stats(struct cgpu_info *cgpu)
static void hfa_statline_before(char *buf, size_t bufsiz, struct cgpu_info *hashfast)
{
struct hashfast_info *info = hashfast->device_data;
- double max_temp, max_volt;
struct hf_g1_die_data *d;
+ double max_volt;
int i;
- max_temp = max_volt = 0.0;
+ max_volt = 0.0;
for (i = 0; i < info->asic_count; i++) {
- double temp;
int j;
d = &info->die_status[i];
- temp = GN_DIE_TEMPERATURE(d->die.die_temperature);
- /* Sanity check on temp since we change it lockless it can
- * rarely read a massive value */
- if (temp > max_temp && temp < 200)
- max_temp = temp;
for (j = 0; j < 6; j++) {
double volt = GN_CORE_VOLTAGE(d->die.core_voltage[j]);
@@ -1143,14 +1232,7 @@ static void hfa_statline_before(char *buf, size_t bufsiz, struct cgpu_info *hash
}
}
- tailsprintf(buf, bufsiz, " max%3.0fC %3.2fV | ", max_temp, max_volt);
-
- if (unlikely(max_temp >= opt_hfa_overheat)) {
- /* -1 means new overheat condition */
- if (!info->overheat)
- info->overheat = -1;
- } else if (unlikely(info->overheat))
- info->overheat = 0;
+ tailsprintf(buf, bufsiz, " max%3.0fC %3.2fV | ", info->max_temp, max_volt);
}
static void hfa_init(struct cgpu_info __maybe_unused *hashfast)
@@ -1183,6 +1265,7 @@ static void hfa_shutdown(struct thr_info *thr)
free(info->works);
free(info->die_statistics);
free(info->die_status);
+ free(info->die_data);
/* Don't free info here since it will be accessed by statline before
* if a device is removed. */
}
diff --git a/driver-hashfast.h b/driver-hashfast.h
index ea09100..65b6825 100644
--- a/driver-hashfast.h
+++ b/driver-hashfast.h
@@ -19,12 +19,16 @@
int opt_hfa_ntime_roll;
int opt_hfa_hash_clock;
int opt_hfa_overheat;
+int opt_hfa_target;
bool opt_hfa_pll_bypass;
bool opt_hfa_dfu_boot;
#define HASHFAST_MINER_THREADS 1
#define HFA_CLOCK_DEFAULT 550
-#define HFA_OVERHEAT_DEFAULT 90
+#define HFA_CLOCK_MIN 125
+#define HFA_TEMP_OVERHEAT 90
+#define HFA_TEMP_TARGET 85
+#define HFA_TEMP_HYSTERESIS 3
// Matching fields for hf_statistics, but large #s for local accumulation, per-die
struct hf_long_statistics {
@@ -73,6 +77,13 @@ struct hf_long_usb_stats1 {
uint8_t max_rx_buffers;
};
+/* Private per die data for dynamic clocking. One entry per die, allocated
+ * zeroed in hfa_detect_common() and indexed by die number. */
+struct hf_die_data {
+ int hash_clock; // Clock currently tracked for this die; starts at info->hash_clock_rate
+ double temp; // Last die temperature stored by hfa_update_die_status(); 0 until a reading arrives
+ time_t last_restart; // time() value when hfa_temp_clock() last changed this die's clock
+};
+
struct hashfast_info {
int asic_count; // # of chips in the chain
int core_count; // # of cores per chip
@@ -82,6 +93,7 @@ struct hashfast_info {
struct hf_g1_die_data *die_status; // Array of per-die voltage, current, temperature sensor data
struct hf_long_statistics *die_statistics; // Array of per-die error counters
struct hf_long_usb_stats1 stats1;
+ struct hf_die_data *die_data;
int hash_clock_rate; // Hash clock rate to use, in Mhz
struct hf_usb_init_base usb_init_base; // USB Base information from USB_INIT
struct hf_config_data config_data; // Configuration data used from USB_INIT
@@ -103,6 +115,7 @@ struct hashfast_info {
int no_matching_work;
int resets;
int overheat;
+ double max_temp;
pthread_t read_thr;
};