Commit 02295c69c89d5c5899f3e11d475672244215e0a6

Con Kolivas 2012-02-19T20:19:47

Flag devices that hit the thermal cutoff as recovering instead of disabling them outright, and re-enable them once they drop below the target temperature, provided --no-restart has not been set.

diff --git a/adl.c b/adl.c
index cb93814..fe2bb9b 100644
--- a/adl.c
+++ b/adl.c
@@ -330,7 +330,7 @@ void init_adl(int nDevs)
 			continue;
 		}
 
-		if (!gpus[gpu].enabled) {
+		if (gpus[gpu].deven == DEV_DISABLED) {
 			gpus[i].gpu_engine =
 			gpus[i].gpu_memclock =
 			gpus[i].gpu_vddc =
@@ -1025,7 +1025,7 @@ static void fan_autotune(int gpu, int temp, int fanpercent, bool __maybe_unused 
 	}
 }
 
-void gpu_autotune(int gpu, bool *enable)
+void gpu_autotune(int gpu, enum dev_enable *denable)
 {
 	int temp, fanpercent, engine, newengine, twintemp = 0;
 	bool fan_optimal = true;
@@ -1068,7 +1068,7 @@ void gpu_autotune(int gpu, bool *enable)
 	if (engine && ga->autoengine) {
 		if (temp > cgpu->cutofftemp) {
 			applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
-			*enable = false;
+			*denable = DEV_RECOVER;
 			newengine = ga->minspeed;
 		} else if (temp > ga->overtemp && engine > ga->minspeed) {
 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
@@ -1077,9 +1077,12 @@ void gpu_autotune(int gpu, bool *enable)
 			applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
 			newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
 			/* Only try to tune engine speed up if this GPU is not disabled */
-		} else if (temp < ga->targettemp && engine < ga->maxspeed && *enable) {
+		} else if (temp < ga->targettemp && engine < ga->maxspeed && *denable == DEV_ENABLED) {
 			applog(LOG_DEBUG, "Temperature below target, increasing clock speed");
 			newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
+		} else if (temp < ga->targettemp && *denable == DEV_RECOVER && opt_restart) {
+			applog(LOG_NOTICE, "Device recovered to temperature below target, re-enabling");
+			*denable = DEV_ENABLED;
 		}
 
 		if (newengine > ga->maxspeed)
diff --git a/adl.h b/adl.h
index 5b98f7e..f48c420 100644
--- a/adl.h
+++ b/adl.h
@@ -17,7 +17,7 @@ int gpu_fanpercent(int gpu);
 bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vddc,
 	       int *activity, int *fanspeed, int *fanpercent, int *powertune);
 void change_gpusettings(int gpu);
-void gpu_autotune(int gpu, bool *enable);
+void gpu_autotune(int gpu, enum dev_enable *denable);
 void clear_adl(int nDevs);
 #else /* HAVE_ADL */
 #define adl_active (0)
diff --git a/api.c b/api.c
index 296132d..1373425 100644
--- a/api.c
+++ b/api.c
@@ -522,7 +522,7 @@ static void gpustatus(int gpu, bool isjson)
 #endif
 			gt = gv = gm = gc = ga = gf = gp = pt = 0;
 
-		if (cgpu->enabled)
+		if (cgpu->deven != DEV_DISABLED)
 			enabled = (char *)YES;
 		else
 			enabled = (char *)NO;
@@ -830,7 +830,7 @@ static void gpuenable(__maybe_unused SOCKETTYPE c, char *param, bool isjson)
 		return;
 	}
 
-	if (gpus[id].enabled) {
+	if (gpus[id].deven != DEV_DISABLED) {
 		strcpy(io_buffer, message(MSG_ALRENA, id, NULL, isjson));
 		return;
 	}
@@ -844,7 +844,7 @@ static void gpuenable(__maybe_unused SOCKETTYPE c, char *param, bool isjson)
 				return;
 			}
 
-			gpus[id].enabled = true;
+			gpus[id].deven = DEV_ENABLED;
 			tq_push(thr->q, &ping);
 
 		}
@@ -873,12 +873,12 @@ static void gpudisable(__maybe_unused SOCKETTYPE c, char *param, bool isjson)
 		return;
 	}
 
-	if (!gpus[id].enabled) {
+	if (gpus[id].deven == DEV_DISABLED) {
 		strcpy(io_buffer, message(MSG_ALRDIS, id, NULL, isjson));
 		return;
 	}
 
-	gpus[id].enabled = false;
+	gpus[id].deven = DEV_DISABLED;
 
 	strcpy(io_buffer, message(MSG_GPUDIS, id, NULL, isjson));
 }
diff --git a/bitforce.c b/bitforce.c
index 53d6336..377169f 100644
--- a/bitforce.c
+++ b/bitforce.c
@@ -119,7 +119,7 @@ static bool bitforce_detect_one(const char *devpath)
 	bitforce->api = &bitforce_api;
 	bitforce->device_id = i++;
 	bitforce->device_path = strdup(devpath);
-	bitforce->enabled = true;
+	bitforce->deven = DEV_ENABLED;
 	bitforce->threads = 1;
 
 	return true;
@@ -254,7 +254,7 @@ static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint6
 			bitforce->temp = temp;
 			if (temp > bitforce->cutofftemp) {
 				applog(LOG_WARNING, "Hit thermal cutoff limit on %s %d, disabling!", bitforce->api->name, bitforce->device_id);
-				bitforce->enabled = false;
+				bitforce->deven = DEV_RECOVER;
 			}
 		}
 	}
diff --git a/cgminer.c b/cgminer.c
index 008bd86..83be99c 100644
--- a/cgminer.c
+++ b/cgminer.c
@@ -100,7 +100,7 @@ static const bool opt_time = true;
 
 #ifdef HAVE_OPENCL
 int opt_dynamic_interval = 7;
-static bool opt_restart = true;
+bool opt_restart = true;
 static bool opt_nogpu;
 #endif
 
@@ -1223,10 +1223,12 @@ static void curses_print_devstatus(int thr_id)
 			wprintw(statuswin, "DEAD ");
 		else if (cgpu->status == LIFE_SICK)
 			wprintw(statuswin, "SICK ");
-	else if (!cgpu->enabled)
-			wprintw(statuswin, "OFF  ");
-		else
-			wprintw(statuswin, "%5.1f", cgpu->rolling);
+	else if (cgpu->deven == DEV_DISABLED)
+		wprintw(statuswin, "OFF  ");
+	else if (cgpu->deven == DEV_RECOVER)
+		wprintw(statuswin, "REST  ");
+	else
+		wprintw(statuswin, "%5.1f", cgpu->rolling);
 		adj_width(cgpu->accepted, &awidth);
 		adj_width(cgpu->rejected, &rwidth);
 		adj_width(cgpu->hw_errors, &hwwidth);
@@ -2380,11 +2382,11 @@ void write_config(FILE *fcfg)
 	if (opt_socks_proxy && *opt_socks_proxy)
 		fprintf(fcfg, ",\n\"socks-proxy\" : \"%s\"", opt_socks_proxy);
 	for(i = 0; i < nDevs; i++)
-		if (!gpus[i].enabled)
+		if (gpus[i].deven == DEV_DISABLED)
 			break;
 	if (i < nDevs)
 		for (i = 0; i < nDevs; i++)
-			if (gpus[i].enabled)
+			if (gpus[i].deven != DEV_DISABLED)
 				fprintf(fcfg, ",\n\"device\" : \"%d\"", i);
 	if (opt_api_allow != NULL)
 		fprintf(fcfg, ",\n\"api-allow\" : \"%s\"", opt_api_allow);
@@ -3401,7 +3403,7 @@ void *miner_thread(void *userdata)
 				tv_lastupdate = tv_end;
 			}
 
-			if (unlikely(mythr->pause || !cgpu->enabled)) {
+			if (unlikely(mythr->pause || cgpu->deven == DEV_DISABLED)) {
 				applog(LOG_WARNING, "Thread %d being disabled", thr_id);
 				mythr->rolling = mythr->cgpu->rolling = 0;
 				applog(LOG_DEBUG, "Popping wakeup ping in miner thread");
@@ -3728,7 +3730,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 				thr = &thr_info[i];
 
 				/* Don't touch disabled devices */
-				if (!thr->cgpu->enabled)
+				if (thr->cgpu->deven == DEV_DISABLED)
 					continue;
 				thr->pause = false;
 				tq_push(thr->q, &ping);
@@ -3739,7 +3741,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 		for (i = 0; i < total_devices; ++i) {
 			struct cgpu_info *cgpu = devices[i];
 			struct thr_info *thr = cgpu->thread;
-			bool *enable;
+			enum dev_enable *denable;
 			int gpu;
 
 			if (cgpu->api != &opencl_api)
@@ -3748,10 +3750,10 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			if (i >= nDevs)
 				break;
 			gpu = thr->cgpu->device_id;
-			enable = &cgpu->enabled;
+			denable = &cgpu->deven;
 #ifdef HAVE_ADL
 			if (adl_active && gpus[gpu].has_adl)
-				gpu_autotune(gpu, enable);
+				gpu_autotune(gpu, denable);
 			if (opt_debug && gpus[gpu].has_adl) {
 				int engineclock = 0, memclock = 0, activity = 0, fanspeed = 0, fanpercent = 0, powertune = 0;
 				float temp = 0, vddc = 0;
@@ -3762,7 +3764,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			}
 #endif
 			/* Thread is waiting on getwork or disabled */
-			if (thr->getwork || !*enable)
+			if (thr->getwork || *denable == DEV_DISABLED)
 				continue;
 
 			if (gpus[gpu].status != LIFE_WELL && now.tv_sec - thr->last.tv_sec < 60) {
@@ -3880,7 +3882,7 @@ static void print_summary(void)
 
 	applog(LOG_WARNING, "Summary of per device statistics:\n");
 	for (i = 0; i < total_devices; ++i) {
-		if (devices[i]->enabled)
+		if (devices[i]->deven == DEV_ENABLED)
 			log_print_status(devices[i]);
 	}
 
@@ -4130,7 +4132,7 @@ static int cgminer_id_count = 0;
 
 void enable_device(struct cgpu_info *cgpu)
 {
-	cgpu->enabled = true;
+	cgpu->deven = DEV_ENABLED;
 	devices[cgpu->cgminer_id = cgminer_id_count++] = cgpu;
 	mining_threads += cgpu->threads;
 #ifdef HAVE_OPENCL
@@ -4306,7 +4308,7 @@ int main (int argc, char *argv[])
 				} else {
 					enable_device(devices[i]);
 				}
-				devices[i]->enabled = false;
+				devices[i]->deven = DEV_DISABLED;
 			}
 		}
 		total_devices = cgminer_id_count;
@@ -4488,7 +4490,7 @@ int main (int argc, char *argv[])
 
 			/* Enable threads for devices set not to mine but disable
 			 * their queue in case we wish to enable them later */
-			if (cgpu->enabled) {
+			if (cgpu->deven != DEV_DISABLED) {
 				applog(LOG_DEBUG, "Pushing ping to thread %d", thr->id);
 
 				tq_push(thr->q, &ping);
diff --git a/device-cpu.c b/device-cpu.c
index 9faf236..a2fda5f 100644
--- a/device-cpu.c
+++ b/device-cpu.c
@@ -742,7 +742,7 @@ static void cpu_detect()
 
 		cgpu = devices[total_devices + i] = &cpus[i];
 		cgpu->api = &cpu_api;
-		cgpu->enabled = true;
+		cgpu->deven = DEV_ENABLED;
 		cgpu->device_id = i;
 		cgpu->threads = 1;
 	}
diff --git a/device-gpu.c b/device-gpu.c
index 0e1d076..44aa499 100644
--- a/device-gpu.c
+++ b/device-gpu.c
@@ -430,7 +430,7 @@ void pause_dynamic_threads(int gpu)
 		}
 
 		thr->pause = cgpu->dynamic;
-		if (!cgpu->dynamic && cgpu->enabled)
+		if (!cgpu->dynamic && cgpu->deven != DEV_DISABLED)
 			tq_push(thr->q, &ping);
 	}
 }
@@ -505,7 +505,7 @@ retry:
 			if (thr->cgpu != cgpu)
 				continue;
 			get_datestamp(checkin, &thr->last);
-			wlog("Thread %d: %.1f Mh/s %s ", i, thr->rolling, cgpu->enabled ? "Enabled" : "Disabled");
+			wlog("Thread %d: %.1f Mh/s %s ", i, thr->rolling, cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled");
 			switch (cgpu->status) {
 				default:
 				case LIFE_WELL:
@@ -546,11 +546,11 @@ retry:
 			wlogprint("Invalid selection\n");
 			goto retry;
 		}
-		if (gpus[selected].enabled) {
+		if (gpus[selected].deven != DEV_DISABLED) {
 			wlogprint("Device already enabled\n");
 			goto retry;
 		}
-		gpus[selected].enabled = true;
+		gpus[selected].deven = DEV_ENABLED;
 		for (i = 0; i < mining_threads; ++i) {
 			thr = &thr_info[i];
 			cgpu = thr->cgpu;
@@ -560,7 +560,7 @@ retry:
 				continue;
 			if (cgpu->status != LIFE_WELL) {
 				wlogprint("Must restart device before enabling it");
-				gpus[selected].enabled = false;
+				gpus[selected].deven = DEV_DISABLED;
 				goto retry;
 			}
 			applog(LOG_DEBUG, "Pushing ping to thread %d", thr->id);
@@ -575,11 +575,11 @@ retry:
 			wlogprint("Invalid selection\n");
 			goto retry;
 		}
-		if (!gpus[selected].enabled) {
+		if (gpus[selected].deven == DEV_DISABLED) {
 			wlogprint("Device already disabled\n");
 			goto retry;
 		}
-		gpus[selected].enabled = false;
+		gpus[selected].deven = DEV_DISABLED;
 		goto retry;
 	} else if (!strncasecmp(&input, "i", 1)) {
 		int intensity;
@@ -887,7 +887,7 @@ select_cgpu:
 	}
 
 	gpu = cgpu->device_id;
-	cgpu->enabled = false;
+	cgpu->deven = DEV_DISABLED;
 
 	for (thr_id = 0; thr_id < mining_threads; ++thr_id) {
 		thr = &thr_info[thr_id];
@@ -912,7 +912,7 @@ select_cgpu:
 			applog(LOG_WARNING, "Thread %d no longer exists", thr_id);
 	}
 
-	cgpu->enabled = true;
+	cgpu->deven = DEV_ENABLED;
 
 	for (thr_id = 0; thr_id < mining_threads; ++thr_id) {
 		int virtual_gpu;
@@ -1016,7 +1016,7 @@ static void opencl_detect()
 		struct cgpu_info *cgpu;
 
 		cgpu = devices[total_devices++] = &gpus[i];
-		cgpu->enabled = true;
+		cgpu->deven = DEV_ENABLED;
 		cgpu->api = &opencl_api;
 		cgpu->device_id = i;
 		cgpu->threads = opt_g_threads;
@@ -1105,7 +1105,7 @@ static bool opencl_thread_prepare(struct thr_info *thr)
 					free(buf);
 			}
 		}
-		cgpu->enabled = false;
+		cgpu->deven = DEV_DISABLED;
 		cgpu->status = LIFE_NOSTART;
 		return false;
 	}
diff --git a/miner.h b/miner.h
index 8e05696..acaf3b4 100644
--- a/miner.h
+++ b/miner.h
@@ -207,6 +207,12 @@ struct device_api {
 	void (*thread_shutdown)(struct thr_info*);
 };
 
+enum dev_enable {	/* Enable state of a device, stored in cgpu_info.deven */
+	DEV_ENABLED,	/* Device is enabled; the state assigned when a device is detected or (re-)enabled */
+	DEV_DISABLED,	/* Device is switched off, e.g. via the API or menu, or because its thread failed to prepare */
+	DEV_RECOVER,	/* Set on thermal cutoff; gpu_autotune() restores DEV_ENABLED once below target temp if opt_restart is set */
+};
+
 struct cgpu_info {
 	int cgminer_id;
 	struct device_api *api;
@@ -215,7 +221,7 @@ struct cgpu_info {
 	FILE *device_file;
 	int device_fd;
 
-	bool enabled;
+	enum dev_enable deven;
 	int accepted;
 	int rejected;
 	int hw_errors;
@@ -398,6 +404,7 @@ extern int opt_api_port;
 extern bool opt_api_listen;
 extern bool opt_api_network;
 extern bool opt_delaynet;
+extern bool opt_restart;
 
 extern pthread_rwlock_t netacc_lock;