Commit 27b05db4a5044005895019c6965a7df9691a234b

Con Kolivas 2011-10-09T12:59:45

Use the ADL activity report to tell us if a sick GPU is still busy, which suggests it is hard hung; in that case, do not attempt to restart it.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
diff --git a/main.c b/main.c
index cee360b..64e2eaf 100644
--- a/main.c
+++ b/main.c
@@ -5017,6 +5017,12 @@ static void *watchdog_thread(void *userdata)
 				gpus[gpu].status = LIFE_SICK;
 				applog(LOG_ERR, "Thread %d idle for more than 60 seconds, GPU %d declared SICK!", i, gpu);
 				gettimeofday(&thr->sick, NULL);
+#ifdef HAVE_ADL
+				if (adl_active && gpus[gpu].has_adl && gpu_activity(gpu) > 50) {
+					applog(LOG_ERR, "GPU still showing activity suggesting a hard hang.");
+					applog(LOG_ERR, "Will not attempt to auto-restart it.");
+				} else
+#endif
 				if (opt_restart) {
 					applog(LOG_ERR, "Attempting to restart GPU");
 					reinit_device(thr->cgpu);