Differentiate thread failure from GPU failure by first declaring a GPU sick and trying to restart its thread without re-initialising the card. If that fails, try once more after ten minutes and then declare the GPU dead. This should prevent an attempted re-initialisation of one GPU from taking out other GPUs.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
diff --git a/main.c b/main.c
index 13a2db8..edbe579 100644
--- a/main.c
+++ b/main.c
@@ -835,11 +835,16 @@ static void curses_print_status(int thr_id)
wmove(statuswin, gpucursor + gpu, 0);
- if (!cgpu->alive)
+ if (cgpu->status == LIFE_DEAD)
wprintw(statuswin, " GPU %d: [DEAD / %.1f Mh/s] [Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.2f/m]",
gpu, cgpu->total_mhashes / total_secs,
cgpu->getworks, cgpu->accepted, cgpu->rejected, cgpu->hw_errors,
cgpu->efficiency, cgpu->utility);
+ else if (cgpu->status == LIFE_SICK)
+ wprintw(statuswin, " GPU %d: [SICK / %.1f Mh/s] [Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.2f/m]",
+ gpu, cgpu->total_mhashes / total_secs,
+ cgpu->getworks, cgpu->accepted, cgpu->rejected, cgpu->hw_errors,
+ cgpu->efficiency, cgpu->utility);
else if (!gpu_devices[gpu])
wprintw(statuswin, " GPU %d: [DISABLED / %.1f Mh/s] [Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.2f/m]",
gpu, cgpu->total_mhashes / total_secs,
@@ -1949,9 +1954,22 @@ retry:
if (thr->cgpu != cgpu)
continue;
get_datestamp(checkin, &thr->last);
- wlog("Thread %d: %.1f Mh/s %s %s reported in %s\n", i,
- thr->rolling, gpu_devices[gpu] ? "Enabled" : "Disabled",
- cgpu->alive ? "Alive" : "Dead", checkin);
+ switch (cgpu->status) {
+ case LIFE_WELL:
+ wlog("Thread %d: %.1f Mh/s %s ALIVE\n", i,
+ thr->rolling, gpu_devices[gpu] ? "Enabled" : "Disabled");
+ break;
+ case LIFE_SICK:
+ wlog("Thread %d: %.1f Mh/s %s SICK reported in %s\n", i,
+ thr->rolling, gpu_devices[gpu] ? "Enabled" : "Disabled",
+ checkin);
+ break;
+ case LIFE_DEAD:
+ wlog("Thread %d: %.1f Mh/s %s DEAD reported in %s\n", i,
+ thr->rolling, gpu_devices[gpu] ? "Enabled" : "Disabled",
+ checkin);
+ break;
+ }
}
}
@@ -1965,10 +1983,6 @@ retry:
wlogprint("Invalid selection\n");
goto retry;
}
- if (!gpus[selected].alive) {
- wlogprint("Device dead, need to attempt to restart before enabling\n");
- goto retry;
- }
if (gpu_devices[selected]) {
wlogprint("Device already enabled\n");
goto retry;
@@ -2087,7 +2101,7 @@ static void *workio_thread(void *userdata)
static void thread_reportin(struct thr_info *thr)
{
gettimeofday(&thr->last, NULL);
- thr->cgpu->alive = true;
+ thr->cgpu->status = LIFE_WELL;
thr->getwork = false;
}
@@ -2993,7 +3007,7 @@ static void *gpuminer_thread(void *userdata)
if (unlikely(status != CL_SUCCESS))
{ applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); goto out; }
- mythr->cgpu->alive = true;
+ mythr->cgpu->status = LIFE_WELL;
if (opt_debug)
applog(LOG_DEBUG, "Popping ping in gpuminer thread");
@@ -3338,6 +3352,37 @@ static void *reinit_cpu(void *userdata)
}
#ifdef HAVE_OPENCL
+/*
+ * Thread entry point that restarts one GPU mining thread without
+ * re-initialising the card.
+ *
+ * userdata is the struct thr_info of the thread to restart. The old thread
+ * is cancelled and joined, its queue is replaced with a new one, a fresh
+ * gpuminer_thread is created, the device is marked enabled again and a ping
+ * is pushed onto the new queue.
+ *
+ * Always returns NULL. A failed thread creation is only logged; a failed
+ * queue allocation terminates the program via quit().
+ */
+static void *reinit_gputhread(void *userdata)
+{
+	struct thr_info *thr = (struct thr_info *)userdata;
+	int thr_id = thr->id;
+
+	/* Reset the rolling figures of both the thread and its device */
+	thr->rolling = thr->cgpu->rolling = 0;
+	tq_freeze(thr->q);
+	/* The old queue is only released when the old thread could be cancelled */
+	if (!pthread_cancel(*thr->pth)) {
+		pthread_join(*thr->pth, NULL);
+		free(thr->q);
+	}
+
+	thr->q = tq_new();
+	if (!thr->q)
+		quit(1, "Failed to tq_new in reinit_gputhread");
+
+	if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr))) {
+		applog(LOG_ERR, "thread %d create failed", thr_id);
+		return NULL;
+	}
+	/* Try to re-enable it */
+	gpu_devices[thr->cgpu->cpu_gpu] = true;
+	if (opt_debug)
+		applog(LOG_DEBUG, "Pushing ping to thread %d", thr_id);
+	tq_push(thr->q, &ping);
+
+	applog(LOG_WARNING, "Thread %d restarted", thr_id);
+
+	return NULL;
+}
+
static void *reinit_gpu(void *userdata)
{
struct cgpu_info *cgpu = (struct cgpu_info *)userdata;
@@ -3346,7 +3391,7 @@ static void *reinit_gpu(void *userdata)
char name[256];
int thr_id;
- gpus[gpu].alive = false;
+ gpus[gpu].status = LIFE_DEAD;
for (thr_id = 0; thr_id < gpu_threads; thr_id ++) {
if (dev_from_id(thr_id) != gpu)
@@ -3363,7 +3408,7 @@ static void *reinit_gpu(void *userdata)
thr->q = tq_new();
if (!thr->q)
- quit(1, "Failed to tq_new in reinit_gputhread");
+ quit(1, "Failed to tq_new in reinit_gpu");
applog(LOG_INFO, "Reinit GPU thread %d", thr_id);
clStates[thr_id] = initCl(gpu, name, sizeof(name));
@@ -3390,11 +3435,25 @@ static void *reinit_gpu(void *userdata)
return NULL;
}
#else
+/*
+ * Variant of reinit_gputhread() compiled when HAVE_OPENCL is not defined:
+ * it performs no work and hands back NULL.
+ */
+static void *reinit_gputhread(void *userdata)
+{
+	(void)userdata;	/* deliberately unused in this build */
+
+	return NULL;
+}
+
static void *reinit_gpu(void *userdata)
{
+ return NULL;
}
#endif
+/*
+ * Spawn a helper thread running reinit_gputhread() to restart the mining
+ * thread described by thr. Failure to create the helper is only logged.
+ */
+static void reinit_thread(struct thr_info *thr)
+{
+	pthread_t resus_thread;
+
+	if (unlikely(pthread_create(&resus_thread, NULL, reinit_gputhread, (void *)thr))) {
+		applog(LOG_ERR, "Failed to create reinit thread");
+		return;
+	}
+	/* The handle is local and never joined, so detach the helper to let its
+	 * resources be reclaimed once it finishes */
+	pthread_detach(resus_thread);
+}
+
static void reinit_device(struct cgpu_info *cgpu)
{
pthread_t resus_thread;
@@ -3493,13 +3552,23 @@ static void *watchdog_thread(void *userdata)
struct thr_info *thr = &thr_info[i];
/* Thread is waiting on getwork or disabled */
- if (thr->getwork || !gpu_devices[i] || !gpus[i].alive)
+ if (thr->getwork || !gpu_devices[i])
continue;
- if (now.tv_sec - thr->last.tv_sec > 60) {
+ if (gpus[i].status != LIFE_WELL && now.tv_sec - thr->last.tv_sec < 60) {
+ applog(LOG_ERR, "Thread %d recovered, GPU %d declared WELL!", i, gpus[i]);
+ gpus[i].status = LIFE_WELL;
+ } else if (now.tv_sec - thr->last.tv_sec > 60 && gpus[i].status == LIFE_WELL) {
thr->rolling = thr->cgpu->rolling = 0;
- gpus[i].alive = false;
- applog(LOG_ERR, "Thread %d idle for more than 60 seconds, GPU %d declared DEAD!", i, gpus[i]);
+ gpus[i].status = LIFE_SICK;
+ applog(LOG_ERR, "Thread %d idle for more than 60 seconds, GPU %d declared SICK!", i, gpus[i]);
+ applog(LOG_ERR, "Attempting to restart thread");
+ reinit_thread(thr);
+ } else if (now.tv_sec - thr->last.tv_sec > 600 && gpus[i].status == LIFE_SICK) {
+ gpus[i].status = LIFE_DEAD;
+ applog(LOG_ERR, "Thread %d idle for more than 10 minutes, GPU %d declared DEAD!", i, gpus[i]);
+ applog(LOG_ERR, "Attempting to restart thread one last time");
+ reinit_thread(thr);
}
}
}
diff --git a/miner.h b/miner.h
index 9a2f570..b5c9525 100644
--- a/miner.h
+++ b/miner.h
@@ -134,6 +134,12 @@ enum {
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
+/* Health state of a device, stored in the status field of struct cgpu_info */
+enum alive {
+	LIFE_WELL = 0,	/* set when one of the device's threads reports in */
+	LIFE_SICK = 1,	/* set by the watchdog after a thread is idle for over 60 seconds */
+	LIFE_DEAD = 2	/* set by the watchdog after 10 minutes idle while sick, and by reinit_gpu */
+};
+
struct cgpu_info {
int is_gpu;
int cpu_gpu;
@@ -145,7 +151,7 @@ struct cgpu_info {
unsigned int getworks;
double efficiency;
double utility;
- bool alive;
+ enum alive status;
};
struct thr_info {