Partial restart of threads has proven unsuccessful, so reinstate device re-initialisation to restart GPUs. Do this with a dedicated reinit thread that is told via a queue which device to restart. If this thread fails to return from OpenCL code, it should not interrupt the workings of other devices.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
diff --git a/main.c b/main.c
index bc11af5..858623f 100644
--- a/main.c
+++ b/main.c
@@ -184,6 +184,8 @@ int longpoll_thr_id;
static int stage_thr_id;
static int watchdog_thr_id;
static int input_thr_id;
+static int gpur_thr_id;
+static int cpur_thr_id;
static int total_threads;
struct work_restart *work_restart = NULL;
@@ -3340,6 +3342,7 @@ static void restart_longpoll(void)
static void *reinit_cpu(void *userdata)
{
+ pthread_detach(pthread_self());
#if 0
struct cgpu_info *cgpu = (struct cgpu_info *)userdata;
int cpu = cgpu->cpu_gpu;
@@ -3371,35 +3374,52 @@ static void *reinit_cpu(void *userdata)
}
#ifdef HAVE_OPENCL
+/* Only one thread ever re-initialises GPUs. If a GPU init command hangs
+ * because the GPU is completely wedged, this thread never returns, but it
+ * is unable to harm the other GPUs. If it does return, we only had a soft
+ * failure and the reinit_gpu thread is ready to tackle another
+ * GPU */
static void *reinit_gpu(void *userdata)
{
- struct cgpu_info *cgpu = (struct cgpu_info *)userdata;
- int gpu = cgpu->cpu_gpu;
+ struct thr_info *mythr = userdata;
+ struct cgpu_info *cgpu;
struct thr_info *thr;
char name[256];
int thr_id;
- _clState *clState;
+ int gpu;
+
+ pthread_detach(pthread_self());
+
+select_cgpu:
+ cgpu = tq_pop(mythr->q, NULL);
+ if (!cgpu)
+ goto out;
+
+ if (clDevicesNum() != nDevs) {
+ applog(LOG_WARNING, "Hardware not reporting same number of active devices, will not attempt to restart GPU");
+ goto out;
+ }
- /* Send threads message to stop */
+ gpu = cgpu->cpu_gpu;
gpu_devices[gpu] = false;
- sleep(5);
for (thr_id = 0; thr_id < gpu_threads; thr_id ++) {
if (dev_from_id(thr_id) != gpu)
continue;
- clState = clStates[thr_id];
- /* Send it a command. If it responds we can restart */
- applog(LOG_WARNING, "Attempting to send GPU command");
- clFlush(clState->commandQueue);
- clFinish(clState->commandQueue);
-
thr = &thr_info[thr_id];
thr->rolling = thr->cgpu->rolling = 0;
if (!pthread_cancel(*thr->pth)) {
- applog(LOG_WARNING, "Thread still exists, killing it off");
+ applog(LOG_WARNING, "Thread %d still exists, killing it off", thr_id);
} else
- applog(LOG_WARNING, "Thread no longer exists");
+ applog(LOG_WARNING, "Thread %d no longer exists", thr_id);
+ }
+
+ for (thr_id = 0; thr_id < gpu_threads; thr_id ++) {
+ if (dev_from_id(thr_id) != gpu)
+ continue;
+
+ thr = &thr_info[thr_id];
/* Lose this ram cause we may get stuck here! */
//tq_freeze(thr->q);
@@ -3410,13 +3430,12 @@ static void *reinit_gpu(void *userdata)
/* Lose this ram cause we may dereference in the dying thread! */
//free(clState);
- applog(LOG_WARNING, "Command successful, attempting to reinit device");
applog(LOG_INFO, "Reinit GPU thread %d", thr_id);
- clState = initCl(gpu, name, sizeof(name));
- if (!clState) {
+ clStates[thr_id] = initCl(gpu, name, sizeof(name));
+ if (!clStates[thr_id]) {
applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id);
- return NULL;
+ goto out;
}
applog(LOG_INFO, "initCl() finished. Found %s", name);
@@ -3427,14 +3446,17 @@ static void *reinit_gpu(void *userdata)
applog(LOG_WARNING, "Thread %d restarted", thr_id);
}
- /* Try to re-enable it */
gpu_devices[gpu] = true;
for (thr_id = 0; thr_id < gpu_threads; thr_id ++) {
+ if (dev_from_id(thr_id) != gpu)
+ continue;
+
thr = &thr_info[thr_id];
- if (dev_from_id(thr_id) == gpu)
- tq_push(thr->q, &ping);
+ tq_push(thr->q, &ping);
}
+ goto select_cgpu;
+out:
return NULL;
}
#else
@@ -3445,16 +3467,10 @@ static void *reinit_gpu(void *userdata)
static void reinit_device(struct cgpu_info *cgpu)
{
- pthread_t resus_thread;
- void *reinit;
-
if (cgpu->is_gpu)
- reinit = reinit_gpu;
+ tq_push(thr_info[gpur_thr_id].q, cgpu);
else
- reinit = reinit_cpu;
-
- if (unlikely(pthread_create(&resus_thread, NULL, reinit, (void *)cgpu)))
- applog(LOG_ERR, "Failed to create reinit thread");
+ tq_push(thr_info[cpur_thr_id].q, cgpu);
}
/* Determine which are the first threads belonging to a device and if they're
@@ -3811,7 +3827,7 @@ static void fork_monitor()
int main (int argc, char *argv[])
{
- unsigned int i, j = 0, x, y, pools_active = 0;
+ unsigned int i, x, y, pools_active = 0;
struct sigaction handler;
struct thr_info *thr;
char name[256];
@@ -3969,7 +3985,7 @@ int main (int argc, char *argv[])
mining_threads = opt_n_threads + gpu_threads;
- total_threads = mining_threads + 5;
+ total_threads = mining_threads + 7;
work_restart = calloc(total_threads, sizeof(*work_restart));
if (!work_restart)
quit(1, "Failed to calloc work_restart");
@@ -4050,11 +4066,9 @@ int main (int argc, char *argv[])
quit(0, "No pools active! Exiting.");
#ifdef HAVE_OPENCL
- i = 0;
-
/* start GPU mining threads */
- for (j = 0; j < nDevs * opt_g_threads; j++) {
- int gpu = j % nDevs;
+ for (i = 0; i < nDevs * opt_g_threads; i++) {
+ int gpu = i % nDevs;
gpus[gpu].is_gpu = 1;
gpus[gpu].cpu_gpu = gpu;
@@ -4087,8 +4101,6 @@ int main (int argc, char *argv[])
if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr)))
quit(1, "thread %d create failed", i);
-
- i++;
}
applog(LOG_INFO, "%d gpu miner threads started", gpu_threads);
@@ -4134,6 +4146,24 @@ int main (int argc, char *argv[])
quit(1, "input thread create failed");
pthread_detach(*thr->pth);
+ /* Create reinit cpu thread */
+ cpur_thr_id = mining_threads + 5;
+ thr = &thr_info[cpur_thr_id];
+ thr->q = tq_new();
+ if (!thr->q)
+ quit(1, "tq_new failed for cpur_thr_id");
+ if (thr_info_create(thr, NULL, reinit_cpu, thr))
+ quit(1, "reinit_cpu thread create failed");
+
+ /* Create reinit gpu thread */
+ gpur_thr_id = mining_threads + 6;
+ thr = &thr_info[gpur_thr_id];
+ thr->q = tq_new();
+ if (!thr->q)
+ quit(1, "tq_new failed for gpur_thr_id");
+ if (thr_info_create(thr, NULL, reinit_gpu, thr))
+ quit(1, "reinit_gpu thread create failed");
+
/* main loop - simply wait for workio thread to exit */
pthread_join(*thr_info[work_thr_id].pth, NULL);
applog(LOG_INFO, "workio thread dead, exiting.");