author | Olivier Brunel
<jjk@jjacky.com> 2018-02-22 19:50:38 UTC |
committer | Olivier Brunel
<jjk@jjacky.com> 2018-02-23 18:59:03 UTC |
parent | f6cd60067a876ffd8ba123ced98b5224827a7d7c |
doc/aa-start.pod | +20 | -3 |
doc/anopa.pod | +19 | -0 |
src/anopa/start-stop.c | +41 | -0 |
src/include/anopa/service.h | +1 | -0 |
src/libanopa/service.c | +28 | -0 |
diff --git a/doc/aa-start.pod b/doc/aa-start.pod index 257314c..8ff8a3b 100644 --- a/doc/aa-start.pod +++ b/doc/aa-start.pod @@ -88,6 +88,24 @@ there must be one name per line. Refer to B<anopa>(1) for descriptions of servicedirs and service dependencies. +=head1 RETRIES + +It is possible to define a number of retries via file I<retries>, which must +contain the number of retries for the service. + +- For oneshot, the service will be re-started up to I<retries> times and +then be processed as failed; + +- For longrun, if it supports readiness, up to I<retries> event 'd' will be +allowed while waiting for event 'U', before the service is processed as failed. +This means if the service goes down before becoming ready, B<aa-start>(1) will +keep waiting as it is restarted. + +Note that when the service is processed as failed, nothing is changed regarding +its supervised state, i.e. its B<s6-supervise> will continue to restart it +unless/until told otherwise; the failed state is only meaningful when it comes to +the current B<aa-start>(1) transaction and related dependencies/ordering. + =head1 TIMEOUTS When starting a service, a timestamp is collected. If the service fails to be @@ -160,9 +178,8 @@ If there was no I<gets-ready>/valid I<notification-fd> file, it waits for event 'u' to be triggered, then (disconnects from the fifodir and) announce the service as started. If there was one, it will then wait for event 'U' to be triggered instead, announcing the service as ready instead. (A message will be -shown on event 'u' as information only. Note that should event 'd' occur, a -message will also be shown, but B<aa-start>(1) will still keep waiting for event -'U'.) +shown on event 'u' as information only. Note that should event 'd' occur, the +behavior depends on file I<retries> as described in L</RETRIES> above.) 
=head1 STARTING A ONE-SHOT SERVICE diff --git a/doc/anopa.pod b/doc/anopa.pod index 12a7d2d..05d1510 100644 --- a/doc/anopa.pod +++ b/doc/anopa.pod @@ -133,6 +133,19 @@ This can be used for services that support readiness outside of the I<notification-fd> file interface (e.g. via B<aa-setready>(1) triggered on a log event). +=item An optional regular file named I<retries> + +This is B<anopa>-specific, and only used for services that support readiness +(else it's simply ignored). +The file must only contain an unsigned integer, which is the number of times the +service can go down (and, therefore, be restarted) whilst B<aa-start> is waiting +for it to become ready. + +This is used by B<aa-start>(1) to allow up to I<retries> event 'd' while waiting +for 'U'. After that, the service is considered failed (though nothing is changed +regarding its supervised state, i.e. its B<s6-supervise> will continue to +restart it unless/until told otherwise.) + =item An optional regular file named I<timeout> If such a file exists, it should contain the number of seconds before the @@ -187,6 +200,12 @@ B<aa-stop> when stopping the service. It if exits with 0 the service will be considered stopped, else stopped-failed. If no such file exists, the service will be considered stopped instantly. +=item An optional regular file named I<retries> + +The file must only contain an unsigned integer, which is the number of times the +service can fail, and then be restarted, before it is actually considered +failed. 
+ =item An optional, empty, regular file named I<essential> If present and the service fails to be started, when it exits B<aa-start> will diff --git a/src/anopa/start-stop.c b/src/anopa/start-stop.c index 79f0fb2..9887ae8 100644 --- a/src/anopa/start-stop.c +++ b/src/anopa/start-stop.c @@ -769,6 +769,14 @@ handle_oneshot (int is_start) if (aa_service_status_write (svst, aa_service_name (aa_service (si))) < 0) aa_strerr_warnu2sys ("write service status file for ", aa_service_name (aa_service (si))); + if (aa_service (si)->retries > 0) + { + --aa_service (si)->retries; + aa_bs (AA_OUT, aa_service_name (aa_service (si))); + aa_bs_flush (AA_OUT, ": Failed; Retrying...\n"); + return 1; + } + if (WIFEXITED (wstat)) { byte_copy (buf, 9, "exitcode "); @@ -820,6 +828,39 @@ handle_longrun (aa_mode mode, uint16_t id, char event) si = list_get (&aa_main_list, i); if ((mode & AA_MODE_START) && aa_service (si)->gets_ready) { + if (event == 'd') + { + if (aa_service (si)->retries == 0) + { + aa_service *s = aa_service (si); + + aa_unsubscribe_for (id); + s->ft_id = 0; + + /* This is to be consistent, but in all likelihood in very + * little time s6 will update the service status, thus taking + * over ours, with up, then down again, and up, and so on... 
*/ + s->st.event = AA_EVT_STARTING_FAILED; + s->st.code = ERR_FAILED; + tain_copynow (&s->st.stamp); + aa_service_status_set_msg (&s->st, "Got down before being ready"); + if (aa_service_status_write (&s->st, aa_service_name (s)) < 0) + aa_strerr_warnu2sys ("write service status file for ", aa_service_name (s)); + + put_err_service (aa_service_name (s), ERR_FAILED, 0); + add_err (": "); + add_err ("Got down before being ready"); + end_err (); + genalloc_append (int, &ga_failed, &si); + + --nb_wait_longrun; + remove_from_list (&aa_main_list, si); + return 1; + } + else + --aa_service (si)->retries; + } + if (event == 'u' || event == 'd') { clear_draw (); diff --git a/src/include/anopa/service.h b/src/include/anopa/service.h index f24679e..7584ecb 100644 --- a/src/include/anopa/service.h +++ b/src/include/anopa/service.h @@ -86,6 +86,7 @@ typedef struct aa_ls ls; aa_service_status st; tain_t ts_exec; + uint16_t retries; /* longrun */ uint16_t ft_id; int gets_ready; diff --git a/src/libanopa/service.c b/src/libanopa/service.c index e92289d..235c835 100644 --- a/src/libanopa/service.c +++ b/src/libanopa/service.c @@ -416,6 +416,34 @@ aa_ensure_service_loaded (int si, aa_mode mode, int no_wants, aa_autoload_cb al_ aa_service (si)->secs_timeout = aa_secs_timeout; } + { + char buf[UINT_FMT + 1]; + ssize_t rr; + + sa.len -= strlen ("timeout") + 1; + stralloc_catb (&sa, "retries", strlen ("retries") + 1); + + rr = openreadnclose_nb (sa.s, buf, UINT_FMT); + if (rr < 0 && errno != ENOENT) + aa_strerr_warnu3sys ("read retries for ", aa_service_name (aa_service (si)), "; using default"); + + if (rr >= 0) + { + unsigned int i = rr; + + buf[byte_chr (buf, i, '\n')] = '\0'; + if (!uint0_scan (buf, &i)) + { + aa_strerr_warn3x ("invalid retries for ", aa_service_name (aa_service (si)), "; using default"); + aa_service (si)->retries = 0; + } + else + aa_service (si)->retries = (uint16_t) i; + } + else + aa_service (si)->retries = 0; + } + stralloc_free (&sa); aa_service (si)->ls = 
AA_LOAD_DONE; tain_now_g ();