Thank you for your observations and interest!
So I made some modifications:
- changed restart variables to unsigned
- changed to 0 as default (not -1, also dropped the define)
- moved counter reset from unit.c to service.c (after unit_notify call)
- because the "OnFailure" unit is triggered in unit.c, I still left the
checking before that.
- updated coding style
- added restart counter (de)serialization
The patch is for the same branch
(3661ac04b4f2840d3345605aa35963bbde3c469d) as before:
Index: systemd-29/src/dbus-service.c
===================================================================
--- systemd-29.orig/src/dbus-service.c 2011-07-04 16:19:37.000000000
+0300
+++ systemd-29/src/dbus-service.c 2011-07-04 17:07:09.971651549 +0300
@@ -42,6 +42,8 @@
" <property name=\"PIDFile\" type=\"s\" access=\"read\"/>\n" \
" <property name=\"NotifyAccess\" type=\"s\"
access=\"read\"/>\n" \
" <property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
+ " <property name=\"MaxRestartRetries\" type=\"u\"
access=\"read\"/>\n" \
+ " <property name=\"RestartRetry\" type=\"u\" access=\"read\"/>\n" \
" <property name=\"TimeoutUSec\" type=\"t\" access=\"read\"/>\n" \
BUS_EXEC_COMMAND_INTERFACE("ExecStartPre") \
BUS_EXEC_COMMAND_INTERFACE("ExecStart") \
@@ -103,6 +105,8 @@
{ "org.freedesktop.systemd1.Service",
"PIDFile", bus_property_append_string, "s",
u->service.pid_file },
{ "org.freedesktop.systemd1.Service",
"NotifyAccess", bus_service_append_notify_access, "s",
&u->service.notify_access },
{ "org.freedesktop.systemd1.Service",
"RestartUSec", bus_property_append_usec, "t",
&u->service.restart_usec },
+ { "org.freedesktop.systemd1.Service",
"MaxRestartRetries", bus_property_append_unsigned, "u",
&u->service.restart_retries },
+ { "org.freedesktop.systemd1.Service",
"RestartRetry", bus_property_append_unsigned, "u",
&u->service.restart_current_retry },
{ "org.freedesktop.systemd1.Service",
"TimeoutUSec", bus_property_append_usec, "t",
&u->service.timeout_usec },
BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service",
u->service.exec_command[SERVICE_EXEC_START_PRE], "ExecStartPre"),
BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service",
u->service.exec_command[SERVICE_EXEC_START], "ExecStart"),
Index: systemd-29/src/load-fragment.c
===================================================================
--- systemd-29.orig/src/load-fragment.c 2011-07-04 16:19:37.000000000
+0300
+++ systemd-29/src/load-fragment.c 2011-07-04 17:07:09.975651196 +0300
@@ -1951,6 +1951,7 @@
{ "TimeoutSec",
config_parse_usec, 0,
&u->service.timeout_usec, "Service" },
{ "Type",
config_parse_service_type, 0,
&u->service.type, "Service" },
{ "Restart",
config_parse_service_restart, 0,
&u->service.restart, "Service" },
+ { "MaxRestartRetries",
config_parse_unsigned, 0,
&u->service.restart_retries, "Service" },
{ "PermissionsStartOnly",
config_parse_bool, 0,
&u->service.permissions_start_only, "Service" },
{ "RootDirectoryStartOnly",
config_parse_bool, 0,
&u->service.root_directory_start_only, "Service" },
{ "RemainAfterExit",
config_parse_bool, 0,
&u->service.remain_after_exit, "Service" },
Index: systemd-29/src/service.c
===================================================================
--- systemd-29.orig/src/service.c 2011-07-04 16:19:37.000000000 +0300
+++ systemd-29/src/service.c 2011-07-04 17:14:32.823465057 +0300
@@ -1151,6 +1166,20 @@
if (s->meta.default_dependencies)
if ((r = service_add_default_dependencies(s)) < 0)
return r;
+
+ /* If the option "RestartRetries=" is set then the
service will be "revived" */
+ if (s->restart_retries > 0) {
+ /* If the option "Restart=" is not set, then
for the service to be "revived"
+ we must set "Restart=" to "on-failure" */
+ if (s->restart == SERVICE_RESTART_NO)
+ s->restart = SERVICE_RESTART_ON_FAILURE;
+
+ /* If the option "Restart=" is already set,
then for the service to be "revived"
+ and be restarted on failure we must set
"Restart=" to "always"
+ only if it's "on-success" */
+ else if (s->restart == SERVICE_RESTART_ON_SUCCESS)
+ s->restart = SERVICE_RESTART_ALWAYS;
+ }
}
return service_verify(s);
@@ -1488,6 +1517,16 @@
log_debug("%s changed %s -> %s", s->meta.id,
service_state_to_string(old_state), service_state_to_string(state));
unit_notify(UNIT(s), state_translation_table[old_state],
state_translation_table[state], !s->reload_failure);
+
+ /* If service finished, reset restart-retry counter, if
applicable */
+ if (old_state != state &&
+ (state == SERVICE_DEAD || state == SERVICE_FAILED) &&
+ s->restart_current_retry > s->restart_retries) {
+ log_debug("Service %s entered dead or failed,
restart retries at max, reset counter (%u -> 0)!",
+ s->meta.id, s->restart_current_retry);
+ s->restart_current_retry = 0;
+ }
+
s->reload_failure = false;
}
@@ -2291,6 +2330,7 @@
/* Make sure we don't enter a busy loop of some kind. */
if (!ratelimit_test(&s->ratelimit)) {
+ s->restart_current_retry = 0;
log_warning("%s start request repeated too quickly,
refusing to start.", u->meta.id);
return -ECANCELED;
}
@@ -2311,6 +2351,7 @@
/* This is a user request, so don't do restarts on this
* shutdown. */
+ s->restart_current_retry = 0;
s->forbid_restart = true;
/* Already on it */
@@ -2411,6 +2452,8 @@
}
}
+ unit_serialize_item_format(u, f, "restart-current-retry", "%u",
s->restart_current_retry);
+
return 0;
}
@@ -2510,6 +2553,13 @@
dual_timestamp_deserialize(value,
&s->main_exec_status.start_timestamp);
else if (streq(key, "main-exec-status-exit"))
dual_timestamp_deserialize(value,
&s->main_exec_status.exit_timestamp);
+ else if (streq(key, "restart-current-retry")) {
+ unsigned i;
+ if (safe_atou(value, &i) < 0)
+ log_debug("Failed to parse
restart-current-retry %s", value);
+ else
+ s->restart_current_retry = i;
+ }
else
log_debug("Unknown serialization key '%s'", key);
@@ -2604,6 +2654,11 @@
* gone. */
s->main_command = NULL;
+ if (success && s->restart_current_retry) {
+ log_debug("%s changed restart_crt_retry
from %u to 0", s->meta.id, s->restart_current_retry);
+ s->restart_current_retry = 0;
+ }
+
switch (s->state) {
case SERVICE_START_POST:
@@ -2832,7 +2887,15 @@
break;
case SERVICE_AUTO_RESTART:
- log_info("%s holdoff time over, scheduling restart.",
u->meta.id);
+ if (s->restart_retries > 0) {
+ s->restart_current_retry ++;
+ if (s->restart_current_retry >
s->restart_retries) {
+ log_warning("%s maximum number of
restarting retries reached. Stopping.", s->meta.id);
+ service_enter_dead(s, true, false);
+ break;
+ }
+ }
+ log_info("%s holdoff time over, scheduling restart
(retry %u of %u).", u->meta.id, s->restart_current_retry,
s->restart_retries);
service_enter_restart(s);
break;
Index: systemd-29/src/service.h
===================================================================
--- systemd-29.orig/src/service.h 2011-07-04 16:19:37.000000000 +0300
+++ systemd-29/src/service.h 2011-07-04 17:07:09.975651196 +0300
@@ -92,6 +92,9 @@
ServiceType type;
ServiceRestart restart;
+ unsigned restart_retries;
+ unsigned restart_current_retry;
+
/* If set we'll read the main daemon PID from this file */
char *pid_file;
Index: systemd-29/src/unit.c
===================================================================
--- systemd-29.orig/src/unit.c 2011-07-04 16:19:37.000000000 +0300
+++ systemd-29/src/unit.c 2011-07-04 17:07:09.979652060 +0300
@@ -1114,6 +1114,10 @@
if (set_size(u->meta.dependencies[UNIT_ON_FAILURE]) <= 0)
return;
+ if (u->meta.type == UNIT_SERVICE &&
+ u->service.restart_current_retry <=
u->service.restart_retries)
+ return;
+
log_info("Triggering OnFailure= dependencies of %s.", u->meta.id);
SET_FOREACH(other, u->meta.dependencies[UNIT_ON_FAILURE], i) {
On 07/02/2011 03:21 AM, Lennart Poettering wrote:
On Fri, 01.07.11 15:35, Cristian Patrascu ([email protected])
wrote:
Heya!
(BTW, got your first post of this patch too, just didn't find the time
to review the patch, sorry. It was still in my mail queue however.)
After "x" restarts have happened, the "OnFailure" unit will start
(if specified) but only after all unsuccessfull restarts (after "x"
restarts reached).
Patch looks pretty good. A few comments though.
This patch is made for systemd-29, on master branch with SHA1 ID:
3661ac04b4f2840d3345605aa35963bbde3c469d
________systemd-restart-on-fail.patch : _______________________________
Index: systemd-29/src/dbus-service.c
===================================================================
--- systemd-29.orig/src/dbus-service.c 2011-06-22 10:47:14.000000000 +0300
+++ systemd-29/src/dbus-service.c 2011-06-22 13:48:51.292321742 +0300
@@ -42,6 +42,8 @@
"<property name=\"PIDFile\" type=\"s\" access=\"read\"/>\n" \
"<property name=\"NotifyAccess\" type=\"s\" access=\"read\"/>\n" \
"<property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
+ "<property name=\"MaxRestartRetries\" type=\"i\" access=\"read\"/>\n" \
+ "<property name=\"RestartRetry\" type=\"i\"
access=\"read\"/>\n" \
I Think it would make sense to make both of these unsigned, in order to
avoid confusion whether these actually ever can be negative.
If I read your patch correctly right now MaxRestartRetries=-1 means that
there is no limit on the number of retries. I'd use 0 for that instead
(0 is not needed to indicate disabling, since we can indicate that with
Restart=no already.)
+ s->restart_retries = DEFAULT_RESTART_RETRIES;
+ s->restart_crt_retry = 0;
We generally try not to abbrievate variables unnecessarily, so I'd suggest
to call this variable "restart_current_retry" or so.
#ifdef HAVE_SYSV_COMPAT
s->sysv_start_priority = -1;
s->sysv_start_priority_from_rcnd = -1;
@@ -1151,6 +1153,22 @@
if (s->meta.default_dependencies)
if ((r = service_add_default_dependencies(s))< 0)
return r;
+
+ /* If the option "RestartRetries=" is set then the service will be
"revived" */
+ if (s->restart_retries != DEFAULT_RESTART_RETRIES){
+ /* If the option "Restart=" is not set, then for the service to
be "revived"
+ we must set "Restart=" to "on-failure" */
+ if (s->restart == SERVICE_RESTART_NO){
+ s->restart = SERVICE_RESTART_ON_FAILURE;
+ }
Please follow coding style. We generally place no {} brackets around
single line blocks.
case SERVICE_AUTO_RESTART:
- log_info("%s holdoff time over, scheduling restart.",
u->meta.id);
+ if (0<= s->restart_retries){
Please follow coding style, compare variables with constants not
constants with variables. Also, please place spaces before the<= and
the {.
+ if (u->meta.type == UNIT_SERVICE&&
+ u->service.restart_crt_retry<= u->service.restart_retries)
+ return;
+
Huhm, I wonder if we could fine a better place for this, not sure where though.
log_info("Triggering OnFailure= dependencies of %s.", u->meta.id);
SET_FOREACH(other, u->meta.dependencies[UNIT_ON_FAILURE], i) {
@@ -1245,6 +1249,17 @@
log_notice("Unit %s entered failed state.",
u->meta.id);
unit_trigger_on_failure(u);
}
+
+ /* When a unit is of type service and has finished,
+ reset restart-retry counter, if applicable */
+ if (ns != os&&
+ u->meta.type == UNIT_SERVICE&&
+ UNIT_IS_INACTIVE_OR_FAILED(ns)&&
+ u->service.restart_crt_retry> u->service.restart_retries
){
+ log_debug("Unit %s entered inactive or failed, restart
retries at max, reset counter (%d -> 0)!",
+ u->meta.id, u->service.restart_crt_retry,
u->service.restart_retries);
+ u->service.restart_crt_retry = 0;
Hmm, I think it would be nicer to reset this counter in service.c only, not
in the generic code. I'd like to avoid that we access too many
service-private fields from generic unit code.
#include "execute.h"
#include "condition.h"
+/* default = infinite retries (= systemd behaviour not patched) */
+#define DEFAULT_RESTART_RETRIES -1
+
Hee, if I commit this, then it won't be patched anymore, so the comment
should not refer to it being unpatched ;-)
Patch looks quite OK in general.
Thanks for the work,
Lennart
_______________________________________________
systemd-devel mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/systemd-devel