From: John Crispin Date: Tue, 27 Aug 2013 09:41:11 +0000 (+0200) Subject: add respawn handling X-Git-Url: https://git.archive.openwrt.org/?p=project%2Fprocd.git;a=commitdiff_plain;h=f0b6ea93233ba5134311352595969797b00a98de add respawn handling Signed-off-by: John Crispin --- diff --git a/instance.c b/instance.c index d61bb33..e263b84 100644 --- a/instance.c +++ b/instance.c @@ -32,6 +32,7 @@ enum { INSTANCE_ATTR_NETDEV, INSTANCE_ATTR_FILE, INSTANCE_ATTR_TRIGGER, + INSTANCE_ATTR_RESPAWN, INSTANCE_ATTR_NICE, __INSTANCE_ATTR_MAX }; @@ -43,6 +44,7 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = { [INSTANCE_ATTR_NETDEV] = { "netdev", BLOBMSG_TYPE_ARRAY }, [INSTANCE_ATTR_FILE] = { "file", BLOBMSG_TYPE_ARRAY }, [INSTANCE_ATTR_TRIGGER] = { "triggers", BLOBMSG_TYPE_ARRAY }, + [INSTANCE_ATTR_RESPAWN] = { "respawn", BLOBMSG_TYPE_ARRAY }, [INSTANCE_ATTR_NICE] = { "nice", BLOBMSG_TYPE_INT32 }, }; @@ -102,6 +104,8 @@ instance_start(struct service_instance *in) return; in->restart = false; + in->halt = !in->respawn; + if (!in->valid) return; @@ -117,6 +121,7 @@ instance_start(struct service_instance *in) DEBUG(1, "Started instance %s::%s\n", in->srv->name, in->name); in->proc.pid = pid; + clock_gettime(CLOCK_MONOTONIC, &in->start); uloop_process_add(&in->proc); } @@ -126,29 +131,58 @@ instance_timeout(struct uloop_timeout *t) struct service_instance *in; in = container_of(t, struct service_instance, timeout); - kill(in->proc.pid, SIGKILL); - uloop_process_delete(&in->proc); - in->proc.cb(&in->proc, -1); + + if (!in->halt && (in->restart || in->respawn)) + instance_start(in); } static void instance_exit(struct uloop_process *p, int ret) { struct service_instance *in; + struct timespec tp; + long runtime; in = container_of(p, struct service_instance, proc); - DEBUG(1, "Instance %s::%s exit with error code %d\n", in->srv->name, in->name, ret); + + clock_gettime(CLOCK_MONOTONIC, &tp); + runtime = tp.tv_sec - in->start.tv_sec; + + DEBUG(1, "Instance %s::%s exit with error code %d after %ld seconds\n", in->srv->name, in->name, ret, runtime); uloop_timeout_cancel(&in->timeout); - if (in->restart) + if (in->halt) { + /* no action */ + } else if (in->restart) { instance_start(in); + } else if (in->respawn) { + if (runtime < RESPAWN_ERROR) + in->respawn_count++; + else + in->respawn_count = 0; + if (in->respawn_count > 5) + DEBUG(1, "Instance %s::%s s in a crash loop %d crashes, %ld seconds since last crash\n", + in->srv->name, in->name, in->respawn_count, runtime); + uloop_timeout_set(&in->timeout, 5000); + } } void -instance_stop(struct service_instance *in, bool restart) +instance_stop(struct service_instance *in) { if (!in->proc.pending) return; + in->halt = true; + in->restart = in->respawn = false; + kill(in->proc.pid, SIGTERM); +} +static void +instance_restart(struct service_instance *in) +{ + if (!in->proc.pending) + return; + in->halt = false; + in->restart = true; kill(in->proc.pid, SIGTERM); } @@ -348,9 +382,9 @@ instance_update(struct service_instance *in, struct service_instance *in_new) instance_config_move(in, in_new); instance_start(in); } else { - in->restart = true; - instance_stop(in, true); + instance_restart(in); instance_config_move(in, in_new); + /* restart happens in the child callback handler */ } return true; } @@ -375,6 +409,8 @@ instance_init(struct service_instance *in, struct service *s, struct blob_attr * in->config = config; in->timeout.cb = instance_timeout; in->proc.cb = instance_exit; + in->respawn = true; + in->respawn_count = 0; blobmsg_list_init(&in->netdev, struct instance_netdev, node, instance_netdev_cmp); blobmsg_list_init(&in->file, struct instance_file, node, instance_file_cmp); diff --git a/instance.h b/instance.h index ceae834..1c8c0a0 100644 --- a/instance.h +++ b/instance.h @@ -19,6 +19,8 @@ #include #include "utils.h" +#define RESPAWN_ERROR (5 * 60) + struct service_instance { struct vlist_node node; struct service *srv; @@ -26,7 +28,16 @@ struct service_instance { int8_t nice; bool valid; + + bool halt; bool restart; + bool respawn; + int respawn_count; + struct timespec start; + + int respawn_timeout; + int respawn_threshold; + int respawn_retry; struct blob_attr *config; struct uloop_process proc; @@ -41,7 +52,7 @@ struct service_instance { }; void instance_start(struct service_instance *in); -void instance_stop(struct service_instance *in, bool restart); +void instance_stop(struct service_instance *in); bool instance_update(struct service_instance *in, struct service_instance *in_new); void instance_init(struct service_instance *in, struct service *s, struct blob_attr *config); void instance_free(struct service_instance *in); diff --git a/service.c b/service.c index 561c76c..c80402c 100644 --- a/service.c +++ b/service.c @@ -55,7 +55,7 @@ service_instance_update(struct vlist_tree *tree, struct vlist_node *node_new, instance_free(in_n); } else if (in_o) { DEBUG(1, "Free instance %s::%s\n", in_o->srv->name, in_o->name); - instance_stop(in_o, false); + instance_stop(in_o); instance_free(in_o); } else if (in_n) { DEBUG(1, "Create instance %s::%s\n", in_n->srv->name, in_n->name);