Fix: change health poll update to entry/exit calls
[lttng-tools.git] / src / bin / lttng-sessiond / main.c
index 6e548f3c10bf8185c13ac8156f1b5c54fe0ff4ce..461394e44e36fa9613d36318d93ad396447e3965 100644 (file)
@@ -221,12 +221,6 @@ enum consumerd_state {
 static enum consumerd_state ust_consumerd_state;
 static enum consumerd_state kernel_consumerd_state;
 
-/* Used for the health monitoring of the session daemon. See health.h */
-struct health_state health_thread_cmd;
-struct health_state health_thread_app_manage;
-struct health_state health_thread_app_reg;
-struct health_state health_thread_kernel;
-
 /*
  * Socket timeout for receiving and sending in seconds.
  */
@@ -626,7 +620,7 @@ static int update_kernel_stream(struct consumer_data *consumer_data, int fd)
                                        struct lttng_ht_iter iter;
                                        struct consumer_socket *socket;
 
-
+                                       rcu_read_lock();
                                        cds_lfht_for_each_entry(ksess->consumer->socks->ht,
                                                        &iter.iter, socket, node.node) {
                                                /* Code flow error */
@@ -637,9 +631,11 @@ static int update_kernel_stream(struct consumer_data *consumer_data, int fd)
                                                                channel, ksess);
                                                pthread_mutex_unlock(socket->lock);
                                                if (ret < 0) {
+                                                       rcu_read_unlock();
                                                        goto error;
                                                }
                                        }
+                                       rcu_read_unlock();
                                }
                                goto error;
                        }
@@ -691,6 +687,8 @@ static void *thread_manage_kernel(void *data)
 
        DBG("[thread] Thread manage kernel started");
 
+       health_register(HEALTH_TYPE_KERNEL);
+
        /*
         * This first step of the while is to clean this structure which could free
         * non NULL pointers so zero it before the loop.
@@ -701,14 +699,14 @@ static void *thread_manage_kernel(void *data)
                goto error_testpoint;
        }
 
-       health_code_update(&health_thread_kernel);
+       health_code_update();
 
        if (testpoint(thread_manage_kernel_before_loop)) {
                goto error_testpoint;
        }
 
        while (1) {
-               health_code_update(&health_thread_kernel);
+               health_code_update();
 
                if (update_poll_flag == 1) {
                        /* Clean events object. We are about to populate it again. */
@@ -736,9 +734,9 @@ static void *thread_manage_kernel(void *data)
 
                /* Poll infinite value of time */
        restart:
-               health_poll_update(&health_thread_kernel);
+               health_poll_entry();
                ret = lttng_poll_wait(&events, -1);
-               health_poll_update(&health_thread_kernel);
+               health_poll_exit();
                if (ret < 0) {
                        /*
                         * Restart interrupted system call.
@@ -761,7 +759,7 @@ static void *thread_manage_kernel(void *data)
                        revents = LTTNG_POLL_GETEV(&events, i);
                        pollfd = LTTNG_POLL_GETFD(&events, i);
 
-                       health_code_update(&health_thread_kernel);
+                       health_code_update();
 
                        /* Thread quit pipe has been closed. Killing thread. */
                        ret = check_thread_quit_pipe(pollfd, revents);
@@ -772,7 +770,13 @@ static void *thread_manage_kernel(void *data)
 
                        /* Check for data on kernel pipe */
                        if (pollfd == kernel_poll_pipe[0] && (revents & LPOLLIN)) {
-                               ret = read(kernel_poll_pipe[0], &tmp, 1);
+                               do {
+                                       ret = read(kernel_poll_pipe[0], &tmp, 1);
+                               } while (ret < 0 && errno == EINTR);
+                               /*
+                                * Ret value is useless here, if this pipe gets any actions an
+                                * update is required anyway.
+                                */
                                update_poll_flag = 1;
                                continue;
                        } else {
@@ -803,12 +807,12 @@ error_testpoint:
        utils_close_pipe(kernel_poll_pipe);
        kernel_poll_pipe[0] = kernel_poll_pipe[1] = -1;
        if (err) {
-               health_error(&health_thread_kernel);
+               health_error();
                ERR("Health error occurred in %s", __func__);
                WARN("Kernel thread died unexpectedly. "
                                "Kernel tracing can continue but CPU hotplug is disabled.");
        }
-       health_exit(&health_thread_kernel);
+       health_unregister();
        DBG("Kernel thread dying");
        return NULL;
 }
@@ -849,24 +853,9 @@ static void *thread_manage_consumer(void *data)
 
        DBG("[thread] Manage consumer started");
 
-       /*
-        * Since the consumer thread can be spawned at any moment in time, we init
-        * the health to a poll status (1, which is a valid health over time).
-        * When the thread starts, we update here the health to a "code" path being
-        * an even value so this thread, when reaching a poll wait, does not
-        * trigger an error with an even value.
-        *
-        * Here is the use case we avoid.
-        *
-        * +1: the first poll update during initialization (main())
-        * +2 * x: multiple code update once in this thread.
-        * +1: poll wait in this thread (being a good health state).
-        * == even number which after the wait period shows as a bad health.
-        *
-        * In a nutshell, the following poll update to the health state brings back
-        * the state to an even value meaning a code path.
-        */
-       health_poll_update(&consumer_data->health);
+       health_register(HEALTH_TYPE_CONSUMER);
+
+       health_code_update();
 
        /*
         * Pass 2 as size here for the thread quit pipe and kconsumerd_err_sock.
@@ -887,18 +876,18 @@ static void *thread_manage_consumer(void *data)
                goto error;
        }
 
-       health_code_update(&consumer_data->health);
+       health_code_update();
 
        /* Inifinite blocking call, waiting for transmission */
 restart:
-       health_poll_update(&consumer_data->health);
+       health_poll_entry();
 
        if (testpoint(thread_manage_consumer)) {
                goto error;
        }
 
        ret = lttng_poll_wait(&events, -1);
-       health_poll_update(&consumer_data->health);
+       health_poll_exit();
        if (ret < 0) {
                /*
                 * Restart interrupted system call.
@@ -916,7 +905,7 @@ restart:
                revents = LTTNG_POLL_GETEV(&events, i);
                pollfd = LTTNG_POLL_GETFD(&events, i);
 
-               health_code_update(&consumer_data->health);
+               health_code_update();
 
                /* Thread quit pipe has been closed. Killing thread. */
                ret = check_thread_quit_pipe(pollfd, revents);
@@ -945,7 +934,7 @@ restart:
         */
        (void) utils_set_fd_cloexec(sock);
 
-       health_code_update(&consumer_data->health);
+       health_code_update();
 
        DBG2("Receiving code from consumer err_sock");
 
@@ -956,7 +945,7 @@ restart:
                goto error;
        }
 
-       health_code_update(&consumer_data->health);
+       health_code_update();
 
        if (code == LTTCOMM_CONSUMERD_COMMAND_SOCK_READY) {
                consumer_data->cmd_sock =
@@ -986,13 +975,13 @@ restart:
                goto error;
        }
 
-       health_code_update(&consumer_data->health);
+       health_code_update();
 
        /* Inifinite blocking call, waiting for transmission */
 restart_poll:
-       health_poll_update(&consumer_data->health);
+       health_poll_entry();
        ret = lttng_poll_wait(&events, -1);
-       health_poll_update(&consumer_data->health);
+       health_poll_exit();
        if (ret < 0) {
                /*
                 * Restart interrupted system call.
@@ -1010,7 +999,7 @@ restart_poll:
                revents = LTTNG_POLL_GETEV(&events, i);
                pollfd = LTTNG_POLL_GETFD(&events, i);
 
-               health_code_update(&consumer_data->health);
+               health_code_update();
 
                /* Thread quit pipe has been closed. Killing thread. */
                ret = check_thread_quit_pipe(pollfd, revents);
@@ -1028,7 +1017,7 @@ restart_poll:
                }
        }
 
-       health_code_update(&consumer_data->health);
+       health_code_update();
 
        /* Wait for any kconsumerd error */
        ret = lttcomm_recv_unix_sock(sock, &code,
@@ -1079,10 +1068,10 @@ error:
        lttng_poll_clean(&events);
 error_poll:
        if (err) {
-               health_error(&consumer_data->health);
+               health_error();
                ERR("Health error occurred in %s", __func__);
        }
-       health_exit(&consumer_data->health);
+       health_unregister();
        DBG("consumer thread cleanup completed");
 
        return NULL;
@@ -1103,11 +1092,13 @@ static void *thread_manage_apps(void *data)
        rcu_register_thread();
        rcu_thread_online();
 
+       health_register(HEALTH_TYPE_APP_MANAGE);
+
        if (testpoint(thread_manage_apps)) {
                goto error_testpoint;
        }
 
-       health_code_update(&health_thread_app_manage);
+       health_code_update();
 
        ret = create_thread_poll_set(&events, 2);
        if (ret < 0) {
@@ -1123,16 +1114,16 @@ static void *thread_manage_apps(void *data)
                goto error;
        }
 
-       health_code_update(&health_thread_app_manage);
+       health_code_update();
 
        while (1) {
                DBG("Apps thread polling on %d fds", LTTNG_POLL_GETNB(&events));
 
                /* Inifinite blocking call, waiting for transmission */
        restart:
-               health_poll_update(&health_thread_app_manage);
+               health_poll_entry();
                ret = lttng_poll_wait(&events, -1);
-               health_poll_update(&health_thread_app_manage);
+               health_poll_exit();
                if (ret < 0) {
                        /*
                         * Restart interrupted system call.
@@ -1150,7 +1141,7 @@ static void *thread_manage_apps(void *data)
                        revents = LTTNG_POLL_GETEV(&events, i);
                        pollfd = LTTNG_POLL_GETFD(&events, i);
 
-                       health_code_update(&health_thread_app_manage);
+                       health_code_update();
 
                        /* Thread quit pipe has been closed. Killing thread. */
                        ret = check_thread_quit_pipe(pollfd, revents);
@@ -1166,13 +1157,15 @@ static void *thread_manage_apps(void *data)
                                        goto error;
                                } else if (revents & LPOLLIN) {
                                        /* Empty pipe */
-                                       ret = read(apps_cmd_pipe[0], &ust_cmd, sizeof(ust_cmd));
+                                       do {
+                                               ret = read(apps_cmd_pipe[0], &ust_cmd, sizeof(ust_cmd));
+                                       } while (ret < 0 && errno == EINTR);
                                        if (ret < 0 || ret < sizeof(ust_cmd)) {
                                                PERROR("read apps cmd pipe");
                                                goto error;
                                        }
 
-                                       health_code_update(&health_thread_app_manage);
+                                       health_code_update();
 
                                        /* Register applicaton to the session daemon */
                                        ret = ust_app_register(&ust_cmd.reg_msg,
@@ -1183,7 +1176,7 @@ static void *thread_manage_apps(void *data)
                                                break;
                                        }
 
-                                       health_code_update(&health_thread_app_manage);
+                                       health_code_update();
 
                                        /*
                                         * Validate UST version compatibility.
@@ -1197,7 +1190,7 @@ static void *thread_manage_apps(void *data)
                                                update_ust_app(ust_cmd.sock);
                                        }
 
-                                       health_code_update(&health_thread_app_manage);
+                                       health_code_update();
 
                                        ret = ust_app_register_done(ust_cmd.sock);
                                        if (ret < 0) {
@@ -1228,7 +1221,7 @@ static void *thread_manage_apps(void *data)
                                                                ust_cmd.sock);
                                        }
 
-                                       health_code_update(&health_thread_app_manage);
+                                       health_code_update();
 
                                        break;
                                }
@@ -1250,7 +1243,7 @@ static void *thread_manage_apps(void *data)
                                }
                        }
 
-                       health_code_update(&health_thread_app_manage);
+                       health_code_update();
                }
        }
 
@@ -1269,10 +1262,10 @@ error_testpoint:
         */
 
        if (err) {
-               health_error(&health_thread_app_manage);
+               health_error();
                ERR("Health error occurred in %s", __func__);
        }
-       health_exit(&health_thread_app_manage);
+       health_unregister();
        DBG("Application communication apps thread cleanup complete");
        rcu_thread_offline();
        rcu_unregister_thread();
@@ -1318,9 +1311,11 @@ static void *thread_dispatch_ust_registration(void *data)
                         * at some point in time or wait to the end of the world :)
                         */
                        if (apps_cmd_pipe[1] >= 0) {
-                               ret = write(apps_cmd_pipe[1], ust_cmd,
-                                               sizeof(struct ust_command));
-                               if (ret < 0) {
+                               do {
+                                       ret = write(apps_cmd_pipe[1], ust_cmd,
+                                                       sizeof(struct ust_command));
+                               } while (ret < 0 && errno == EINTR);
+                               if (ret < 0 || ret != sizeof(struct ust_command)) {
                                        PERROR("write apps cmd pipe");
                                        if (errno == EBADF) {
                                                /*
@@ -1367,6 +1362,8 @@ static void *thread_registration_apps(void *data)
 
        DBG("[thread] Manage application registration started");
 
+       health_register(HEALTH_TYPE_APP_REG);
+
        if (testpoint(thread_registration_apps)) {
                goto error_testpoint;
        }
@@ -1404,9 +1401,9 @@ static void *thread_registration_apps(void *data)
 
                /* Inifinite blocking call, waiting for transmission */
        restart:
-               health_poll_update(&health_thread_app_reg);
+               health_poll_entry();
                ret = lttng_poll_wait(&events, -1);
-               health_poll_update(&health_thread_app_reg);
+               health_poll_exit();
                if (ret < 0) {
                        /*
                         * Restart interrupted system call.
@@ -1420,7 +1417,7 @@ static void *thread_registration_apps(void *data)
                nb_fd = ret;
 
                for (i = 0; i < nb_fd; i++) {
-                       health_code_update(&health_thread_app_reg);
+                       health_code_update();
 
                        /* Fetch once the poll data */
                        revents = LTTNG_POLL_GETEV(&events, i);
@@ -1472,7 +1469,7 @@ static void *thread_registration_apps(void *data)
                                                sock = -1;
                                                continue;
                                        }
-                                       health_code_update(&health_thread_app_reg);
+                                       health_code_update();
                                        ret = lttcomm_recv_unix_sock(sock, &ust_cmd->reg_msg,
                                                        sizeof(struct ust_register_msg));
                                        if (ret < 0 || ret < sizeof(struct ust_register_msg)) {
@@ -1490,7 +1487,7 @@ static void *thread_registration_apps(void *data)
                                                sock = -1;
                                                continue;
                                        }
-                                       health_code_update(&health_thread_app_reg);
+                                       health_code_update();
 
                                        ust_cmd->sock = sock;
                                        sock = -1;
@@ -1521,7 +1518,7 @@ static void *thread_registration_apps(void *data)
 exit:
 error:
        if (err) {
-               health_error(&health_thread_app_reg);
+               health_error();
                ERR("Health error occurred in %s", __func__);
        }
 
@@ -1549,7 +1546,7 @@ error_listen:
 error_create_poll:
 error_testpoint:
        DBG("UST Registration thread cleanup complete");
-       health_exit(&health_thread_app_reg);
+       health_unregister();
 
        return NULL;
 }
@@ -1924,9 +1921,7 @@ static int check_consumer_health(void)
 {
        int ret;
 
-       ret = health_check_state(&kconsumer_data.health) &&
-               health_check_state(&ustconsumer32_data.health) &&
-               health_check_state(&ustconsumer64_data.health);
+       ret = health_check_state(HEALTH_TYPE_CONSUMER);
 
        DBG3("Health consumer check %d", ret);
 
@@ -3044,26 +3039,26 @@ restart:
 
                switch (msg.component) {
                case LTTNG_HEALTH_CMD:
-                       reply.ret_code = health_check_state(&health_thread_cmd);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_CMD);
                        break;
                case LTTNG_HEALTH_APP_MANAGE:
-                       reply.ret_code = health_check_state(&health_thread_app_manage);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE);
                        break;
                case LTTNG_HEALTH_APP_REG:
-                       reply.ret_code = health_check_state(&health_thread_app_reg);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG);
                        break;
                case LTTNG_HEALTH_KERNEL:
-                       reply.ret_code = health_check_state(&health_thread_kernel);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_KERNEL);
                        break;
                case LTTNG_HEALTH_CONSUMER:
                        reply.ret_code = check_consumer_health();
                        break;
                case LTTNG_HEALTH_ALL:
                        reply.ret_code =
-                               health_check_state(&health_thread_app_manage) &&
-                               health_check_state(&health_thread_app_reg) &&
-                               health_check_state(&health_thread_cmd) &&
-                               health_check_state(&health_thread_kernel) &&
+                               health_check_state(HEALTH_TYPE_APP_MANAGE) &&
+                               health_check_state(HEALTH_TYPE_APP_REG) &&
+                               health_check_state(HEALTH_TYPE_CMD) &&
+                               health_check_state(HEALTH_TYPE_KERNEL) &&
                                check_consumer_health();
                        break;
                default:
@@ -3138,11 +3133,13 @@ static void *thread_manage_clients(void *data)
 
        rcu_register_thread();
 
+       health_register(HEALTH_TYPE_CMD);
+
        if (testpoint(thread_manage_clients)) {
                goto error_testpoint;
        }
 
-       health_code_update(&health_thread_cmd);
+       health_code_update();
 
        ret = lttcomm_listen_unix_sock(client_sock);
        if (ret < 0) {
@@ -3175,16 +3172,16 @@ static void *thread_manage_clients(void *data)
                goto error;
        }
 
-       health_code_update(&health_thread_cmd);
+       health_code_update();
 
        while (1) {
                DBG("Accepting client command ...");
 
                /* Inifinite blocking call, waiting for transmission */
        restart:
-               health_poll_update(&health_thread_cmd);
+               health_poll_entry();
                ret = lttng_poll_wait(&events, -1);
-               health_poll_update(&health_thread_cmd);
+               health_poll_exit();
                if (ret < 0) {
                        /*
                         * Restart interrupted system call.
@@ -3202,7 +3199,7 @@ static void *thread_manage_clients(void *data)
                        revents = LTTNG_POLL_GETEV(&events, i);
                        pollfd = LTTNG_POLL_GETFD(&events, i);
 
-                       health_code_update(&health_thread_cmd);
+                       health_code_update();
 
                        /* Thread quit pipe has been closed. Killing thread. */
                        ret = check_thread_quit_pipe(pollfd, revents);
@@ -3222,7 +3219,7 @@ static void *thread_manage_clients(void *data)
 
                DBG("Wait for client response");
 
-               health_code_update(&health_thread_cmd);
+               health_code_update();
 
                sock = lttcomm_accept_unix_sock(client_sock);
                if (sock < 0) {
@@ -3258,7 +3255,7 @@ static void *thread_manage_clients(void *data)
                cmd_ctx->llm = NULL;
                cmd_ctx->session = NULL;
 
-               health_code_update(&health_thread_cmd);
+               health_code_update();
 
                /*
                 * Data is received from the lttng client. The struct
@@ -3279,7 +3276,7 @@ static void *thread_manage_clients(void *data)
                        continue;
                }
 
-               health_code_update(&health_thread_cmd);
+               health_code_update();
 
                // TODO: Validate cmd_ctx including sanity check for
                // security purpose.
@@ -3312,7 +3309,7 @@ static void *thread_manage_clients(void *data)
                        continue;
                }
 
-               health_code_update(&health_thread_cmd);
+               health_code_update();
 
                DBG("Sending response (size: %d, retcode: %s)",
                                cmd_ctx->lttng_msg_size,
@@ -3331,7 +3328,7 @@ static void *thread_manage_clients(void *data)
 
                clean_command_ctx(&cmd_ctx);
 
-               health_code_update(&health_thread_cmd);
+               health_code_update();
        }
 
 exit:
@@ -3358,11 +3355,11 @@ error_testpoint:
        }
 
        if (err) {
-               health_error(&health_thread_cmd);
+               health_error();
                ERR("Health error occurred in %s", __func__);
        }
 
-       health_exit(&health_thread_cmd);
+       health_unregister();
 
        DBG("Client thread dying");
 
@@ -4106,26 +4103,6 @@ int main(int argc, char **argv)
 
        cmd_init();
 
-       /* Init all health thread counters. */
-       health_init(&health_thread_cmd);
-       health_init(&health_thread_kernel);
-       health_init(&health_thread_app_manage);
-       health_init(&health_thread_app_reg);
-
-       /*
-        * Init health counters of the consumer thread. We do a quick hack here to
-        * the state of the consumer health is fine even if the thread is not
-        * started. Once the thread starts, the health state is updated with a poll
-        * value to set a health code path. This is simply to ease our life and has
-        * no cost what so ever.
-        */
-       health_init(&kconsumer_data.health);
-       health_poll_update(&kconsumer_data.health);
-       health_init(&ustconsumer32_data.health);
-       health_poll_update(&ustconsumer32_data.health);
-       health_init(&ustconsumer64_data.health);
-       health_poll_update(&ustconsumer64_data.health);
-
        /* Check for the application socket timeout env variable. */
        env_app_timeout = getenv(DEFAULT_APP_SOCKET_TIMEOUT_ENV);
        if (env_app_timeout) {
This page took 0.032504 seconds and 4 git commands to generate.