Fix: socket connect hang on heavy loads
[lttng-ust.git] / liblttng-ust / lttng-ust-comm.c
index a1ebcbe6fc83564a5fc4d9c0c0642403938da35f..1c7584ddd51e84ed3d1d55015467101791bd4657 100644 (file)
@@ -102,6 +102,7 @@ struct sock_info {
 
        char sock_path[PATH_MAX];
        int socket;
+       int notify_socket;
 
        char wait_shm_path[PATH_MAX];
        char *wait_shm_mmap;
@@ -116,10 +117,11 @@ struct sock_info global_apps = {
        .allowed = 1,
        .thread_active = 0,
 
-       .sock_path = DEFAULT_GLOBAL_APPS_UNIX_SOCK,
+       .sock_path = LTTNG_DEFAULT_RUNDIR "/" LTTNG_UST_SOCK_FILENAME,
        .socket = -1,
+       .notify_socket = -1,
 
-       .wait_shm_path = DEFAULT_GLOBAL_APPS_WAIT_SHM_PATH,
+       .wait_shm_path = "/" LTTNG_UST_WAIT_FILENAME,
 };
 
 /* TODO: allow global_apps_sock_path override */
@@ -132,6 +134,7 @@ struct sock_info local_apps = {
        .thread_active = 0,
 
        .socket = -1,
+       .notify_socket = -1,
 };
 
 static int wait_poll_fallback;
@@ -171,11 +174,18 @@ static const char *cmd_name_mapping[] = {
        [ LTTNG_UST_FILTER ] = "Create Filter",
 };
 
+/*
+ * Cached LTTNG_UST_REGISTER_TIMEOUT environment value. get_timeout()
+ * reads the environment once, then sets got_timeout_env so later calls
+ * reuse str_timeout (which stays NULL when the variable is unset).
+ */
+static const char *str_timeout;
+static int got_timeout_env;
+
 extern void lttng_ring_buffer_client_overwrite_init(void);
+extern void lttng_ring_buffer_client_overwrite_rt_init(void);
 extern void lttng_ring_buffer_client_discard_init(void);
+extern void lttng_ring_buffer_client_discard_rt_init(void);
 extern void lttng_ring_buffer_metadata_client_init(void);
 extern void lttng_ring_buffer_client_overwrite_exit(void);
+extern void lttng_ring_buffer_client_overwrite_rt_exit(void);
 extern void lttng_ring_buffer_client_discard_exit(void);
+extern void lttng_ring_buffer_client_discard_rt_exit(void);
 extern void lttng_ring_buffer_metadata_client_exit(void);
 
 /*
@@ -187,15 +197,24 @@ void lttng_fixup_nest_count_tls(void)
        asm volatile ("" : : "m" (URCU_TLS(lttng_ust_nest_count)));
 }
 
+/*
+ * Return the notify socket file descriptor stored in a struct sock_info.
+ *
+ * owner: opaque pointer that is interpreted as a struct sock_info.
+ * Returns the current notify_socket field; the sock_info instances in
+ * this file initialize it to -1 and reset it to -1 after closing it.
+ */
+int lttng_get_notify_socket(void *owner)
+{
+       struct sock_info *info = owner;
+
+       return info->notify_socket;
+}
+
+/*
+ * Emit a debug trace describing a received command: its name (or
+ * "Unknown" when cmd has no entry in cmd_name_mapping), its numeric
+ * value, the string returned by lttng_ust_obj_get_name() for the
+ * handle, and the handle value.
+ */
 static
 void print_cmd(int cmd, int handle)
 {
        const char *cmd_name = "Unknown";
 
-       if (cmd_name_mapping[cmd]) {
+       /* Bounds-check cmd before using it as an index into the table. */
+       if (cmd >= 0 && cmd < LTTNG_ARRAY_SIZE(cmd_name_mapping)
+                       && cmd_name_mapping[cmd]) {
                cmd_name = cmd_name_mapping[cmd];
        }
-       DBG("Message Received \"%s\", Handle \"%s\" (%d)", cmd_name,
+       DBG("Message Received \"%s\" (%d), Handle \"%s\" (%d)",
+               cmd_name, cmd,
                lttng_ust_obj_get_name(handle), handle);
 }
 
@@ -220,41 +239,87 @@ int setup_local_apps(void)
                return -ENOENT;
        }
        local_apps.allowed = 1;
-       snprintf(local_apps.sock_path, PATH_MAX,
-                DEFAULT_HOME_APPS_UNIX_SOCK, home_dir);
-       snprintf(local_apps.wait_shm_path, PATH_MAX,
-                DEFAULT_HOME_APPS_WAIT_SHM_PATH, uid);
+       snprintf(local_apps.sock_path, PATH_MAX, "%s/%s/%s",
+               home_dir,
+               LTTNG_DEFAULT_HOME_RUNDIR,
+               LTTNG_UST_SOCK_FILENAME);
+       snprintf(local_apps.wait_shm_path, PATH_MAX, "/%s-%u",
+               LTTNG_UST_WAIT_FILENAME,
+               uid);
        return 0;
 }
 
+/*
+ * Get the registration timeout, in ms. It is used both for the
+ * constructor wait and for the notify socket timeouts.
+ *
+ * The value comes from the LTTNG_UST_REGISTER_TIMEOUT environment
+ * variable, which is read once and cached. It defaults to
+ * LTTNG_UST_DEFAULT_CONSTRUCTOR_TIMEOUT_MS when the variable is unset,
+ * contains no leading base-10 number, or is out of range for a long.
+ *
+ * -1: don't wait. 0: wait forever. >0: timeout, in ms.
+ */
 static
-int register_app_to_sessiond(int socket)
+long get_timeout(void)
 {
-       ssize_t ret;
-       struct {
-               uint32_t major;
-               uint32_t minor;
-               pid_t pid;
-               pid_t ppid;
-               uid_t uid;
-               gid_t gid;
-               uint32_t bits_per_long;
-               char name[16];  /* process name */
-       } reg_msg;
-
-       reg_msg.major = LTTNG_UST_COMM_VERSION_MAJOR;
-       reg_msg.minor = LTTNG_UST_COMM_VERSION_MINOR;
-       reg_msg.pid = getpid();
-       reg_msg.ppid = getppid();
-       reg_msg.uid = getuid();
-       reg_msg.gid = getgid();
-       reg_msg.bits_per_long = CAA_BITS_PER_LONG;
-       lttng_ust_getprocname(reg_msg.name);
-
-       ret = ustcomm_send_unix_sock(socket, &reg_msg, sizeof(reg_msg));
-       if (ret >= 0 && ret != sizeof(reg_msg))
-               return -EIO;
-       return ret;
+       long constructor_delay_ms = LTTNG_UST_DEFAULT_CONSTRUCTOR_TIMEOUT_MS;
+
+       if (!got_timeout_env) {
+               str_timeout = getenv("LTTNG_UST_REGISTER_TIMEOUT");
+               got_timeout_env = 1;
+       }
+       if (str_timeout) {
+               char *endptr;
+               long value;
+
+               errno = 0;
+               value = strtol(str_timeout, &endptr, 10);
+               /*
+                * strtol() returns 0 when no digits could be parsed,
+                * which would silently mean "wait forever". Keep the
+                * default unless a number was actually read and fits
+                * in a long.
+                */
+               if (endptr != str_timeout && errno != ERANGE)
+                       constructor_delay_ms = value;
+       }
+       return constructor_delay_ms;
+}
+
+/*
+ * Timeout, in ms, for the notify socket: currently the same value as
+ * the registration timeout returned by get_timeout().
+ */
+static
+long get_notify_sock_timeout(void)
+{
+       return get_timeout();
+}
+
+/*
+ * Compute the constructor timeout as an absolute CLOCK_REALTIME
+ * deadline: current time plus the delay returned by get_timeout().
+ *
+ * constructor_timeout: output, only meaningful when 1 is returned.
+ *
+ * Return values: -1: don't wait. 0: wait forever. 1: timeout wait.
+ */
+static
+int get_constructor_timeout(struct timespec *constructor_timeout)
+{
+       long constructor_delay_ms;
+       int ret;
+
+       constructor_delay_ms = get_timeout();
+
+       /*
+        * Treat every negative delay as "don't wait". Letting values
+        * below -1 reach the arithmetic below would add a negative
+        * (or, with unsigned operands, a huge wrapped) amount to the
+        * deadline.
+        */
+       if (constructor_delay_ms < 0)
+               return -1;
+       if (constructor_delay_ms == 0)
+               return 0;
+
+       /*
+        * If we are unable to find the current time, don't wait.
+        */
+       ret = clock_gettime(CLOCK_REALTIME, constructor_timeout);
+       if (ret) {
+               return -1;
+       }
+       /* Signed constants keep the arithmetic in long. */
+       constructor_timeout->tv_sec += constructor_delay_ms / 1000L;
+       constructor_timeout->tv_nsec +=
+               (constructor_delay_ms % 1000L) * 1000000L;
+       /* Carry nanosecond overflow into the seconds field. */
+       if (constructor_timeout->tv_nsec >= 1000000000L) {
+               constructor_timeout->tv_sec++;
+               constructor_timeout->tv_nsec -= 1000000000L;
+       }
+       return 1;
+}
+
+/*
+ * Send the registration message for this application on "socket".
+ *
+ * type identifies which kind of socket is being registered (the callers
+ * in this file pass USTCTL_SOCKET_CMD or USTCTL_SOCKET_NOTIFY). The
+ * message also carries CAA_BITS_PER_LONG and, for uint8_t, uint16_t,
+ * uint32_t, uint64_t and unsigned long, lttng_alignof() of the type
+ * multiplied by CHAR_BIT (presumably the alignment expressed in bits).
+ *
+ * Returns the result of ustcomm_send_reg_msg(); callers treat a
+ * negative value as failure.
+ */
+static
+int register_to_sessiond(int socket, enum ustctl_socket_type type)
+{
+       return ustcomm_send_reg_msg(socket,
+               type,
+               CAA_BITS_PER_LONG,
+               lttng_alignof(uint8_t) * CHAR_BIT,
+               lttng_alignof(uint16_t) * CHAR_BIT,
+               lttng_alignof(uint32_t) * CHAR_BIT,
+               lttng_alignof(uint64_t) * CHAR_BIT,
+               lttng_alignof(unsigned long) * CHAR_BIT);
 }
 
 static
@@ -334,7 +399,7 @@ int handle_message(struct sock_info *sock_info,
                if (lum->handle == LTTNG_UST_ROOT_HANDLE)
                        ret = -EPERM;
                else
-                       ret = lttng_ust_objd_unref(lum->handle);
+                       ret = lttng_ust_objd_unref(lum->handle, 1);
                break;
        case LTTNG_UST_FILTER:
        {
@@ -380,6 +445,7 @@ int handle_message(struct sock_info *sock_info,
                                        goto error;
                                }
                                ret = len;
+                               free(bytecode);
                                goto end;
                        } else {
                                DBG("incorrect filter data message size: %zd", len);
@@ -408,9 +474,11 @@ int handle_message(struct sock_info *sock_info,
        case LTTNG_UST_CHANNEL:
        {
                void *chan_data;
+               int wakeup_fd;
 
                len = ustcomm_recv_channel_from_sessiond(sock,
-                               &chan_data, lum->u.channel.len);
+                               &chan_data, lum->u.channel.len,
+                               &wakeup_fd);
                switch (len) {
                case 0: /* orderly shutdown */
                        ret = 0;
@@ -435,6 +503,7 @@ int handle_message(struct sock_info *sock_info,
                        }
                }
                args.channel.chan_data = chan_data;
+               args.channel.wakeup_fd = wakeup_fd;
                if (ops->cmd)
                        ret = ops->cmd(lum->handle, lum->cmd,
                                        (unsigned long) &lum->u,
@@ -558,28 +627,40 @@ void cleanup_sock_info(struct sock_info *sock_info, int exiting)
 {
        int ret;
 
-       if (sock_info->socket != -1) {
-               ret = ustcomm_close_unix_sock(sock_info->socket);
-               if (ret) {
-                       ERR("Error closing apps socket");
-               }
-               sock_info->socket = -1;
-       }
        if (sock_info->root_handle != -1) {
-               ret = lttng_ust_objd_unref(sock_info->root_handle);
+               ret = lttng_ust_objd_unref(sock_info->root_handle, 1);
                if (ret) {
                        ERR("Error unref root handle");
                }
                sock_info->root_handle = -1;
        }
        sock_info->constructor_sem_posted = 0;
+
        /*
-        * wait_shm_mmap is used by listener threads outside of the
-        * ust lock, so we cannot tear it down ourselves, because we
-        * cannot join on these threads. Leave this task to the OS
+        * wait_shm_mmap, socket and notify socket are used by listener
+        * threads outside of the ust lock, so we cannot tear them down
+        * ourselves, because we cannot join on these threads. Leave
+        * responsibility of cleaning up these resources to the OS
         * process exit.
         */
-       if (!exiting && sock_info->wait_shm_mmap) {
+       if (exiting)
+               return;
+
+       if (sock_info->socket != -1) {
+               ret = ustcomm_close_unix_sock(sock_info->socket);
+               if (ret) {
+                       ERR("Error closing ust cmd socket");
+               }
+               sock_info->socket = -1;
+       }
+       if (sock_info->notify_socket != -1) {
+               ret = ustcomm_close_unix_sock(sock_info->notify_socket);
+               if (ret) {
+                       ERR("Error closing ust notify socket");
+               }
+               sock_info->notify_socket = -1;
+       }
+       if (sock_info->wait_shm_mmap) {
                ret = munmap(sock_info->wait_shm_mmap, sysconf(_SC_PAGE_SIZE));
                if (ret) {
                        ERR("Error unmapping wait shm");
@@ -815,6 +896,7 @@ void *ust_listener_thread(void *arg)
 {
        struct sock_info *sock_info = arg;
        int sock, ret, prev_connect_failed = 0, has_waited = 0;
+       long timeout;
 
        /* Restart trying to connect to the session daemon */
 restart:
@@ -833,25 +915,43 @@ restart:
                has_waited = 1;
                prev_connect_failed = 0;
        }
-       ust_lock();
-
-       if (lttng_ust_comm_should_quit) {
-               goto quit;
-       }
 
        if (sock_info->socket != -1) {
                ret = ustcomm_close_unix_sock(sock_info->socket);
                if (ret) {
-                       ERR("Error closing %s apps socket", sock_info->name);
+                       ERR("Error closing %s ust cmd socket",
+                               sock_info->name);
                }
                sock_info->socket = -1;
        }
+       if (sock_info->notify_socket != -1) {
+               ret = ustcomm_close_unix_sock(sock_info->notify_socket);
+               if (ret) {
+                       ERR("Error closing %s ust notify socket",
+                               sock_info->name);
+               }
+               sock_info->notify_socket = -1;
+       }
 
-       /* Register */
+       /*
+        * Register. We need to perform both connect and sending
+        * registration message before doing the next connect otherwise
+        * we may reach unix socket connect queue max limits and block
+        * on the 2nd connect while the session daemon is awaiting the
+        * first connect registration message.
+        */
+       /* Connect cmd socket */
        ret = ustcomm_connect_unix_sock(sock_info->sock_path);
        if (ret < 0) {
                DBG("Info: sessiond not accepting connections to %s apps socket", sock_info->name);
                prev_connect_failed = 1;
+
+               ust_lock();
+
+               if (lttng_ust_comm_should_quit) {
+                       goto quit;
+               }
+
                /*
                 * If we cannot find the sessiond daemon, don't delay
                 * constructor execution.
@@ -861,8 +961,13 @@ restart:
                ust_unlock();
                goto restart;
        }
+       sock_info->socket = ret;
+
+       ust_lock();
 
-       sock_info->socket = sock = ret;
+       if (lttng_ust_comm_should_quit) {
+               goto quit;
+       }
 
        /*
         * Create only one root handle per listener thread for the whole
@@ -878,9 +983,79 @@ restart:
                sock_info->root_handle = ret;
        }
 
-       ret = register_app_to_sessiond(sock);
+       ret = register_to_sessiond(sock_info->socket, USTCTL_SOCKET_CMD);
+       if (ret < 0) {
+               ERR("Error registering to %s ust cmd socket",
+                       sock_info->name);
+               prev_connect_failed = 1;
+               /*
+                * If we cannot register to the sessiond daemon, don't
+                * delay constructor execution.
+                */
+               ret = handle_register_done(sock_info);
+               assert(!ret);
+               ust_unlock();
+               goto restart;
+       }
+
+       ust_unlock();
+
+       /* Connect notify socket */
+       ret = ustcomm_connect_unix_sock(sock_info->sock_path);
+       if (ret < 0) {
+               DBG("Info: sessiond not accepting connections to %s apps socket", sock_info->name);
+               prev_connect_failed = 1;
+
+               ust_lock();
+
+               if (lttng_ust_comm_should_quit) {
+                       goto quit;
+               }
+
+               /*
+                * If we cannot find the sessiond daemon, don't delay
+                * constructor execution.
+                */
+               ret = handle_register_done(sock_info);
+               assert(!ret);
+               ust_unlock();
+               goto restart;
+       }
+       sock_info->notify_socket = ret;
+
+       timeout = get_notify_sock_timeout();
+       if (timeout >= 0) {
+               /*
+                * Give at least 10ms to sessiond to reply to
+                * notifications.
+                */
+               if (timeout < 10)
+                       timeout = 10;
+               ret = ustcomm_setsockopt_rcv_timeout(sock_info->notify_socket,
+                               timeout);
+               if (ret < 0) {
+                       WARN("Error setting socket receive timeout");
+               }
+               ret = ustcomm_setsockopt_snd_timeout(sock_info->notify_socket,
+                               timeout);
+               if (ret < 0) {
+                       WARN("Error setting socket send timeout");
+               }
+       } else if (timeout < -1) {
+               WARN("Unsupported timeout value %ld", timeout);
+       }
+
+       ust_lock();
+
+       if (lttng_ust_comm_should_quit) {
+               goto quit;
+       }
+
+       ret = register_to_sessiond(sock_info->notify_socket,
+                       USTCTL_SOCKET_NOTIFY);
        if (ret < 0) {
-               ERR("Error registering to %s apps socket", sock_info->name);
+               ERR("Error registering to %s ust notify socket",
+                       sock_info->name);
                prev_connect_failed = 1;
                /*
                 * If we cannot register to the sessiond daemon, don't
@@ -891,6 +1066,8 @@ restart:
                ust_unlock();
                goto restart;
        }
+       sock = sock_info->socket;
+
        ust_unlock();
 
        for (;;) {
@@ -956,46 +1133,6 @@ quit:
        return NULL;
 }
 
-/*
- * Return values: -1: don't wait. 0: wait forever. 1: timeout wait.
- */
-static
-int get_timeout(struct timespec *constructor_timeout)
-{
-       long constructor_delay_ms = LTTNG_UST_DEFAULT_CONSTRUCTOR_TIMEOUT_MS;
-       char *str_delay;
-       int ret;
-
-       str_delay = getenv("LTTNG_UST_REGISTER_TIMEOUT");
-       if (str_delay) {
-               constructor_delay_ms = strtol(str_delay, NULL, 10);
-       }
-
-       switch (constructor_delay_ms) {
-       case -1:/* fall-through */
-       case 0:
-               return constructor_delay_ms;
-       default:
-               break;
-       }
-
-       /*
-        * If we are unable to find the current time, don't wait.
-        */
-       ret = clock_gettime(CLOCK_REALTIME, constructor_timeout);
-       if (ret) {
-               return -1;
-       }
-       constructor_timeout->tv_sec += constructor_delay_ms / 1000UL;
-       constructor_timeout->tv_nsec +=
-               (constructor_delay_ms % 1000UL) * 1000000UL;
-       if (constructor_timeout->tv_nsec >= 1000000000UL) {
-               constructor_timeout->tv_sec++;
-               constructor_timeout->tv_nsec -= 1000000000UL;
-       }
-       return 1;
-}
-
 /*
  * sessiond monitoring thread: monitor presence of global and per-user
  * sessiond by polling the application common named pipe.
@@ -1031,9 +1168,12 @@ void __attribute__((constructor)) lttng_ust_init(void)
        init_tracepoint();
        lttng_ring_buffer_metadata_client_init();
        lttng_ring_buffer_client_overwrite_init();
+       lttng_ring_buffer_client_overwrite_rt_init();
        lttng_ring_buffer_client_discard_init();
+       lttng_ring_buffer_client_discard_rt_init();
+       lttng_context_init();
 
-       timeout_mode = get_timeout(&constructor_timeout);
+       timeout_mode = get_constructor_timeout(&constructor_timeout);
 
        ret = sem_init(&constructor_wait, 0, 0);
        assert(!ret);
@@ -1134,7 +1274,10 @@ void lttng_ust_cleanup(int exiting)
         */
        lttng_ust_abi_exit();
        lttng_ust_events_exit();
+       lttng_context_exit();
+       lttng_ring_buffer_client_discard_rt_exit();
        lttng_ring_buffer_client_discard_exit();
+       lttng_ring_buffer_client_overwrite_rt_exit();
        lttng_ring_buffer_client_overwrite_exit();
        lttng_ring_buffer_metadata_client_exit();
        exit_tracepoint();
This page took 0.028009 seconds and 4 git commands to generate.