2 * Copyright (C) 2017 Julien Desfossez <jdesfossez@efficios.com>
3 * Copyright (C) 2018 Jérémie Galarneau <jeremie.galarneau@efficios.com>
5 * SPDX-License-Identifier: GPL-2.0-only
10 #include <lttng/trigger/trigger.h>
11 #include <common/error.h>
12 #include <common/config/session-config.h>
13 #include <common/defaults.h>
14 #include <common/utils.h>
15 #include <common/futex.h>
16 #include <common/align.h>
17 #include <common/time.h>
18 #include <common/hashtable/utils.h>
24 #include <common/kernel-ctl/kernel-ctl.h>
25 #include <lttng/notification/channel-internal.h>
26 #include <lttng/rotate-internal.h>
27 #include <lttng/location-internal.h>
29 #include "rotation-thread.h"
30 #include "lttng-sessiond.h"
31 #include "health-sessiond.h"
36 #include "notification-thread-commands.h"
41 #include <urcu/list.h>
/*
 * Notification channel used by the rotation thread helpers of this file.
 * It starts out as NULL, is assigned by init_thread_state() and is destroyed
 * by fini_thread_state() when it is set.
 */
43 struct lttng_notification_channel
*rotate_notification_channel
= NULL
;
/*
 * State owned by the rotation thread. The member visible here is the poll
 * set (events) that init_thread_state() initializes and that
 * fini_thread_state() cleans.
 */
45 struct rotation_thread
{
46 struct lttng_poll_event events
;
/*
 * A unit of work queued for the rotation thread: the kind of job (type) and
 * the session it targets. Jobs are chained in the timer queue list through
 * the head member.
 */
49 struct rotation_thread_job
{
50 enum rotation_thread_job_type type
;
51 struct ltt_session
*session
;
52 /* List member in struct rotation_thread_timer_queue. */
53 struct cds_list_head head
;
57 * The timer thread enqueues jobs and wakes up the rotation thread.
58 * When the rotation thread wakes up, it empties the queue.
/*
 * Job queue shared with the rotation thread.
 *   - event_pipe: wake-up pipe written by rotation_thread_enqueue_job() and
 *     added to the poll set of the rotation thread by init_poll_set().
 *   - list: pending rotation_thread_job entries.
 * NOTE(review): the functions of this file also use a lock member
 * (queue->lock); its declaration is not visible in this excerpt.
 */
60 struct rotation_thread_timer_queue
{
61 struct lttng_pipe
*event_pipe
;
62 struct cds_list_head list
;
/*
 * Handle given to the rotation thread. It groups the job queue, the handle
 * used to send commands to the notification thread and the pipe used to ask
 * the rotation thread to quit. The quit pipe is opened by
 * rotation_thread_handle_create() and destroyed by
 * rotation_thread_handle_destroy().
 */
66 struct rotation_thread_handle
{
67 struct rotation_thread_timer_queue
*rotation_timer_queue
;
68 /* Access to the notification thread cmd_queue */
69 struct notification_thread_handle
*notification_thread_handle
;
70 /* Thread-specific quit pipe. */
71 struct lttng_pipe
*quit_pipe
;
/*
 * Return a constant, human-readable name for job_type. The result is used
 * in the log messages of rotation_thread_enqueue_job().
 * NOTE(review): only the two cases below are visible in this excerpt; how
 * other values are handled must be confirmed against the complete file.
 */
75 const char *get_job_type_str(enum rotation_thread_job_type job_type
)
78 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
79 return "CHECK_PENDING_ROTATION";
80 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
81 return "SCHEDULED_ROTATION";
/*
 * Allocate a zero-initialized timer queue, open its wake-up pipe with the
 * FD_CLOEXEC and O_NONBLOCK flags, then initialize the job list and the
 * queue lock. An allocation failure is reported through PERROR.
 * NOTE(review): no check of the lttng_pipe_open() result is visible in this
 * excerpt - confirm whether a failed pipe creation is handled.
 */
87 struct rotation_thread_timer_queue
*rotation_thread_timer_queue_create(void)
89 struct rotation_thread_timer_queue
*queue
= NULL
;
91 queue
= zmalloc(sizeof(*queue
));
93 PERROR("Failed to allocate timer rotate queue");
97 queue
->event_pipe
= lttng_pipe_open(FD_CLOEXEC
| O_NONBLOCK
);
98 CDS_INIT_LIST_HEAD(&queue
->list
);
99 pthread_mutex_init(&queue
->lock
, NULL
);
/*
 * Release the resources of a timer queue: the wake-up pipe is destroyed,
 * the job list is asserted to be empty while the queue lock is held, and
 * the lock itself is destroyed afterwards. Callers are thus expected to
 * have drained the queue beforehand.
 */
104 void rotation_thread_timer_queue_destroy(
105 struct rotation_thread_timer_queue
*queue
)
111 lttng_pipe_destroy(queue
->event_pipe
);
113 pthread_mutex_lock(&queue
->lock
);
114 assert(cds_list_empty(&queue
->list
));
115 pthread_mutex_unlock(&queue
->lock
);
116 pthread_mutex_destroy(&queue
->lock
);
121 * Destroy the thread data previously created by the init function.
/*
 * Destroy a rotation thread handle. The only teardown step visible in this
 * excerpt is the destruction of the quit pipe owned by the handle.
 */
123 void rotation_thread_handle_destroy(
124 struct rotation_thread_handle
*handle
)
126 lttng_pipe_destroy(handle
->quit_pipe
);
/*
 * Allocate a zero-initialized rotation thread handle, store the provided
 * job queue and notification thread handle in it (both pointers are only
 * borrowed, not copied), and open the quit pipe with the FD_CLOEXEC flag.
 * The handle is passed to rotation_thread_handle_destroy() on what appears
 * to be the failure path of the quit pipe creation - TODO confirm against
 * the complete file.
 */
130 struct rotation_thread_handle
*rotation_thread_handle_create(
131 struct rotation_thread_timer_queue
*rotation_timer_queue
,
132 struct notification_thread_handle
*notification_thread_handle
)
134 struct rotation_thread_handle
*handle
;
136 handle
= zmalloc(sizeof(*handle
));
141 handle
->rotation_timer_queue
= rotation_timer_queue
;
142 handle
->notification_thread_handle
= notification_thread_handle
;
143 handle
->quit_pipe
= lttng_pipe_open(FD_CLOEXEC
);
144 if (!handle
->quit_pipe
) {
151 rotation_thread_handle_destroy(handle
);
156 * Called with the rotation_thread_timer_queue lock held.
157 * Return true if the same timer job already exists in the queue, false if not.
/*
 * Walk the job list of queue looking for an entry that targets the same
 * session pointer and has the same job type. The comparison is done on the
 * session address, not on the session name.
 */
160 bool timer_job_exists(const struct rotation_thread_timer_queue
*queue
,
161 enum rotation_thread_job_type job_type
,
162 struct ltt_session
*session
)
165 struct rotation_thread_job
*job
;
167 cds_list_for_each_entry(job
, &queue
->list
, head
) {
168 if (job
->session
== session
&& job
->type
== job_type
) {
/*
 * Queue a job of type job_type for session and wake up the rotation thread.
 *
 * Steps visible in this excerpt, all under the queue lock:
 *   - an identical pending job (same session and type) is detected with
 *     timer_job_exists();
 *   - a zeroed job is allocated (failure is reported through PERROR);
 *   - a reference on the session is taken on behalf of the job;
 *   - the job is appended at the tail of the list;
 *   - one byte is written to the wake-up pipe. EAGAIN or EWOULDBLOCK on
 *     that write is only logged at debug level since the job is already in
 *     the list; other failures are reported through PERROR.
 */
177 void rotation_thread_enqueue_job(struct rotation_thread_timer_queue
*queue
,
178 enum rotation_thread_job_type job_type
,
179 struct ltt_session
*session
)
182 const char dummy
= '!';
183 struct rotation_thread_job
*job
= NULL
;
184 const char *job_type_str
= get_job_type_str(job_type
);
186 pthread_mutex_lock(&queue
->lock
);
187 if (timer_job_exists(queue
, job_type
, session
)) {
189 * This timer job is already pending, we don't need to add
195 job
= zmalloc(sizeof(struct rotation_thread_job
));
197 PERROR("Failed to allocate rotation thread job of type \"%s\" for session \"%s\"",
198 job_type_str
, session
->name
);
201 /* No reason for this to fail as the caller must hold a reference. */
202 (void) session_get(session
);
204 job
->session
= session
;
205 job
->type
= job_type
;
206 cds_list_add_tail(&job
->head
, &queue
->list
);
208 ret
= lttng_write(lttng_pipe_get_writefd(queue
->event_pipe
), &dummy
,
212 * We do not want to block in the timer handler, the job has
213 * been enqueued in the list, the wakeup pipe is probably full,
214 * the job will be processed when the rotation_thread catches
217 if (errno
== EAGAIN
|| errno
== EWOULDBLOCK
) {
219 * Not an error, but would be surprising and indicate
220 * that the rotation thread can't keep up with the
223 DBG("Wake-up pipe of rotation thread job queue is full");
226 PERROR("Failed to wake-up the rotation thread after pushing a job of type \"%s\" for session \"%s\"",
227 job_type_str
, session
->name
);
232 pthread_mutex_unlock(&queue
->lock
);
/*
 * Create the poll set of the rotation thread and register the read ends of
 * the quit pipe and of the job queue wake-up pipe. Each failed registration
 * is logged with ERR, and lttng_poll_clean() is called on the poll set on
 * what appears to be the error path (TODO confirm against the complete
 * file).
 * NOTE(review): the in-body comment announces a poll set of size 3 while
 * lttng_poll_create() is called with 5; the notification channel socket
 * listed in that comment is added later by init_thread_state(), not here.
 */
236 int init_poll_set(struct lttng_poll_event
*poll_set
,
237 struct rotation_thread_handle
*handle
)
242 * Create pollset with size 3:
243 * - rotation thread quit pipe,
244 * - rotation thread timer queue pipe,
245 * - notification channel sock,
247 ret
= lttng_poll_create(poll_set
, 5, LTTNG_CLOEXEC
);
252 ret
= lttng_poll_add(poll_set
,
253 lttng_pipe_get_readfd(handle
->quit_pipe
),
256 ERR("Failed to add quit pipe read fd to poll set");
260 ret
= lttng_poll_add(poll_set
,
261 lttng_pipe_get_readfd(handle
->rotation_timer_queue
->event_pipe
),
264 ERR("Failed to add rotate_pending fd to poll set");
270 lttng_poll_clean(poll_set
);
/*
 * Tear down the rotation thread state: clean the poll set and destroy the
 * file-level notification channel when it has been created.
 */
275 void fini_thread_state(struct rotation_thread
*state
)
277 lttng_poll_clean(&state
->events
);
278 if (rotate_notification_channel
) {
279 lttng_notification_channel_destroy(rotate_notification_channel
);
/*
 * Initialize the rotation thread state:
 *   - zero the state and initialize its poll set;
 *   - populate the poll set through init_poll_set();
 *   - create the file-level notification channel on the session daemon
 *     notification endpoint;
 *   - add the socket of that channel to the poll set.
 * Each failing step is logged with ERR.
 */
284 int init_thread_state(struct rotation_thread_handle
*handle
,
285 struct rotation_thread
*state
)
289 memset(state
, 0, sizeof(*state
));
290 lttng_poll_init(&state
->events
);
292 ret
= init_poll_set(&state
->events
, handle
);
294 ERR("Failed to initialize rotation thread poll set");
298 rotate_notification_channel
= lttng_notification_channel_create(
299 lttng_session_daemon_notification_endpoint
);
300 if (!rotate_notification_channel
) {
301 ERR("Could not create notification channel");
305 ret
= lttng_poll_add(&state
->events
, rotate_notification_channel
->socket
,
308 ERR("Failed to add notification fd to pollset");
/*
 * Ask the consumer daemons of a session whether the trace chunk being
 * archived still exists on their side.
 *
 * The session must have a chunk being archived (asserted). For the user
 * space session and then the kernel session, when present, every consumer
 * socket is queried with consumer_trace_chunk_exists() while the socket
 * lock is held. Any status other than UNKNOWN_CHUNK marks the chunk as
 * still existing on a peer.
 *
 * *_rotation_completed receives the negation of chunk_exists_on_peer. A
 * failed query is logged with ERR; the rotation state of the session is
 * reset to LTTNG_ROTATION_STATE_ERROR, presumably on that error path (TODO
 * confirm against the complete file).
 *
 * NOTE(review): the user space loop computes relayd_id before taking the
 * socket lock whereas the kernel loop computes it after taking the lock.
 */
317 void check_session_rotation_pending_on_consumers(struct ltt_session
*session
,
318 bool *_rotation_completed
)
321 struct consumer_socket
*socket
;
322 struct cds_lfht_iter iter
;
323 enum consumer_trace_chunk_exists_status exists_status
;
325 bool chunk_exists_on_peer
= false;
326 enum lttng_trace_chunk_status chunk_status
;
328 assert(session
->chunk_being_archived
);
331 * Check for a local pending rotation on all consumers (32-bit
332 * user space, 64-bit user space, and kernel).
335 if (!session
->ust_session
) {
338 cds_lfht_for_each_entry(session
->ust_session
->consumer
->socks
->ht
,
339 &iter
, socket
, node
.node
) {
340 relayd_id
= session
->ust_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
342 session
->ust_session
->consumer
->net_seq_index
;
344 pthread_mutex_lock(socket
->lock
);
345 ret
= consumer_trace_chunk_exists(socket
,
347 session
->id
, session
->chunk_being_archived
,
350 pthread_mutex_unlock(socket
->lock
);
351 ERR("Error occurred while checking rotation status on consumer daemon");
355 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
356 pthread_mutex_unlock(socket
->lock
);
357 chunk_exists_on_peer
= true;
360 pthread_mutex_unlock(socket
->lock
);
364 if (!session
->kernel_session
) {
367 cds_lfht_for_each_entry(session
->kernel_session
->consumer
->socks
->ht
,
368 &iter
, socket
, node
.node
) {
369 pthread_mutex_lock(socket
->lock
);
370 relayd_id
= session
->kernel_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
372 session
->kernel_session
->consumer
->net_seq_index
;
374 ret
= consumer_trace_chunk_exists(socket
,
376 session
->id
, session
->chunk_being_archived
,
379 pthread_mutex_unlock(socket
->lock
);
380 ERR("Error occurred while checking rotation status on consumer daemon");
384 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
385 pthread_mutex_unlock(socket
->lock
);
386 chunk_exists_on_peer
= true;
389 pthread_mutex_unlock(socket
->lock
);
395 if (!chunk_exists_on_peer
) {
396 uint64_t chunk_being_archived_id
;
398 chunk_status
= lttng_trace_chunk_get_id(
399 session
->chunk_being_archived
,
400 &chunk_being_archived_id
);
401 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
402 DBG("Rotation of trace archive %" PRIu64
" of session \"%s\" is complete on all consumers",
403 chunk_being_archived_id
,
406 *_rotation_completed
= !chunk_exists_on_peer
;
408 ret
= session_reset_rotation_state(session
,
409 LTTNG_ROTATION_STATE_ERROR
);
411 ERR("Failed to reset rotation state of session \"%s\"",
418 * Check if the last rotation was completed, called with session lock held.
419 * Should only return non-zero in the event of a fatal error. Doing so will
420 * shut down the thread.
/*
 * Check whether the rotation in progress for session has completed and
 * update the session accordingly.
 *
 * Steps visible in this excerpt:
 *   - the one-shot rotation-pending check timer of the session is stopped;
 *   - the consumers are queried through
 *     check_session_rotation_pending_on_consumers();
 *   - when the rotation is not completed, or the session is in the ERROR
 *     rotation state, control jumps to the check_ongoing_rotation label;
 *   - otherwise the name of the archived chunk is duplicated into
 *     session->last_archived_chunk_name (the previous value is freed), the
 *     rotation state is reset to COMPLETED and, unless the rotation is
 *     quiet, the notification thread is told about the completed rotation;
 *   - at check_ongoing_rotation, a session still in the ONGOING state gets
 *     its pending-check timer restarted with DEFAULT_ROTATE_PENDING_TIMER.
 *
 * NOTE(review): what happens after a failed strdup() (beyond the PERROR
 * message) is not visible in this excerpt.
 */
423 int check_session_rotation_pending(struct ltt_session
*session
,
424 struct notification_thread_handle
*notification_thread_handle
)
427 struct lttng_trace_archive_location
*location
;
428 enum lttng_trace_chunk_status chunk_status
;
429 bool rotation_completed
= false;
430 const char *archived_chunk_name
;
431 uint64_t chunk_being_archived_id
;
433 if (!session
->chunk_being_archived
) {
438 chunk_status
= lttng_trace_chunk_get_id(session
->chunk_being_archived
,
439 &chunk_being_archived_id
);
440 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
442 DBG("Checking for pending rotation on session \"%s\", trace archive %" PRIu64
,
443 session
->name
, chunk_being_archived_id
);
446 * The rotation-pending check timer of a session is launched in
447 * one-shot mode. If the rotation is incomplete, the rotation
448 * thread will re-enable the pending-check timer.
450 * The timer thread can't stop the timer itself since it is involved
451 * in the check for the timer's quiescence.
453 ret
= timer_session_rotation_pending_check_stop(session
);
455 goto check_ongoing_rotation
;
458 check_session_rotation_pending_on_consumers(session
,
459 &rotation_completed
);
460 if (!rotation_completed
||
461 session
->rotation_state
== LTTNG_ROTATION_STATE_ERROR
) {
462 goto check_ongoing_rotation
;
466 * Now we can clear the "ONGOING" state in the session. New
467 * rotations can start now.
469 chunk_status
= lttng_trace_chunk_get_name(session
->chunk_being_archived
,
470 &archived_chunk_name
, NULL
);
471 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
472 free(session
->last_archived_chunk_name
);
473 session
->last_archived_chunk_name
= strdup(archived_chunk_name
);
474 if (!session
->last_archived_chunk_name
) {
475 PERROR("Failed to duplicate archived chunk name");
477 session_reset_rotation_state(session
, LTTNG_ROTATION_STATE_COMPLETED
);
479 if (!session
->quiet_rotation
) {
480 location
= session_get_trace_archive_location(session
);
481 ret
= notification_thread_command_session_rotation_completed(
482 notification_thread_handle
,
486 session
->last_archived_chunk_id
.value
,
488 lttng_trace_archive_location_put(location
);
489 if (ret
!= LTTNG_OK
) {
490 ERR("Failed to notify notification thread of completed rotation for session %s",
496 check_ongoing_rotation
:
497 if (session
->rotation_state
== LTTNG_ROTATION_STATE_ONGOING
) {
498 chunk_status
= lttng_trace_chunk_get_id(
499 session
->chunk_being_archived
,
500 &chunk_being_archived_id
);
501 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
503 DBG("Rotation of trace archive %" PRIu64
" is still pending for session %s",
504 chunk_being_archived_id
, session
->name
);
505 ret
= timer_session_rotation_pending_check_start(session
,
506 DEFAULT_ROTATE_PENDING_TIMER
);
508 ERR("Failed to re-enable rotation pending timer");
518 /* Call with the session and session_list locks held. */
/*
 * Start a scheduled, time-based rotation of session by calling
 * cmd_rotate_session() with the MOVE_TO_COMPLETED chunk command. Both
 * outcomes are only logged at debug level: a rotation that could not be
 * launched is reported as aborted, with the lttng_strerror() text, and is
 * not treated as fatal.
 */
520 int launch_session_rotation(struct ltt_session
*session
)
523 struct lttng_rotate_session_return rotation_return
;
525 DBG("Launching scheduled time-based rotation on session \"%s\"",
528 ret
= cmd_rotate_session(session
, &rotation_return
, false,
529 LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
530 if (ret
== LTTNG_OK
) {
531 DBG("Scheduled time-based rotation successfully launched on session \"%s\"",
534 /* Don't consider errors as fatal. */
535 DBG("Scheduled time-based rotation aborted for session %s: %s",
536 session
->name
, lttng_strerror(ret
));
/*
 * Dispatch a queued job to its handler:
 *   - SCHEDULED_ROTATION jobs go to launch_session_rotation();
 *   - CHECK_PENDING_ROTATION jobs go to check_session_rotation_pending(),
 *     which also receives the notification thread handle.
 * NOTE(review): the handling of other job types is not visible in this
 * excerpt.
 */
542 int run_job(struct rotation_thread_job
*job
, struct ltt_session
*session
,
543 struct notification_thread_handle
*notification_thread_handle
)
548 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
549 ret
= launch_session_rotation(session
);
551 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
552 ret
= check_session_rotation_pending(session
,
553 notification_thread_handle
);
/*
 * Service the job queue of the rotation thread.
 *
 * The queue lock is held only long enough to check for emptiness and to
 * unlink the first job. The job is then run through run_job() with the
 * session lock held, after which the session reference held by the job is
 * released and the session list is unlocked. A job whose session can no
 * longer be used is logged at debug level and dropped along with its
 * reference.
 *
 * NOTE(review): the loop structure, the point where the session list lock
 * is taken and the release of the job memory are not visible in this
 * excerpt.
 */
562 int handle_job_queue(struct rotation_thread_handle
*handle
,
563 struct rotation_thread
*state
,
564 struct rotation_thread_timer_queue
*queue
)
569 struct ltt_session
*session
;
570 struct rotation_thread_job
*job
;
572 /* Take the queue lock only to pop an element from the list. */
573 pthread_mutex_lock(&queue
->lock
);
574 if (cds_list_empty(&queue
->list
)) {
575 pthread_mutex_unlock(&queue
->lock
);
578 job
= cds_list_first_entry(&queue
->list
,
580 cds_list_del(&job
->head
);
581 pthread_mutex_unlock(&queue
->lock
);
584 session
= job
->session
;
586 DBG("Session \"%s\" not found",
589 * This is a non-fatal error, and we cannot report it to
590 * the user (timer), so just print the error and
591 * continue the processing.
593 * While the timer thread will purge pending signals for
594 * a session on the session's destruction, it is
595 * possible for a job targeting that session to have
596 * already been queued before it was destroyed.
599 session_put(session
);
600 session_unlock_list();
604 session_lock(session
);
605 ret
= run_job(job
, session
, handle
->notification_thread_handle
);
606 session_unlock(session
);
607 /* Release reference held by the job. */
608 session_put(session
);
609 session_unlock_list();
/*
 * React to a session-consumed-size notification by rotating the session.
 *
 * Steps visible in this excerpt:
 *   - reject conditions whose type is not SESSION_CONSUMED_SIZE;
 *   - fetch the session name from the condition and the consumed size from
 *     the evaluation;
 *   - look the session up by name (a missing session is logged with ERR
 *     after the session list is unlocked) and lock it;
 *   - unsubscribe from the current consumed-size condition;
 *   - call cmd_rotate_session(); a pending rotation
 *     (-LTTNG_ERR_ROTATION_PENDING) is only logged at debug level, any
 *     other non-OK result is logged with ERR;
 *   - subscribe again with a threshold of consumed + session->rotate_size;
 *   - unlock the session, drop the reference and unlock the session list.
 */
623 int handle_condition(const struct lttng_condition
*condition
,
624 const struct lttng_evaluation
*evaluation
,
625 struct notification_thread_handle
*notification_thread_handle
)
628 const char *condition_session_name
= NULL
;
629 enum lttng_condition_type condition_type
;
630 enum lttng_condition_status condition_status
;
631 enum lttng_evaluation_status evaluation_status
;
633 struct ltt_session
*session
;
635 condition_type
= lttng_condition_get_type(condition
);
637 if (condition_type
!= LTTNG_CONDITION_TYPE_SESSION_CONSUMED_SIZE
) {
639 ERR("Condition type and session usage type are not the same");
643 /* Fetch info to test */
644 condition_status
= lttng_condition_session_consumed_size_get_session_name(
645 condition
, &condition_session_name
);
646 if (condition_status
!= LTTNG_CONDITION_STATUS_OK
) {
647 ERR("Session name could not be fetched");
651 evaluation_status
= lttng_evaluation_session_consumed_size_get_consumed_size(evaluation
,
653 if (evaluation_status
!= LTTNG_EVALUATION_STATUS_OK
) {
654 ERR("Failed to get evaluation");
660 session
= session_find_by_name(condition_session_name
);
663 session_unlock_list();
664 ERR("Session \"%s\" not found",
665 condition_session_name
);
668 session_lock(session
);
670 ret
= unsubscribe_session_consumed_size_rotation(session
,
671 notification_thread_handle
);
676 ret
= cmd_rotate_session(session
, NULL
, false,
677 LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
678 if (ret
== -LTTNG_ERR_ROTATION_PENDING
) {
679 DBG("Rotate already pending, subscribe to the next threshold value");
680 } else if (ret
!= LTTNG_OK
) {
681 ERR("Failed to rotate on size notification with error: %s",
682 lttng_strerror(ret
));
686 ret
= subscribe_session_consumed_size_rotation(session
,
687 consumed
+ session
->rotate_size
,
688 notification_thread_handle
);
690 ERR("Failed to subscribe to session consumed size condition");
696 session_unlock(session
);
697 session_put(session
);
698 session_unlock_list();
/*
 * Handle activity on the notification channel socket.
 *
 * The function first asks the file-level channel whether a notification is
 * pending, then receives the next notification. A DROPPED status is not
 * treated as an error, a CLOSED channel and unknown statuses are logged
 * with ERR. The condition and evaluation of a received notification are
 * forwarded to handle_condition() and the notification is destroyed
 * afterwards.
 *
 * NOTE(review): the second argument of the has_pending_notification call
 * reads as a mangled address-of notification_pending in this excerpt;
 * verify the encoding of that token in the real file.
 */
704 int handle_notification_channel(int fd
,
705 struct rotation_thread_handle
*handle
,
706 struct rotation_thread
*state
)
709 bool notification_pending
;
710 struct lttng_notification
*notification
= NULL
;
711 enum lttng_notification_channel_status status
;
712 const struct lttng_evaluation
*notification_evaluation
;
713 const struct lttng_condition
*notification_condition
;
715 status
= lttng_notification_channel_has_pending_notification(
716 rotate_notification_channel
, ¬ification_pending
);
717 if (status
!= LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
) {
718 ERR("Error occurred while checking for pending notification");
723 if (!notification_pending
) {
728 /* Receive the next notification. */
729 status
= lttng_notification_channel_get_next_notification(
730 rotate_notification_channel
,
734 case LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
:
736 case LTTNG_NOTIFICATION_CHANNEL_STATUS_NOTIFICATIONS_DROPPED
:
737 /* Not an error, we will wait for the next one */
740 case LTTNG_NOTIFICATION_CHANNEL_STATUS_CLOSED
:
741 ERR("Notification channel was closed");
745 /* Unhandled conditions / errors. */
746 ERR("Unknown notification channel status");
751 notification_condition
= lttng_notification_get_condition(notification
);
752 notification_evaluation
= lttng_notification_get_evaluation(notification
);
754 ret
= handle_condition(notification_condition
, notification_evaluation
,
755 handle
->notification_thread_handle
);
758 lttng_notification_destroy(notification
);
/*
 * Entry point of the rotation thread; data is the rotation_thread_handle.
 *
 * The thread registers itself with RCU and with the health subsystem,
 * initializes its state through init_thread_state(), then waits on its
 * poll set without a timeout. An interrupted wait (EINTR) is restarted.
 * For each ready descriptor:
 *   - LPOLLERR is logged as an error;
 *   - activity on the notification channel socket is passed to
 *     handle_notification_channel();
 *   - any other descriptor (job queue pipe or quit pipe) triggers
 *     handle_job_queue(), so the queue is also flushed when quitting; for
 *     the queue pipe one byte is then read to consume the wake-up.
 * On exit the thread state is finalized and the health and RCU
 * registrations are undone.
 */
763 void *thread_rotation(void *data
)
766 struct rotation_thread_handle
*handle
= data
;
767 struct rotation_thread thread
;
770 DBG("Started rotation thread");
771 rcu_register_thread();
773 health_register(the_health_sessiond
, HEALTH_SESSIOND_TYPE_ROTATION
);
774 health_code_update();
777 ERR("Invalid thread context provided");
781 queue_pipe_fd
= lttng_pipe_get_readfd(
782 handle
->rotation_timer_queue
->event_pipe
);
785 ret
= init_thread_state(handle
, &thread
);
794 DBG("Entering poll wait");
795 ret
= lttng_poll_wait(&thread
.events
, -1);
796 DBG("Poll wait returned (%i)", ret
);
800 * Restart interrupted system call.
802 if (errno
== EINTR
) {
805 ERR("Error encountered during lttng_poll_wait (%i)", ret
);
810 for (i
= 0; i
< fd_count
; i
++) {
811 int fd
= LTTNG_POLL_GETFD(&thread
.events
, i
);
812 uint32_t revents
= LTTNG_POLL_GETEV(&thread
.events
, i
);
814 DBG("Handling fd (%i) activity (%u)",
817 if (revents
& LPOLLERR
) {
818 ERR("Polling returned an error on fd %i", fd
);
822 if (fd
== rotate_notification_channel
->socket
) {
823 ret
= handle_notification_channel(fd
, handle
,
826 ERR("Error occurred while handling activity on notification channel socket");
830 /* Job queue or quit pipe activity. */
833 * The job queue is serviced if there is
834 * activity on the quit pipe to ensure it is
835 * flushed and all references held in the queue
838 ret
= handle_job_queue(handle
, &thread
,
839 handle
->rotation_timer_queue
);
841 ERR("Failed to handle rotation timer pipe event");
845 if (fd
== queue_pipe_fd
) {
848 ret
= lttng_read(fd
, &buf
, 1);
850 ERR("Failed to read from wakeup pipe (fd = %i)", fd
);
854 DBG("Quit pipe activity");
863 fini_thread_state(&thread
);
865 health_unregister(the_health_sessiond
);
866 rcu_thread_offline();
867 rcu_unregister_thread();
/*
 * Shutdown callback of the rotation thread; thread_data is the
 * rotation_thread_handle. It notifies the write end of the quit pipe and
 * returns true only when notify_thread_pipe() returns 1.
 */
872 bool shutdown_rotation_thread(void *thread_data
)
874 struct rotation_thread_handle
*handle
= thread_data
;
875 const int write_fd
= lttng_pipe_get_writefd(handle
->quit_pipe
);
877 return notify_thread_pipe(write_fd
) == 1;
/*
 * Create the thread named Rotation through lttng_thread_create(), with
 * shutdown_rotation_thread() as one of its callbacks, then drop the local
 * reference on the created thread with lttng_thread_put().
 * NOTE(review): the remaining arguments and the return paths are not
 * visible in this excerpt.
 */
880 bool launch_rotation_thread(struct rotation_thread_handle
*handle
)
882 struct lttng_thread
*thread
;
884 thread
= lttng_thread_create("Rotation",
886 shutdown_rotation_thread
,
892 lttng_thread_put(thread
);