From 95245d44f4c3d2f062d3c348710b605002168446 Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=A9mie=20Galarneau?= Date: Mon, 30 Sep 2019 15:57:33 -0400 Subject: [PATCH] Fix: trace chunk reported unknown before close command execution MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The check for the existence of a trace chunk is done by using the regular 'find' operations on a trace chunk registry in both the relay and consumer daemons. The 'find' operations attempt to acquire a reference to the trace chunk being looked-up. On failure to acquire a reference, a trace chunk is reported as being "unknown". The rotation completion check uses this mechanism to determine if a trace chunk is still in use. A close command defers a given operation until a trace chunk is no longer in use (when its last reference is dropped). Hence, a thread can attempt to 'find' a trace chunk, fail to acquire a reference, and fail to see the effects of the close command. In other words, the thread that has dropped the last reference to the chunk could still be running the close command of a trace chunk that is reported to be "unknown" by the consumer and relay daemons. To fix this, dedicated "chunk_exists" operations are introduced. These operations do not attempt to acquire a trace chunk. They only look it up in the registry's internal hash table. As the removal of the trace chunk from the hash table is performed _after_ the execution of its close command, it provides a reliable way to ensure that a chunk that had a close command could complete it before being reported as "unknown"/no longer in use. In terms of provided guarantees, this changes the moment at which a trace chunk is considered to no longer exist from the moment its last reference was dropped to the moment after the execution of its close command has completed. Signed-off-by: Jérémie Galarneau --- src/bin/lttng-relayd/main.c | 24 +++++++------ src/bin/lttng-relayd/sessiond-trace-chunks.c | 36 ++++++++++++++++++++ src/bin/lttng-relayd/sessiond-trace-chunks.h | 5 +++ src/common/consumer/consumer.c | 19 +++++++---- src/common/trace-chunk-registry.h | 10 ++++++ src/common/trace-chunk.c | 35 +++++++++++++++++++ 6 files changed, 112 insertions(+), 17 deletions(-) diff --git a/src/bin/lttng-relayd/main.c b/src/bin/lttng-relayd/main.c index 0cd163fe8..5b32341cd 100644 --- a/src/bin/lttng-relayd/main.c +++ b/src/bin/lttng-relayd/main.c @@ -2674,8 +2674,8 @@ static int relay_trace_chunk_exists(const struct lttcomm_relayd_hdr *recv_hdr, struct lttcomm_relayd_trace_chunk_exists *msg; struct lttcomm_relayd_trace_chunk_exists_reply reply = {}; struct lttng_buffer_view header_view; - struct lttng_trace_chunk *chunk = NULL; uint64_t chunk_id; + bool chunk_exists; if (!session || !conn->version_check_done) { ERR("Trying to close a trace chunk before version check"); @@ -2700,25 +2700,29 @@ static int relay_trace_chunk_exists(const struct lttcomm_relayd_hdr *recv_hdr, msg = (typeof(msg)) header_view.data; chunk_id = be64toh(msg->chunk_id); - chunk = sessiond_trace_chunk_registry_get_chunk( + ret = sessiond_trace_chunk_registry_chunk_exists( sessiond_trace_chunk_registry, conn->session->sessiond_uuid, conn->session->id, - chunk_id); - - reply = (typeof(reply)) { - .generic.ret_code = htobe32((uint32_t) LTTNG_OK), - .trace_chunk_exists = !!chunk, + chunk_id, &chunk_exists); + /* + * If ret is not 0, send the reply and report the error to the caller. + * It is a protocol (or internal) error and the session/connection + * should be torn down. + */ + reply = (typeof(reply)){ + .generic.ret_code = htobe32((uint32_t) + (ret == 0 ? LTTNG_OK : LTTNG_ERR_INVALID_PROTOCOL)), + .trace_chunk_exists = ret == 0 ? chunk_exists : 0, }; - send_ret = conn->sock->ops->sendmsg(conn->sock, - &reply, sizeof(reply), 0); + send_ret = conn->sock->ops->sendmsg( + conn->sock, &reply, sizeof(reply), 0); if (send_ret < (ssize_t) sizeof(reply)) { ERR("Failed to send \"create trace chunk\" command reply (ret = %zd)", send_ret); ret = -1; } end_no_reply: - lttng_trace_chunk_put(chunk); return ret; } diff --git a/src/bin/lttng-relayd/sessiond-trace-chunks.c b/src/bin/lttng-relayd/sessiond-trace-chunks.c index e7a7d112d..495227226 100644 --- a/src/bin/lttng-relayd/sessiond-trace-chunks.c +++ b/src/bin/lttng-relayd/sessiond-trace-chunks.c @@ -476,3 +476,39 @@ sessiond_trace_chunk_registry_get_chunk( end: return chunk; } + +int sessiond_trace_chunk_registry_chunk_exists( + struct sessiond_trace_chunk_registry *sessiond_registry, + const lttng_uuid sessiond_uuid, + uint64_t session_id, uint64_t chunk_id, bool *chunk_exists) +{ + int ret; + struct trace_chunk_registry_ht_element *element; + struct trace_chunk_registry_ht_key key; + + lttng_uuid_copy(key.sessiond_uuid, sessiond_uuid); + element = trace_chunk_registry_ht_element_find(sessiond_registry, &key); + if (!element) { + char uuid_str[UUID_STR_LEN]; + + lttng_uuid_to_str(sessiond_uuid, uuid_str); + /* + * While this certainly means that the chunk does not exist, + * it is unexpected for a chunk existence query to target a + * session daemon that does not have an active + * connection/registry. This would indicate a protocol + * (or internal) error. + */ + ERR("Failed to find trace chunk registry of sessiond {%s}", + uuid_str); + ret = -1; + goto end; + } + + ret = lttng_trace_chunk_registry_chunk_exists( + element->trace_chunk_registry, + session_id, chunk_id, chunk_exists); + trace_chunk_registry_ht_element_put(element); +end: + return ret; +} diff --git a/src/bin/lttng-relayd/sessiond-trace-chunks.h b/src/bin/lttng-relayd/sessiond-trace-chunks.h index 39804679c..c62fd3e6f 100644 --- a/src/bin/lttng-relayd/sessiond-trace-chunks.h +++ b/src/bin/lttng-relayd/sessiond-trace-chunks.h @@ -55,4 +55,9 @@ sessiond_trace_chunk_registry_get_chunk( const lttng_uuid sessiond_uuid, uint64_t session_id, uint64_t chunk_id); +int sessiond_trace_chunk_registry_chunk_exists( + struct sessiond_trace_chunk_registry *sessiond_registry, + const lttng_uuid sessiond_uuid, + uint64_t session_id, uint64_t chunk_id, bool *chunk_exists); + #endif /* SESSIOND_TRACE_CHUNK_REGISTRY_H */ diff --git a/src/common/consumer/consumer.c b/src/common/consumer/consumer.c index e1426f116..0086d748f 100644 --- a/src/common/consumer/consumer.c +++ b/src/common/consumer/consumer.c @@ -4772,12 +4772,11 @@ enum lttcomm_return_code lttng_consumer_trace_chunk_exists( { int ret; enum lttcomm_return_code ret_code; - struct lttng_trace_chunk *chunk; char relayd_id_buffer[MAX_INT_DEC_LEN(*relayd_id)]; const char *relayd_id_str = "(none)"; const bool is_local_trace = !relayd_id; struct consumer_relayd_sock_pair *relayd = NULL; - bool chunk_exists_remote; + bool chunk_exists_local, chunk_exists_remote; if (relayd_id) { int ret; @@ -4795,13 +4794,19 @@ enum lttcomm_return_code lttng_consumer_trace_chunk_exists( DBG("Consumer trace chunk exists command: relayd_id = %s" ", chunk_id = %" PRIu64, relayd_id_str, chunk_id); - chunk = lttng_trace_chunk_registry_find_chunk( + ret = lttng_trace_chunk_registry_chunk_exists( consumer_data.chunk_registry, session_id, - chunk_id); - DBG("Trace chunk %s locally", chunk ? "exists" : "does not exist"); - if (chunk) { + chunk_id, &chunk_exists_local); + if (ret) { + /* Internal error. */ + ERR("Failed to query the existence of a trace chunk"); + ret_code = LTTCOMM_CONSUMERD_FATAL; + goto end_rcu_unlock; + } + DBG("Trace chunk %s locally", + chunk_exists_local ? "exists" : "does not exist"); + if (chunk_exists_local) { ret_code = LTTCOMM_CONSUMERD_TRACE_CHUNK_EXISTS_LOCAL; - lttng_trace_chunk_put(chunk); goto end; } else if (is_local_trace) { ret_code = LTTCOMM_CONSUMERD_UNKNOWN_TRACE_CHUNK; diff --git a/src/common/trace-chunk-registry.h b/src/common/trace-chunk-registry.h index 23df91a3e..afe942ba9 100644 --- a/src/common/trace-chunk-registry.h +++ b/src/common/trace-chunk-registry.h @@ -83,6 +83,16 @@ lttng_trace_chunk_registry_find_chunk( const struct lttng_trace_chunk_registry *registry, uint64_t session_id, uint64_t chunk_id); +/* + * Query the existence of a trace chunk by session_id and chunk_id. + * + * Returns 0 on success, a negative value on error. + */ +LTTNG_HIDDEN +int lttng_trace_chunk_registry_chunk_exists( + const struct lttng_trace_chunk_registry *registry, + uint64_t session_id, uint64_t chunk_id, bool *chunk_exists); + /* * Look-up an anonymous trace chunk by session_id. * A reference is acquired on behalf of the caller. diff --git a/src/common/trace-chunk.c b/src/common/trace-chunk.c index 601d402e0..affe3ac20 100644 --- a/src/common/trace-chunk.c +++ b/src/common/trace-chunk.c @@ -1358,6 +1358,41 @@ lttng_trace_chunk_registry_find_chunk( session_id, &chunk_id); } +LTTNG_HIDDEN +int lttng_trace_chunk_registry_chunk_exists( + const struct lttng_trace_chunk_registry *registry, + uint64_t session_id, uint64_t chunk_id, bool *chunk_exists) +{ + int ret = 0; + const struct lttng_trace_chunk_registry_element target_element = { + .chunk.id.is_set = true, + .chunk.id.value = chunk_id, + .session_id = session_id, + }; + const unsigned long element_hash = + lttng_trace_chunk_registry_element_hash( + &target_element); + struct cds_lfht_node *published_node; + struct cds_lfht_iter iter; + + rcu_read_lock(); + cds_lfht_lookup(registry->ht, + element_hash, + lttng_trace_chunk_registry_element_match, + &target_element, + &iter); + published_node = cds_lfht_iter_get_node(&iter); + if (!published_node) { + *chunk_exists = false; + goto end; + } + + *chunk_exists = !cds_lfht_is_node_deleted(published_node); +end: + rcu_read_unlock(); + return ret; +} + LTTNG_HIDDEN struct lttng_trace_chunk * lttng_trace_chunk_registry_find_anonymous_chunk( -- 2.34.1