| 1 | #!/usr/bin/env python3 |
| 2 | # |
| 3 | # SPDX-FileCopyrightText: Kienan Stewart <kstewart@efficios.com> |
| 4 | # SPDX-License-Identifier: GPL-2.0-only |
| 5 | |
| 6 | """ |
| 7 | Test that the consumerd doesn't leak file descriptor allocations in /dev/shm |
| 8 | when the relayd exits before instrumented applications start. |
| 9 | |
| 10 | @see https://bugs.lttng.org/issues/1411 |
| 11 | """ |
| 12 | |
| 13 | import os |
| 14 | import pathlib |
| 15 | import subprocess |
| 16 | import sys |
| 17 | |
| 18 | test_utils_import_path = pathlib.Path(__file__).absolute().parents[3] / "utils" |
| 19 | sys.path.append(str(test_utils_import_path)) |
| 20 | |
| 21 | import lttngtest |
| 22 | |
| 23 | |
def get_consumerd_pid(tap, parent, match_string):
    """
    Return the PID of the single child of `parent` whose command line
    matches `match_string`, as reported by `pgrep -P <parent> -f <match>`.

    This is a best-effort lookup: any failure (pgrep cannot be started, zero
    or several matching lines, output that is not an integer) is reported
    through `tap.diagnostic()` and 0 is returned instead of raising.
    """
    pid = 0
    try:
        # communicate() drains stdout while waiting for pgrep to exit, and the
        # context manager closes the pipe afterwards. Calling wait() before
        # reading a PIPE can deadlock once the pipe buffer fills, and would
        # leave the pipe's file descriptor open.
        with subprocess.Popen(
            ["pgrep", "-P", str(parent), "-f", match_string],
            stdout=subprocess.PIPE,
        ) as process:
            stdout, _ = process.communicate()

        output = stdout.decode("UTF-8").splitlines()
        if len(output) != 1:
            raise Exception(
                "Unexpected number of output lines (got {}): {}".format(
                    len(output), output
                )
            )
        pid = int(output[0])
    except Exception as e:
        # Deliberately broad: the caller treats a PID of 0 as "not found".
        tap.diagnostic(
            "Failed to find child process of '{}' matching '{}': '{}'".format(
                parent, match_string, str(e)
            )
        )
    return pid
| 47 | |
| 48 | |
def count_process_dev_shm_fds(pid):
    """
    Count the entries under /proc/<pid>/fd that are symlinks resolving to a
    path starting with "/dev/shm/shm-ust-consumer".

    A `pid` of 0 is treated as "no process" and yields 0.
    """
    if pid == 0:
        return 0

    matches = 0
    fd_dir = os.path.join("/proc", str(pid), "fd")
    for parent, _subdirs, entries in os.walk(fd_dir):
        for entry in entries:
            link = pathlib.Path(os.path.join(parent, entry))
            try:
                if not link.is_symlink():
                    continue
                target = str(link.resolve())
                if target.startswith("/dev/shm/shm-ust-consumer"):
                    matches += 1
            except FileNotFoundError:
                # The fd directory is live: an entry listed a moment ago may
                # already have been closed by the process.
                pass
    return matches
| 66 | |
| 67 | |
def count_dev_shm_fds(tap, test_env):
    """
    Return a `(consumerd32, consumerd64)` tuple of /dev/shm FD counts.

    Each consumerd is looked up with get_consumerd_pid() as a child of the
    `test_env._sessiond.pid` process, then its FDs are counted with
    count_process_dev_shm_fds(). A consumerd that cannot be found
    contributes 0.
    """
    return tuple(
        count_process_dev_shm_fds(
            get_consumerd_pid(tap, test_env._sessiond.pid, consumerd_name)
        )
        for consumerd_name in ("ustconsumerd32", "ustconsumerd64")
    )
| 74 | |
| 75 | |
def test_fd_leak(tap, test_env, buffer_sharing_policy, kill_relayd=True):
    """
    Check that the consumerd /dev/shm FD counts do not grow across a session.

    A live session streaming to the environment's relayd is created with a
    user space channel using `buffer_sharing_policy`, and started. When
    `kill_relayd` is true the relayd is then terminated. Three test
    applications are run one after the other, the FD counts being sampled
    after each, and the session is finally stopped and destroyed.

    Two TAP results are reported: the post-start and post-destroy counts
    must be equal, and (per-UID buffers only, skipped otherwise) the counts
    after each of the three applications must be equal.
    """
    tap.diagnostic(
        "test_fd_leak with buffer sharing policy {}, kill relayd: {}".format(
            buffer_sharing_policy, kill_relayd
        )
    )

    lttng_client = lttngtest.LTTngClient(test_env, log=tap.diagnostic)
    relayd_url = "net://localhost:{}:{}/".format(
        test_env.lttng_relayd_control_port, test_env.lttng_relayd_data_port
    )
    session_output = lttngtest.NetworkSessionOutputLocation(relayd_url)

    session = lttng_client.create_session(output=session_output, live=True)
    user_channel = session.add_channel(
        lttngtest.lttngctl.TracingDomain.User,
        buffer_sharing_policy=buffer_sharing_policy,
    )
    user_channel.add_recording_rule(lttngtest.lttngctl.UserTracepointEventRule())
    session.start()

    fds_after_start = count_dev_shm_fds(tap, test_env)

    # Optionally take the relayd down before any application registers.
    if kill_relayd:
        test_env._terminate_relayd()

    # Run three applications back to back, sampling the FD counts after each.
    # NOTE(review): 10 is presumably the number of events each application
    # emits -- confirm against lttngtest.
    fds_after_apps = []
    for _ in range(3):
        test_env.launch_wait_trace_test_application(10)
        fds_after_apps.append(count_dev_shm_fds(tap, test_env))

    session.stop()
    session.destroy()

    fds_after_destroy = count_dev_shm_fds(tap, test_env)

    tap.diagnostic(
        "FD counts post-start: {}, post-destroy: {}".format(
            fds_after_start, fds_after_destroy
        )
    )
    tap.test(
        fds_after_start == fds_after_destroy,
        "Count of consumerd FDs in /dev/shm are equal after session start then after destroy",
    )

    tap.diagnostic(
        "FD counts post-app-1: {}, post-app-2: {}, post-app-3: {}".format(
            *fds_after_apps
        )
    )

    is_per_uid = (
        buffer_sharing_policy == lttngtest.lttngctl.BufferSharingPolicy.PerUID
    )
    if not is_per_uid:
        tap.skip(
            "Count of consumerds FDs in /dev/shm doesn't leak over several application invocations - no mechanism is available to guarantee buffer reclamation within a given time frame"
        )
        return

    after_app1, after_app2, after_app3 = fds_after_apps
    tap.test(
        after_app1 == after_app2 == after_app3,
        "Count of consumerd FDs in /dev/shm doesn't leak over several application invocations",
    )
| 142 | |
| 143 | |
# Every combination below runs test_fd_leak() once, which reports two TAP
# results (a test, plus a test or a skip): 2 x 2 x 2 = 8.
# NOTE(review): 8 is presumably the planned number of TAP results -- confirm
# against lttngtest.TapGenerator.
tap = lttngtest.TapGenerator(8)

relayd_kill_modes = (True, False)
buffer_sharing_policies = (
    lttngtest.lttngctl.BufferSharingPolicy.PerUID,
    lttngtest.lttngctl.BufferSharingPolicy.PerPID,
)

for kill_relayd in relayd_kill_modes:
    for buffer_sharing_policy in buffer_sharing_policies:
        # A fresh environment (sessiond + relayd) is used for each combination.
        with lttngtest.test_environment(
            with_sessiond=True, with_relayd=True, log=tap.diagnostic
        ) as test_env:
            test_fd_leak(tap, test_env, buffer_sharing_policy, kill_relayd)

exit_status = 0 if tap.is_successful else 1
sys.exit(exit_status)