lib/ringbuffer/ring_buffer_frontend.c

   1 /* SPDX-License-Identifier: (GPL-2.0 OR LGPL-2.1)
   2  *
   3  * ring_buffer_frontend.c
   4  *
   5  * Copyright (C) 2005-2012 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   6  *
   7  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
   8  * recorder (overwrite) modes. See thesis:
   9  *
  10  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  11  * dissertation, Ecole Polytechnique de Montreal.
  12  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  13  *
  14  * - Algorithm presentation in Chapter 5:
  15  *     "Lockless Multi-Core High-Throughput Buffering".
  16  * - Algorithm formal verification in Section 8.6:
  17  *     "Formal verification of LTTng"
  18  *
  19  * Author:
  20  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  21  *
  22  * Inspired from LTT and RelayFS:
  23  *  Karim Yaghmour <karim@opersys.com>
  24  *  Tom Zanussi <zanussi@us.ibm.com>
  25  *  Bob Wisniewski <bob@watson.ibm.com>
  26  * And from K42 :
  27  *  Bob Wisniewski <bob@watson.ibm.com>
  28  *
  29  * Buffer reader semantic :
  30  *
  31  * - get_subbuf_size
  32  * while buffer is not finalized and empty
  33  *   - get_subbuf
  34  *     - if return value != 0, continue
  35  *   - splice one subbuffer worth of data to a pipe
  36  *   - splice the data from pipe to disk/network
  37  *   - put_subbuf
  38  */
  39
  40 #include <linux/delay.h>
  41 #include <linux/module.h>
  42 #include <linux/percpu.h>
  43 #include <asm/cacheflush.h>
  44
  45 #include <wrapper/ringbuffer/config.h>
  46 #include <wrapper/ringbuffer/backend.h>
  47 #include <wrapper/ringbuffer/frontend.h>
  48 #include <wrapper/ringbuffer/iterator.h>
  49 #include <wrapper/ringbuffer/nohz.h>
  50 #include <wrapper/atomic.h>
  51 #include <wrapper/kref.h>
  52 #include <wrapper/percpu-defs.h>
  53 #include <wrapper/timer.h>
  54 #include <wrapper/vmalloc.h>
  55
  56 /*
  57  * Internal structure representing offsets to use at a sub-buffer switch.
  58  */
  59 struct switch_offsets {
  60         unsigned long begin, end, old;
  61         size_t pre_header_padding, size;
  62         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
  63                      switch_old_end:1;
  64 };
  65
  66 #ifdef CONFIG_NO_HZ
  67 enum tick_nohz_val {
  68         TICK_NOHZ_STOP,
  69         TICK_NOHZ_FLUSH,
  70         TICK_NOHZ_RESTART,
  71 };
  72
/*
 * Notifier chain called with an enum tick_nohz_val event by the
 * lib_ring_buffer_tick_nohz_{flush,stop,restart}() helpers.
 */
static ATOMIC_NOTIFIER_HEAD(tick_nohz_notifier);
  74 #endif /* CONFIG_NO_HZ */
  75
/*
 * Per-cpu lock taken by ring_buffer_tick_nohz_callback() around starting and
 * stopping the switch/read timers of the local cpu's buffer.
 */
static DEFINE_PER_CPU(spinlock_t, ring_buffer_nohz_lock);

/*
 * Per-cpu counter exported to other modules.
 * NOTE(review): presumably the ring buffer nesting depth on each cpu; its
 * users are not visible in this part of the file -- confirm.
 */
DEFINE_PER_CPU(unsigned int, lib_ring_buffer_nesting);
EXPORT_PER_CPU_SYMBOL(lib_ring_buffer_nesting);

/* Forward declarations: both helpers are called below before being defined. */
static
void lib_ring_buffer_print_errors(struct channel *chan,
                                  struct lib_ring_buffer *buf, int cpu);
static
void _lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf,
                enum switch_mode mode);
  87
  88 static
  89 int lib_ring_buffer_poll_deliver(const struct lib_ring_buffer_config *config,
  90                                  struct lib_ring_buffer *buf,
  91                                  struct channel *chan)
  92 {
  93         unsigned long consumed_old, consumed_idx, commit_count, write_offset;
  94
  95         consumed_old = atomic_long_read(&buf->consumed);
  96         consumed_idx = subbuf_index(consumed_old, chan);
  97         commit_count = v_read(config, &buf->commit_cold[consumed_idx].cc_sb);
  98         /*
  99          * No memory barrier here, since we are only interested
 100          * in a statistically correct polling result. The next poll will
 101          * get the data is we are racing. The mb() that ensures correct
 102          * memory order is in get_subbuf.
 103          */
 104         write_offset = v_read(config, &buf->offset);
 105
 106         /*
 107          * Check that the subbuffer we are trying to consume has been
 108          * already fully committed.
 109          */
 110
 111         if (((commit_count - chan->backend.subbuf_size)
 112              & chan->commit_count_mask)
 113             - (buf_trunc(consumed_old, chan)
 114                >> chan->backend.num_subbuf_order)
 115             != 0)
 116                 return 0;
 117
 118         /*
 119          * Check that we are not about to read the same subbuffer in
 120          * which the writer head is.
 121          */
 122         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_old, chan)
 123             == 0)
 124                 return 0;
 125
 126         return 1;
 127 }
 128
 129 /*
 130  * Must be called under cpu hotplug protection.
 131  */
 132 void lib_ring_buffer_free(struct lib_ring_buffer *buf)
 133 {
 134         struct channel *chan = buf->backend.chan;
 135
 136         lib_ring_buffer_print_errors(chan, buf, buf->backend.cpu);
 137         lttng_kvfree(buf->commit_hot);
 138         lttng_kvfree(buf->commit_cold);
 139
 140         lib_ring_buffer_backend_free(&buf->backend);
 141 }
 142
 143 /**
 144  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 145  * @buf: Ring buffer.
 146  *
 147  * Effectively empty the ring buffer. Should be called when the buffer is not
 148  * used for writing. The ring buffer can be opened for reading, but the reader
 149  * should not be using the iterator concurrently with reset. The previous
 150  * current iterator record is reset.
 151  */
 152 void lib_ring_buffer_reset(struct lib_ring_buffer *buf)
 153 {
 154         struct channel *chan = buf->backend.chan;
 155         const struct lib_ring_buffer_config *config = &chan->backend.config;
 156         unsigned int i;
 157
 158         /*
 159          * Reset iterator first. It will put the subbuffer if it currently holds
 160          * it.
 161          */
 162         lib_ring_buffer_iterator_reset(buf);
 163         v_set(config, &buf->offset, 0);
 164         for (i = 0; i < chan->backend.num_subbuf; i++) {
 165                 v_set(config, &buf->commit_hot[i].cc, 0);
 166                 v_set(config, &buf->commit_hot[i].seq, 0);
 167                 v_set(config, &buf->commit_cold[i].cc_sb, 0);
 168         }
 169         atomic_long_set(&buf->consumed, 0);
 170         atomic_set(&buf->record_disabled, 0);
 171         v_set(config, &buf->last_tsc, 0);
 172         lib_ring_buffer_backend_reset(&buf->backend);
 173         /* Don't reset number of active readers */
 174         v_set(config, &buf->records_lost_full, 0);
 175         v_set(config, &buf->records_lost_wrap, 0);
 176         v_set(config, &buf->records_lost_big, 0);
 177         v_set(config, &buf->records_count, 0);
 178         v_set(config, &buf->records_overrun, 0);
 179         buf->finalized = 0;
 180 }
 181 EXPORT_SYMBOL_GPL(lib_ring_buffer_reset);
 182
 183 /**
 184  * channel_reset - Reset channel to initial values.
 185  * @chan: Channel.
 186  *
 187  * Effectively empty the channel. Should be called when the channel is not used
 188  * for writing. The channel can be opened for reading, but the reader should not
 189  * be using the iterator concurrently with reset. The previous current iterator
 190  * record is reset.
 191  */
 192 void channel_reset(struct channel *chan)
 193 {
 194         /*
 195          * Reset iterators first. Will put the subbuffer if held for reading.
 196          */
 197         channel_iterator_reset(chan);
 198         atomic_set(&chan->record_disabled, 0);
 199         /* Don't reset commit_count_mask, still valid */
 200         channel_backend_reset(&chan->backend);
 201         /* Don't reset switch/read timer interval */
 202         /* Don't reset notifiers and notifier enable bits */
 203         /* Don't reset reader reference count */
 204 }
 205 EXPORT_SYMBOL_GPL(channel_reset);
 206
 207 /*
 208  * Must be called under cpu hotplug protection.
 209  */
 210 int lib_ring_buffer_create(struct lib_ring_buffer *buf,
 211                            struct channel_backend *chanb, int cpu)
 212 {
 213         const struct lib_ring_buffer_config *config = &chanb->config;
 214         struct channel *chan = container_of(chanb, struct channel, backend);
 215         void *priv = chanb->priv;
 216         size_t subbuf_header_size;
 217         u64 tsc;
 218         int ret;
 219
 220         /* Test for cpu hotplug */
 221         if (buf->backend.allocated)
 222                 return 0;
 223
 224         /*
 225          * Paranoia: per cpu dynamic allocation is not officially documented as
 226          * zeroing the memory, so let's do it here too, just in case.
 227          */
 228         memset(buf, 0, sizeof(*buf));
 229
 230         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend, cpu);
 231         if (ret)
 232                 return ret;
 233
 234         buf->commit_hot =
 235                 lttng_kvzalloc_node(ALIGN(sizeof(*buf->commit_hot)
 236                                    * chan->backend.num_subbuf,
 237                                    1 << INTERNODE_CACHE_SHIFT),
 238                         GFP_KERNEL | __GFP_NOWARN,
 239                         cpu_to_node(max(cpu, 0)));
 240         if (!buf->commit_hot) {
 241                 ret = -ENOMEM;
 242                 goto free_chanbuf;
 243         }
 244
 245         buf->commit_cold =
 246                 lttng_kvzalloc_node(ALIGN(sizeof(*buf->commit_cold)
 247                                    * chan->backend.num_subbuf,
 248                                    1 << INTERNODE_CACHE_SHIFT),
 249                         GFP_KERNEL | __GFP_NOWARN,
 250                         cpu_to_node(max(cpu, 0)));
 251         if (!buf->commit_cold) {
 252                 ret = -ENOMEM;
 253                 goto free_commit;
 254         }
 255
 256         init_waitqueue_head(&buf->read_wait);
 257         init_waitqueue_head(&buf->write_wait);
 258         raw_spin_lock_init(&buf->raw_tick_nohz_spinlock);
 259
 260         /*
 261          * Write the subbuffer header for first subbuffer so we know the total
 262          * duration of data gathering.
 263          */
 264         subbuf_header_size = config->cb.subbuffer_header_size();
 265         v_set(config, &buf->offset, subbuf_header_size);
 266         subbuffer_id_clear_noref(config, &buf->backend.buf_wsb[0].id);
 267         tsc = config->cb.ring_buffer_clock_read(buf->backend.chan);
 268         config->cb.buffer_begin(buf, tsc, 0);
 269         v_add(config, subbuf_header_size, &buf->commit_hot[0].cc);
 270
 271         if (config->cb.buffer_create) {
 272                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name);
 273                 if (ret)
 274                         goto free_init;
 275         }
 276
 277         /*
 278          * Ensure the buffer is ready before setting it to allocated and setting
 279          * the cpumask.
 280          * Used for cpu hotplug vs cpumask iteration.
 281          */
 282         smp_wmb();
 283         buf->backend.allocated = 1;
 284
 285         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 286                 CHAN_WARN_ON(chan, cpumask_test_cpu(cpu,
 287                              chan->backend.cpumask));
 288                 cpumask_set_cpu(cpu, chan->backend.cpumask);
 289         }
 290
 291         return 0;
 292
 293         /* Error handling */
 294 free_init:
 295         lttng_kvfree(buf->commit_cold);
 296 free_commit:
 297         lttng_kvfree(buf->commit_hot);
 298 free_chanbuf:
 299         lib_ring_buffer_backend_free(&buf->backend);
 300         return ret;
 301 }
 302
 303 static void switch_buffer_timer(LTTNG_TIMER_FUNC_ARG_TYPE t)
 304 {
 305         struct lib_ring_buffer *buf = lttng_from_timer(buf, t, switch_timer);
 306         struct channel *chan = buf->backend.chan;
 307         const struct lib_ring_buffer_config *config = &chan->backend.config;
 308
 309         /*
 310          * Only flush buffers periodically if readers are active.
 311          */
 312         if (atomic_long_read(&buf->active_readers))
 313                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 314
 315         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 316                 lttng_mod_timer_pinned(&buf->switch_timer,
 317                                  jiffies + chan->switch_timer_interval);
 318         else
 319                 mod_timer(&buf->switch_timer,
 320                           jiffies + chan->switch_timer_interval);
 321 }
 322
 323 /*
 324  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 325  */
 326 static void lib_ring_buffer_start_switch_timer(struct lib_ring_buffer *buf)
 327 {
 328         struct channel *chan = buf->backend.chan;
 329         const struct lib_ring_buffer_config *config = &chan->backend.config;
 330         unsigned int flags = 0;
 331
 332         if (!chan->switch_timer_interval || buf->switch_timer_enabled)
 333                 return;
 334
 335         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 336                 flags = LTTNG_TIMER_PINNED;
 337
 338         lttng_timer_setup(&buf->switch_timer, switch_buffer_timer, flags, buf);
 339         buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
 340
 341         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 342                 add_timer_on(&buf->switch_timer, buf->backend.cpu);
 343         else
 344                 add_timer(&buf->switch_timer);
 345
 346         buf->switch_timer_enabled = 1;
 347 }
 348
 349 /*
 350  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 351  */
 352 static void lib_ring_buffer_stop_switch_timer(struct lib_ring_buffer *buf)
 353 {
 354         struct channel *chan = buf->backend.chan;
 355
 356         if (!chan->switch_timer_interval || !buf->switch_timer_enabled)
 357                 return;
 358
 359         del_timer_sync(&buf->switch_timer);
 360         buf->switch_timer_enabled = 0;
 361 }
 362
 363 /*
 364  * Polling timer to check the channels for data.
 365  */
 366 static void read_buffer_timer(LTTNG_TIMER_FUNC_ARG_TYPE t)
 367 {
 368         struct lib_ring_buffer *buf = lttng_from_timer(buf, t, read_timer);
 369         struct channel *chan = buf->backend.chan;
 370         const struct lib_ring_buffer_config *config = &chan->backend.config;
 371
 372         CHAN_WARN_ON(chan, !buf->backend.allocated);
 373
 374         if (atomic_long_read(&buf->active_readers)
 375             && lib_ring_buffer_poll_deliver(config, buf, chan)) {
 376                 wake_up_interruptible(&buf->read_wait);
 377                 wake_up_interruptible(&chan->read_wait);
 378         }
 379
 380         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 381                 lttng_mod_timer_pinned(&buf->read_timer,
 382                                  jiffies + chan->read_timer_interval);
 383         else
 384                 mod_timer(&buf->read_timer,
 385                           jiffies + chan->read_timer_interval);
 386 }
 387
 388 /*
 389  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 390  */
 391 static void lib_ring_buffer_start_read_timer(struct lib_ring_buffer *buf)
 392 {
 393         struct channel *chan = buf->backend.chan;
 394         const struct lib_ring_buffer_config *config = &chan->backend.config;
 395         unsigned int flags;
 396
 397         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 398             || !chan->read_timer_interval
 399             || buf->read_timer_enabled)
 400                 return;
 401
 402         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 403                 flags = LTTNG_TIMER_PINNED;
 404
 405         lttng_timer_setup(&buf->read_timer, read_buffer_timer, flags, buf);
 406         buf->read_timer.expires = jiffies + chan->read_timer_interval;
 407
 408         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 409                 add_timer_on(&buf->read_timer, buf->backend.cpu);
 410         else
 411                 add_timer(&buf->read_timer);
 412
 413         buf->read_timer_enabled = 1;
 414 }
 415
 416 /*
 417  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 418  */
 419 static void lib_ring_buffer_stop_read_timer(struct lib_ring_buffer *buf)
 420 {
 421         struct channel *chan = buf->backend.chan;
 422         const struct lib_ring_buffer_config *config = &chan->backend.config;
 423
 424         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 425             || !chan->read_timer_interval
 426             || !buf->read_timer_enabled)
 427                 return;
 428
 429         del_timer_sync(&buf->read_timer);
 430         /*
 431          * do one more check to catch data that has been written in the last
 432          * timer period.
 433          */
 434         if (lib_ring_buffer_poll_deliver(config, buf, chan)) {
 435                 wake_up_interruptible(&buf->read_wait);
 436                 wake_up_interruptible(&chan->read_wait);
 437         }
 438         buf->read_timer_enabled = 0;
 439 }
 440
 441 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
 442
 443 enum cpuhp_state lttng_rb_hp_prepare;
 444 enum cpuhp_state lttng_rb_hp_online;
 445
 446 void lttng_rb_set_hp_prepare(enum cpuhp_state val)
 447 {
 448         lttng_rb_hp_prepare = val;
 449 }
 450 EXPORT_SYMBOL_GPL(lttng_rb_set_hp_prepare);
 451
 452 void lttng_rb_set_hp_online(enum cpuhp_state val)
 453 {
 454         lttng_rb_hp_online = val;
 455 }
 456 EXPORT_SYMBOL_GPL(lttng_rb_set_hp_online);
 457
 458 int lttng_cpuhp_rb_frontend_dead(unsigned int cpu,
 459                 struct lttng_cpuhp_node *node)
 460 {
 461         struct channel *chan = container_of(node, struct channel,
 462                                             cpuhp_prepare);
 463         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 464         const struct lib_ring_buffer_config *config = &chan->backend.config;
 465
 466         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 467
 468         /*
 469          * Performing a buffer switch on a remote CPU. Performed by
 470          * the CPU responsible for doing the hotunplug after the target
 471          * CPU stopped running completely. Ensures that all data
 472          * from that remote CPU is flushed.
 473          */
 474         lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 475         return 0;
 476 }
 477 EXPORT_SYMBOL_GPL(lttng_cpuhp_rb_frontend_dead);
 478
 479 int lttng_cpuhp_rb_frontend_online(unsigned int cpu,
 480                 struct lttng_cpuhp_node *node)
 481 {
 482         struct channel *chan = container_of(node, struct channel,
 483                                             cpuhp_online);
 484         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 485         const struct lib_ring_buffer_config *config = &chan->backend.config;
 486
 487         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 488
 489         wake_up_interruptible(&chan->hp_wait);
 490         lib_ring_buffer_start_switch_timer(buf);
 491         lib_ring_buffer_start_read_timer(buf);
 492         return 0;
 493 }
 494 EXPORT_SYMBOL_GPL(lttng_cpuhp_rb_frontend_online);
 495
 496 int lttng_cpuhp_rb_frontend_offline(unsigned int cpu,
 497                 struct lttng_cpuhp_node *node)
 498 {
 499         struct channel *chan = container_of(node, struct channel,
 500                                             cpuhp_online);
 501         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 502         const struct lib_ring_buffer_config *config = &chan->backend.config;
 503
 504         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 505
 506         lib_ring_buffer_stop_switch_timer(buf);
 507         lib_ring_buffer_stop_read_timer(buf);
 508         return 0;
 509 }
 510 EXPORT_SYMBOL_GPL(lttng_cpuhp_rb_frontend_offline);
 511
 512 #else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 513
 514 #ifdef CONFIG_HOTPLUG_CPU
 515
 516 /**
 517  *      lib_ring_buffer_cpu_hp_callback - CPU hotplug callback
 518  *      @nb: notifier block
 519  *      @action: hotplug action to take
 520  *      @hcpu: CPU number
 521  *
 522  *      Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
 523  */
 524 static
 525 int lib_ring_buffer_cpu_hp_callback(struct notifier_block *nb,
 526                                               unsigned long action,
 527                                               void *hcpu)
 528 {
 529         unsigned int cpu = (unsigned long)hcpu;
 530         struct channel *chan = container_of(nb, struct channel,
 531                                             cpu_hp_notifier);
 532         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 533         const struct lib_ring_buffer_config *config = &chan->backend.config;
 534
 535         if (!chan->cpu_hp_enable)
 536                 return NOTIFY_DONE;
 537
 538         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 539
 540         switch (action) {
 541         case CPU_DOWN_FAILED:
 542         case CPU_DOWN_FAILED_FROZEN:
 543         case CPU_ONLINE:
 544         case CPU_ONLINE_FROZEN:
 545                 wake_up_interruptible(&chan->hp_wait);
 546                 lib_ring_buffer_start_switch_timer(buf);
 547                 lib_ring_buffer_start_read_timer(buf);
 548                 return NOTIFY_OK;
 549
 550         case CPU_DOWN_PREPARE:
 551         case CPU_DOWN_PREPARE_FROZEN:
 552                 lib_ring_buffer_stop_switch_timer(buf);
 553                 lib_ring_buffer_stop_read_timer(buf);
 554                 return NOTIFY_OK;
 555
 556         case CPU_DEAD:
 557         case CPU_DEAD_FROZEN:
 558                 /*
 559                  * Performing a buffer switch on a remote CPU. Performed by
 560                  * the CPU responsible for doing the hotunplug after the target
 561                  * CPU stopped running completely. Ensures that all data
 562                  * from that remote CPU is flushed.
 563                  */
 564                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 565                 return NOTIFY_OK;
 566
 567         default:
 568                 return NOTIFY_DONE;
 569         }
 570 }
 571
 572 #endif
 573
 574 #endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 575
 576 #if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER)
/*
 * Notifier callback for the tick_nohz_notifier chain. @nb is the
 * tick_nohz_notifier block embedded in a channel, @val is an
 * enum tick_nohz_val event, @data is unused. Operates on the buffer of the
 * cpu it runs on and always returns 0.
 *
 * For per-cpu buffers, call the reader wakeups before switching the buffer, so
 * that wake-up-tracing generated events are flushed before going idle (in
 * tick_nohz). We test if the spinlock is locked to deal with the race where
 * readers try to sample the ring buffer before we perform the switch. We let
 * the readers retry in that case. If there is data in the buffer, the wake up
 * is going to forbid the CPU running the reader thread from going idle.
 */
static int notrace ring_buffer_tick_nohz_callback(struct notifier_block *nb,
                                                  unsigned long val,
                                                  void *data)
{
        struct channel *chan = container_of(nb, struct channel,
                                            tick_nohz_notifier);
        const struct lib_ring_buffer_config *config = &chan->backend.config;
        struct lib_ring_buffer *buf;
        int cpu = smp_processor_id();

        if (config->alloc != RING_BUFFER_ALLOC_PER_CPU) {
                /*
                 * We don't support keeping the system idle with global buffers
                 * and streaming active. In order to do so, we would need to
                 * sample a non-nohz-cpumask racelessly with the nohz updates
                 * without adding synchronization overhead to nohz. Leave this
                 * use-case out for now.
                 */
                return 0;
        }

        buf = channel_get_ring_buffer(config, chan, cpu);
        switch (val) {
        case TICK_NOHZ_FLUSH:
                /*
                 * Under the buffer's raw spinlock: wake up readers first (only
                 * for timer-driven wakeup with a read interval and active
                 * readers, when data is deliverable or pending), then switch
                 * the buffer if a switch timer interval is configured.
                 */
                raw_spin_lock(&buf->raw_tick_nohz_spinlock);
                if (config->wakeup == RING_BUFFER_WAKEUP_BY_TIMER
                    && chan->read_timer_interval
                    && atomic_long_read(&buf->active_readers)
                    && (lib_ring_buffer_poll_deliver(config, buf, chan)
                        || lib_ring_buffer_pending_data(config, buf, chan))) {
                        wake_up_interruptible(&buf->read_wait);
                        wake_up_interruptible(&chan->read_wait);
                }
                if (chan->switch_timer_interval)
                        lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
                raw_spin_unlock(&buf->raw_tick_nohz_spinlock);
                break;
        case TICK_NOHZ_STOP:
                /* Stop both timers under this cpu's ring_buffer_nohz_lock. */
                spin_lock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
                lib_ring_buffer_stop_switch_timer(buf);
                lib_ring_buffer_stop_read_timer(buf);
                spin_unlock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
                break;
        case TICK_NOHZ_RESTART:
                /* Restart both timers under this cpu's ring_buffer_nohz_lock. */
                spin_lock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
                lib_ring_buffer_start_read_timer(buf);
                lib_ring_buffer_start_switch_timer(buf);
                spin_unlock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
                break;
        }

        return 0;
}
 638
 639 void notrace lib_ring_buffer_tick_nohz_flush(void)
 640 {
 641         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_FLUSH,
 642                                    NULL);
 643 }
 644
 645 void notrace lib_ring_buffer_tick_nohz_stop(void)
 646 {
 647         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_STOP,
 648                                    NULL);
 649 }
 650
 651 void notrace lib_ring_buffer_tick_nohz_restart(void)
 652 {
 653         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_RESTART,
 654                                    NULL);
 655 }
 656 #endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */
 657
/*
 * Undo the channel's notifier registrations and stop its timers: iterator
 * notifiers, then (per-cpu buffers) the nohz notifier and the cpu hotplug
 * hooks, or (global buffer) the switch and read timers directly, and finally
 * the backend notifiers.
 *
 * Holds CPU hotplug.
 */
static void channel_unregister_notifiers(struct channel *chan)
{
        const struct lib_ring_buffer_config *config = &chan->backend.config;

        channel_iterator_unregister_notifiers(chan);
        if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
#ifdef CONFIG_NO_HZ
                /*
                 * Remove the nohz notifier first, so we are certain we stop
                 * the timers.
                 */
                atomic_notifier_chain_unregister(&tick_nohz_notifier,
                                                 &chan->tick_nohz_notifier);
                /*
                 * ring_buffer_nohz_lock will not be needed below, because
                 * we just removed the notifiers, which were the only source of
                 * concurrency.
                 */
#endif /* CONFIG_NO_HZ */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
                {
                        int ret;

                        /*
                         * NOTE(review): no timer is stopped explicitly in this
                         * branch; presumably removing the "online" instance
                         * runs the offline callback on each cpu -- confirm
                         * against the cpuhp API documentation.
                         */
                        ret = cpuhp_state_remove_instance(lttng_rb_hp_online,
                                &chan->cpuhp_online.node);
                        WARN_ON(ret);
                        ret = cpuhp_state_remove_instance_nocalls(lttng_rb_hp_prepare,
                                &chan->cpuhp_prepare.node);
                        WARN_ON(ret);
                }
#else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
                {
                        int cpu;

#ifdef CONFIG_HOTPLUG_CPU
                        /*
                         * Disable hotplug handling for this channel, stop the
                         * timers of every online cpu, then unregister the
                         * hotplug notifier.
                         */
                        get_online_cpus();
                        chan->cpu_hp_enable = 0;
                        for_each_online_cpu(cpu) {
                                struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
                                                                      cpu);
                                lib_ring_buffer_stop_switch_timer(buf);
                                lib_ring_buffer_stop_read_timer(buf);
                        }
                        put_online_cpus();
                        unregister_cpu_notifier(&chan->cpu_hp_notifier);
#else
                        /* No cpu hotplug: stop the timers of every possible cpu. */
                        for_each_possible_cpu(cpu) {
                                struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
                                                                      cpu);
                                lib_ring_buffer_stop_switch_timer(buf);
                                lib_ring_buffer_stop_read_timer(buf);
                        }
#endif
                }
#endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
        } else {
                /* Global buffer: a single buffer, stop its timers directly. */
                struct lib_ring_buffer *buf = chan->backend.buf;

                lib_ring_buffer_stop_switch_timer(buf);
                lib_ring_buffer_stop_read_timer(buf);
        }
        channel_backend_unregister_notifiers(&chan->backend);
}
 724
 725 static void lib_ring_buffer_set_quiescent(struct lib_ring_buffer *buf)
 726 {
 727         if (!buf->quiescent) {
 728                 buf->quiescent = true;
 729                 _lib_ring_buffer_switch_remote(buf, SWITCH_FLUSH);
 730         }
 731 }
 732
 733 static void lib_ring_buffer_clear_quiescent(struct lib_ring_buffer *buf)
 734 {
 735         buf->quiescent = false;
 736 }
 737
 738 void lib_ring_buffer_set_quiescent_channel(struct channel *chan)
 739 {
 740         int cpu;
 741         const struct lib_ring_buffer_config *config = &chan->backend.config;
 742
 743         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 744                 get_online_cpus();
 745                 for_each_channel_cpu(cpu, chan) {
 746                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 747                                                               cpu);
 748
 749                         lib_ring_buffer_set_quiescent(buf);
 750                 }
 751                 put_online_cpus();
 752         } else {
 753                 struct lib_ring_buffer *buf = chan->backend.buf;
 754
 755                 lib_ring_buffer_set_quiescent(buf);
 756         }
 757 }
 758 EXPORT_SYMBOL_GPL(lib_ring_buffer_set_quiescent_channel);
 759
 760 void lib_ring_buffer_clear_quiescent_channel(struct channel *chan)
 761 {
 762         int cpu;
 763         const struct lib_ring_buffer_config *config = &chan->backend.config;
 764
 765         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 766                 get_online_cpus();
 767                 for_each_channel_cpu(cpu, chan) {
 768                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 769                                                               cpu);
 770
 771                         lib_ring_buffer_clear_quiescent(buf);
 772                 }
 773                 put_online_cpus();
 774         } else {
 775                 struct lib_ring_buffer *buf = chan->backend.buf;
 776
 777                 lib_ring_buffer_clear_quiescent(buf);
 778         }
 779 }
 780 EXPORT_SYMBOL_GPL(lib_ring_buffer_clear_quiescent_channel);
 781
 782 static void channel_free(struct channel *chan)
 783 {
 784         if (chan->backend.release_priv_ops) {
 785                 chan->backend.release_priv_ops(chan->backend.priv_ops);
 786         }
 787         channel_iterator_free(chan);
 788         channel_backend_free(&chan->backend);
 789         kfree(chan);
 790 }
 791
/**
 * channel_create - Create channel.
 * @config: ring buffer instance configuration
 * @name: name of the channel
 * @priv: ring buffer client private data
 * @buf_addr: pointer to the beginning of the preallocated buffer contiguous
 *            address mapping. It is used only by RING_BUFFER_STATIC
 *            configuration. It can be set to NULL for other backends.
 *            NOTE(review): this function never reads @buf_addr; confirm
 *            whether the parameter is still meaningful.
 * @subbuf_size: subbuffer size
 * @num_subbuf: number of subbuffers
 * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 *                         padding to let readers get those sub-buffers.
 *                         Used for live streaming.
 * @read_timer_interval: Time interval (in us) to wake up pending readers.
 *
 * Validates the configuration, allocates the channel, initializes its
 * backend and iterator, then registers the cpu hotplug (and, when
 * configured, tick nohz) hooks for per-cpu channels, or starts the switch
 * and read timers directly for a global buffer.
 *
 * Holds cpu hotplug.
 * Returns NULL on failure.
 */
struct channel *channel_create(const struct lib_ring_buffer_config *config,
                   const char *name, void *priv, void *buf_addr,
                   size_t subbuf_size,
                   size_t num_subbuf, unsigned int switch_timer_interval,
                   unsigned int read_timer_interval)
{
        int ret;
        struct channel *chan;

        /* Reject configurations the checker flags before allocating. */
        if (lib_ring_buffer_check_config(config, switch_timer_interval,
                                         read_timer_interval))
                return NULL;

        chan = kzalloc(sizeof(struct channel), GFP_KERNEL);
        if (!chan)
                return NULL;

        ret = channel_backend_init(&chan->backend, name, config, priv,
                                   subbuf_size, num_subbuf);
        if (ret)
                goto error;

        ret = channel_iterator_init(chan);
        if (ret)
                goto error_free_backend;

        /* Mask keeping all but the top num_subbuf_order bits of a count. */
        chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
        /* Timer intervals are received in microseconds, stored in jiffies. */
        chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval);
        chan->read_timer_interval = usecs_to_jiffies(read_timer_interval);
        kref_init(&chan->ref);
        init_waitqueue_head(&chan->read_wait);
        init_waitqueue_head(&chan->hp_wait);

        if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
                /*
                 * Kernels >= 4.10: register one instance on each of the
                 * "prepare" and "online" cpuhp states. Timers are not
                 * started explicitly in this branch.
                 * NOTE(review): presumably cpuhp_state_add_instance() runs
                 * the online callback for already-online cpus, which would
                 * start the timers - confirm against the cpuhp API.
                 */
                chan->cpuhp_prepare.component = LTTNG_RING_BUFFER_FRONTEND;
                ret = cpuhp_state_add_instance_nocalls(lttng_rb_hp_prepare,
                        &chan->cpuhp_prepare.node);
                if (ret)
                        goto cpuhp_prepare_error;

                chan->cpuhp_online.component = LTTNG_RING_BUFFER_FRONTEND;
                ret = cpuhp_state_add_instance(lttng_rb_hp_online,
                        &chan->cpuhp_online.node);
                if (ret)
                        goto cpuhp_online_error;
#else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
                {
                        int cpu;
                        /*
                         * In case of non-hotplug cpu, if the ring-buffer is allocated
                         * in early initcall, it will not be notified of secondary cpus.
                         * In that odd case, we need to allocate for all possible cpus.
                         */
#ifdef CONFIG_HOTPLUG_CPU
                        chan->cpu_hp_notifier.notifier_call =
                                        lib_ring_buffer_cpu_hp_callback;
                        chan->cpu_hp_notifier.priority = 6;
                        register_cpu_notifier(&chan->cpu_hp_notifier);

                        /*
                         * Start the timers of each online cpu, under that
                         * cpu's ring_buffer_nohz_lock, with cpu hotplug held.
                         */
                        get_online_cpus();
                        for_each_online_cpu(cpu) {
                                struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
                                                                       cpu);
                                spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu));
                                lib_ring_buffer_start_switch_timer(buf);
                                lib_ring_buffer_start_read_timer(buf);
                                spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu));
                        }
                        chan->cpu_hp_enable = 1;
                        put_online_cpus();
#else
                        /* No cpu hotplug: start timers for all possible cpus. */
                        for_each_possible_cpu(cpu) {
                                struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
                                                                      cpu);
                                spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu));
                                lib_ring_buffer_start_switch_timer(buf);
                                lib_ring_buffer_start_read_timer(buf);
                                spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu));
                        }
#endif
                }
#endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */

#if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER)
                /* Only benefit from NO_HZ idle with per-cpu buffers for now. */
                chan->tick_nohz_notifier.notifier_call =
                        ring_buffer_tick_nohz_callback;
                chan->tick_nohz_notifier.priority = ~0U;
                atomic_notifier_chain_register(&tick_nohz_notifier,
                                       &chan->tick_nohz_notifier);
#endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */

        } else {
                /* Global buffer: a single set of timers to start. */
                struct lib_ring_buffer *buf = chan->backend.buf;

                lib_ring_buffer_start_switch_timer(buf);
                lib_ring_buffer_start_read_timer(buf);
        }

        return chan;

        /*
         * Error unwinding. The labels fall through from the most to the
         * least advanced initialization stage.
         * NOTE(review): the cpuhp error paths are reached after
         * channel_iterator_init() succeeded, yet nothing here calls
         * channel_iterator_free() (which channel_free() does call) -
         * confirm whether iterator resources leak on these paths.
         */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
cpuhp_online_error:
        ret = cpuhp_state_remove_instance_nocalls(lttng_rb_hp_prepare,
                        &chan->cpuhp_prepare.node);
        WARN_ON(ret);
cpuhp_prepare_error:
#endif /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
error_free_backend:
        channel_backend_free(&chan->backend);
error:
        kfree(chan);
        return NULL;
}
EXPORT_SYMBOL_GPL(channel_create);
 926
 927 static
 928 void channel_release(struct kref *kref)
 929 {
 930         struct channel *chan = container_of(kref, struct channel, ref);
 931         channel_free(chan);
 932 }
 933
 934 /**
 935  * channel_destroy - Finalize, wait for q.s. and destroy channel.
 936  * @chan: channel to destroy
 937  *
 938  * Holds cpu hotplug.
 939  * Call "destroy" callback, finalize channels, and then decrement the
 940  * channel reference count.  Note that when readers have completed data
 941  * consumption of finalized channels, get_subbuf() will return -ENODATA.
 942  * They should release their handle at that point.  Returns the private
 943  * data pointer.
 944  */
 945 void *channel_destroy(struct channel *chan)
 946 {
 947         int cpu;
 948         const struct lib_ring_buffer_config *config = &chan->backend.config;
 949         void *priv;
 950
 951         channel_unregister_notifiers(chan);
 952
 953         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 954                 /*
 955                  * No need to hold cpu hotplug, because all notifiers have been
 956                  * unregistered.
 957                  */
 958                 for_each_channel_cpu(cpu, chan) {
 959                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 960                                                               cpu);
 961
 962                         if (config->cb.buffer_finalize)
 963                                 config->cb.buffer_finalize(buf,
 964                                                            chan->backend.priv,
 965                                                            cpu);
 966                         if (buf->backend.allocated)
 967                                 lib_ring_buffer_set_quiescent(buf);
 968                         /*
 969                          * Perform flush before writing to finalized.
 970                          */
 971                         smp_wmb();
 972                         WRITE_ONCE(buf->finalized, 1);
 973                         wake_up_interruptible(&buf->read_wait);
 974                 }
 975         } else {
 976                 struct lib_ring_buffer *buf = chan->backend.buf;
 977
 978                 if (config->cb.buffer_finalize)
 979                         config->cb.buffer_finalize(buf, chan->backend.priv, -1);
 980                 if (buf->backend.allocated)
 981                         lib_ring_buffer_set_quiescent(buf);
 982                 /*
 983                  * Perform flush before writing to finalized.
 984                  */
 985                 smp_wmb();
 986                 WRITE_ONCE(buf->finalized, 1);
 987                 wake_up_interruptible(&buf->read_wait);
 988         }
 989         WRITE_ONCE(chan->finalized, 1);
 990         wake_up_interruptible(&chan->hp_wait);
 991         wake_up_interruptible(&chan->read_wait);
 992         priv = chan->backend.priv;
 993         kref_put(&chan->ref, channel_release);
 994         return priv;
 995 }
 996 EXPORT_SYMBOL_GPL(channel_destroy);
 997
 998 struct lib_ring_buffer *channel_get_ring_buffer(
 999                                         const struct lib_ring_buffer_config *config,
1000                                         struct channel *chan, int cpu)
1001 {
1002         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL)
1003                 return chan->backend.buf;
1004         else
1005                 return per_cpu_ptr(chan->backend.buf, cpu);
1006 }
1007 EXPORT_SYMBOL_GPL(channel_get_ring_buffer);
1008
1009 int lib_ring_buffer_open_read(struct lib_ring_buffer *buf)
1010 {
1011         struct channel *chan = buf->backend.chan;
1012
1013         if (!atomic_long_add_unless(&buf->active_readers, 1, 1))
1014                 return -EBUSY;
1015         if (!lttng_kref_get(&chan->ref)) {
1016                 atomic_long_dec(&buf->active_readers);
1017                 return -EOVERFLOW;
1018         }
1019         lttng_smp_mb__after_atomic();
1020         return 0;
1021 }
1022 EXPORT_SYMBOL_GPL(lib_ring_buffer_open_read);
1023
1024 void lib_ring_buffer_release_read(struct lib_ring_buffer *buf)
1025 {
1026         struct channel *chan = buf->backend.chan;
1027
1028         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
1029         lttng_smp_mb__before_atomic();
1030         atomic_long_dec(&buf->active_readers);
1031         kref_put(&chan->ref, channel_release);
1032 }
1033 EXPORT_SYMBOL_GPL(lib_ring_buffer_release_read);
1034
/*
 * Promote compiler barrier to a smp_mb().
 * For the specific ring buffer case, this IPI call should be removed if the
 * architecture does not reorder writes.  This should eventually be provided by
 * a separate architecture-specific infrastructure.
 *
 * Used as the callback handed to smp_call_function_single() and
 * smp_call_function() by lib_ring_buffer_get_subbuf(). @info is unused.
 */
static void remote_mb(void *info)
{
        smp_mb();
}
1045
1046 /**
1047  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
1048  * @buf: ring buffer
1049  * @consumed: consumed count indicating the position where to read
1050  * @produced: produced count, indicates position when to stop reading
1051  *
1052  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
1053  * data to read at consumed position, or 0 if the get operation succeeds.
1054  * Busy-loop trying to get data if the tick_nohz sequence lock is held.
1055  */
1056
1057 int lib_ring_buffer_snapshot(struct lib_ring_buffer *buf,
1058                              unsigned long *consumed, unsigned long *produced)
1059 {
1060         struct channel *chan = buf->backend.chan;
1061         const struct lib_ring_buffer_config *config = &chan->backend.config;
1062         unsigned long consumed_cur, write_offset;
1063         int finalized;
1064
1065 retry:
1066         finalized = READ_ONCE(buf->finalized);
1067         /*
1068          * Read finalized before counters.
1069          */
1070         smp_rmb();
1071         consumed_cur = atomic_long_read(&buf->consumed);
1072         /*
1073          * No need to issue a memory barrier between consumed count read and
1074          * write offset read, because consumed count can only change
1075          * concurrently in overwrite mode, and we keep a sequence counter
1076          * identifier derived from the write offset to check we are getting
1077          * the same sub-buffer we are expecting (the sub-buffers are atomically
1078          * "tagged" upon writes, tags are checked upon read).
1079          */
1080         write_offset = v_read(config, &buf->offset);
1081
1082         /*
1083          * Check that we are not about to read the same subbuffer in
1084          * which the writer head is.
1085          */
1086         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
1087             == 0)
1088                 goto nodata;
1089
1090         *consumed = consumed_cur;
1091         *produced = subbuf_trunc(write_offset, chan);
1092
1093         return 0;
1094
1095 nodata:
1096         /*
1097          * The memory barriers __wait_event()/wake_up_interruptible() take care
1098          * of "raw_spin_is_locked" memory ordering.
1099          */
1100         if (finalized)
1101                 return -ENODATA;
1102         else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock))
1103                 goto retry;
1104         else
1105                 return -EAGAIN;
1106 }
1107 EXPORT_SYMBOL_GPL(lib_ring_buffer_snapshot);
1108
1109 /**
1110  * Performs the same function as lib_ring_buffer_snapshot(), but the positions
1111  * are saved regardless of whether the consumed and produced positions are
1112  * in the same subbuffer.
1113  * @buf: ring buffer
1114  * @consumed: consumed byte count indicating the last position read
1115  * @produced: produced byte count indicating the last position written
1116  *
1117  * This function is meant to provide information on the exact producer and
1118  * consumer positions without regard for the "snapshot" feature.
1119  */
1120 int lib_ring_buffer_snapshot_sample_positions(struct lib_ring_buffer *buf,
1121                 unsigned long *consumed, unsigned long *produced)
1122 {
1123         struct channel *chan = buf->backend.chan;
1124         const struct lib_ring_buffer_config *config = &chan->backend.config;
1125
1126         smp_rmb();
1127         *consumed = atomic_long_read(&buf->consumed);
1128         /*
1129          * No need to issue a memory barrier between consumed count read and
1130          * write offset read, because consumed count can only change
1131          * concurrently in overwrite mode, and we keep a sequence counter
1132          * identifier derived from the write offset to check we are getting
1133          * the same sub-buffer we are expecting (the sub-buffers are atomically
1134          * "tagged" upon writes, tags are checked upon read).
1135          */
1136         *produced = v_read(config, &buf->offset);
1137         return 0;
1138 }
1139
1140 /**
1141  * lib_ring_buffer_put_snapshot - move consumed counter forward
1142  *
1143  * Should only be called from consumer context.
1144  * @buf: ring buffer
1145  * @consumed_new: new consumed count value
1146  */
1147 void lib_ring_buffer_move_consumer(struct lib_ring_buffer *buf,
1148                                    unsigned long consumed_new)
1149 {
1150         struct lib_ring_buffer_backend *bufb = &buf->backend;
1151         struct channel *chan = bufb->chan;
1152         unsigned long consumed;
1153
1154         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
1155
1156         /*
1157          * Only push the consumed value forward.
1158          * If the consumed cmpxchg fails, this is because we have been pushed by
1159          * the writer in flight recorder mode.
1160          */
1161         consumed = atomic_long_read(&buf->consumed);
1162         while ((long) consumed - (long) consumed_new < 0)
1163                 consumed = atomic_long_cmpxchg(&buf->consumed, consumed,
1164                                                consumed_new);
1165         /* Wake-up the metadata producer */
1166         wake_up_interruptible(&buf->write_wait);
1167 }
1168 EXPORT_SYMBOL_GPL(lib_ring_buffer_move_consumer);
1169
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/*
 * Flush the data cache of every page of the sub-buffer currently owned by
 * the reader (buf_rsb). Only done for RING_BUFFER_MMAP output.
 *
 * Rationale: architectures with caches aliased on virtual addresses may
 * use different cache lines for the linear mapping vs the user-space
 * memory mapping. Given that the ring buffer is based on the kernel linear
 * mapping, aligning it with the user-space mapping is not straightforward,
 * and would require extra TLB entries. Therefore, simply flush the dcache
 * for the entire sub-buffer before reading it.
 */
static void lib_ring_buffer_flush_read_subbuf_dcache(
                const struct lib_ring_buffer_config *config,
                struct channel *chan,
                struct lib_ring_buffer *buf)
{
        struct lib_ring_buffer_backend *bufb = &buf->backend;
        struct lib_ring_buffer_backend_pages *rpages;
        unsigned long rsb_index, page_count, pg;

        if (config->output != RING_BUFFER_MMAP)
                return;

        rsb_index = subbuffer_id_get_index(config, bufb->buf_rsb.id);
        rpages = bufb->array[rsb_index];
        page_count = bufb->num_pages_per_subbuf;

        for (pg = 0; pg < page_count; pg++)
                flush_dcache_page(pfn_to_page(rpages->p[pg].pfn));
}
#else
/* No flush_dcache_page() on this architecture: nothing to do. */
static void lib_ring_buffer_flush_read_subbuf_dcache(
                const struct lib_ring_buffer_config *config,
                struct channel *chan,
                struct lib_ring_buffer *buf)
{
}
#endif
1210
/**
 * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
 * @buf: ring buffer
 * @consumed: consumed count indicating the position where to read
 *
 * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 * data to read at consumed position, -EBUSY if the reader already holds a
 * sub-buffer (get called twice without put), or 0 if the get operation
 * succeeds.
 * Busy-loop trying to get data while buf->raw_tick_nohz_spinlock is held.
 *
 * On success, buf->get_subbuf is set and @consumed is recorded in
 * buf->get_subbuf_consumed for the matching lib_ring_buffer_put_subbuf().
 */
int lib_ring_buffer_get_subbuf(struct lib_ring_buffer *buf,
                               unsigned long consumed)
{
        struct channel *chan = buf->backend.chan;
        const struct lib_ring_buffer_config *config = &chan->backend.config;
        unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
        int ret;
        int finalized;

        if (buf->get_subbuf) {
                /*
                 * Reader is trying to get a subbuffer twice.
                 */
                CHAN_WARN_ON(chan, 1);
                return -EBUSY;
        }
retry:
        finalized = READ_ONCE(buf->finalized);
        /*
         * Read finalized before counters.
         */
        smp_rmb();
        consumed_cur = atomic_long_read(&buf->consumed);
        consumed_idx = subbuf_index(consumed, chan);
        commit_count = v_read(config, &buf->commit_cold[consumed_idx].cc_sb);
        /*
         * Make sure we read the commit count before reading the buffer
         * data and the write offset. Correct consumed offset ordering
         * wrt commit count is ensured by the use of cmpxchg to update
         * the consumed offset.
         * smp_call_function_single can fail if the remote CPU is offline,
         * this is OK because then there is no wmb to execute there.
         * If our thread is executing on the same CPU as the one the buffer
         * belongs to, we don't have to synchronize it at all. If we are
         * migrated, the scheduler will take care of the memory barriers.
         * Normally, smp_call_function_single() should ensure program order when
         * executing the remote function, which implies that it surrounds the
         * function execution with :
         * smp_mb()
         * send IPI
         * csd_lock_wait
         *                recv IPI
         *                smp_mb()
         *                exec. function
         *                smp_mb()
         *                csd unlock
         * smp_mb()
         *
         * However, smp_call_function_single() does not seem to clearly execute
         * such barriers. It depends on spinlock semantic to provide the barrier
         * before executing the IPI and, when busy-looping, csd_lock_wait only
         * executes smp_mb() when it has to wait for the other CPU.
         *
         * I don't trust this code. Therefore, let's add the smp_mb() sequence
         * required ourself, even if duplicated. It has no performance impact
         * anyway.
         *
         * smp_mb() is needed because smp_rmb() and smp_wmb() only order read vs
         * read and write vs write. They do not ensure core synchronization. We
         * really have to ensure total order between the 3 barriers running on
         * the 2 CPUs.
         */
        if (config->ipi == RING_BUFFER_IPI_BARRIER) {
                if (config->sync == RING_BUFFER_SYNC_PER_CPU
                    && config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
                        /* Only the cpu owning the buffer needs the barrier. */
                        if (raw_smp_processor_id() != buf->backend.cpu) {
                                /* Total order with IPI handler smp_mb() */
                                smp_mb();
                                smp_call_function_single(buf->backend.cpu,
                                                         remote_mb, NULL, 1);
                                /* Total order with IPI handler smp_mb() */
                                smp_mb();
                        }
                } else {
                        /* Otherwise, run the barrier through smp_call_function(). */
                        /* Total order with IPI handler smp_mb() */
                        smp_mb();
                        smp_call_function(remote_mb, NULL, 1);
                        /* Total order with IPI handler smp_mb() */
                        smp_mb();
                }
        } else {
                /*
                 * Local rmb to match the remote wmb to read the commit count
                 * before the buffer data and the write offset.
                 */
                smp_rmb();
        }

        write_offset = v_read(config, &buf->offset);

        /*
         * Check that the buffer we are getting is after or at consumed_cur
         * position.
         */
        if ((long) subbuf_trunc(consumed, chan)
            - (long) subbuf_trunc(consumed_cur, chan) < 0)
                goto nodata;

        /*
         * Check that the subbuffer we are trying to consume has been
         * already fully committed.
         */
        if (((commit_count - chan->backend.subbuf_size)
             & chan->commit_count_mask)
            - (buf_trunc(consumed, chan)
               >> chan->backend.num_subbuf_order)
            != 0)
                goto nodata;

        /*
         * Check that we are not about to read the same subbuffer in
         * which the writer head is.
         */
        if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed, chan)
            == 0)
                goto nodata;

        /*
         * Failure to get the subbuffer causes a busy-loop retry without going
         * to a wait queue. These are caused by short-lived race windows where
         * the writer is getting access to a subbuffer we were trying to get
         * access to. Also checks that the "consumed" buffer count we are
         * looking for matches the one contained in the subbuffer id.
         */
        ret = update_read_sb_index(config, &buf->backend, &chan->backend,
                                   consumed_idx, buf_trunc_val(consumed, chan));
        if (ret)
                goto retry;
        subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);

        /* Remember what was handed out, for the matching put_subbuf. */
        buf->get_subbuf_consumed = consumed;
        buf->get_subbuf = 1;

        lib_ring_buffer_flush_read_subbuf_dcache(config, chan, buf);

        return 0;

nodata:
        /*
         * The memory barriers __wait_event()/wake_up_interruptible() take care
         * of "raw_spin_is_locked" memory ordering.
         */
        if (finalized)
                return -ENODATA;
        else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock))
                goto retry;
        else
                return -EAGAIN;
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_get_subbuf);
1370
1371 /**
1372  * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
1373  * @buf: ring buffer
1374  */
1375 void lib_ring_buffer_put_subbuf(struct lib_ring_buffer *buf)
1376 {
1377         struct lib_ring_buffer_backend *bufb = &buf->backend;
1378         struct channel *chan = bufb->chan;
1379         const struct lib_ring_buffer_config *config = &chan->backend.config;
1380         unsigned long read_sb_bindex, consumed_idx, consumed;
1381
1382         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
1383
1384         if (!buf->get_subbuf) {
1385                 /*
1386                  * Reader puts a subbuffer it did not get.
1387                  */
1388                 CHAN_WARN_ON(chan, 1);
1389                 return;
1390         }
1391         consumed = buf->get_subbuf_consumed;
1392         buf->get_subbuf = 0;
1393
1394         /*
1395          * Clear the records_unread counter. (overruns counter)
1396          * Can still be non-zero if a file reader simply grabbed the data
1397          * without using iterators.
1398          * Can be below zero if an iterator is used on a snapshot more than
1399          * once.
1400          */
1401         read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
1402         v_add(config, v_read(config,
1403                              &bufb->array[read_sb_bindex]->records_unread),
1404               &bufb->records_read);
1405         v_set(config, &bufb->array[read_sb_bindex]->records_unread, 0);
1406         CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
1407                      && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
1408         subbuffer_id_set_noref(config, &bufb->buf_rsb.id);
1409
1410         /*
1411          * Exchange the reader subbuffer with the one we put in its place in the
1412          * writer subbuffer table. Expect the original consumed count. If
1413          * update_read_sb_index fails, this is because the writer updated the
1414          * subbuffer concurrently. We should therefore keep the subbuffer we
1415          * currently have: it has become invalid to try reading this sub-buffer
1416          * consumed count value anyway.
1417          */
1418         consumed_idx = subbuf_index(consumed, chan);
1419         update_read_sb_index(config, &buf->backend, &chan->backend,
1420                              consumed_idx, buf_trunc_val(consumed, chan));
1421         /*
1422          * update_read_sb_index return value ignored. Don't exchange sub-buffer
1423          * if the writer concurrently updated it.
1424          */
1425 }
1426 EXPORT_SYMBOL_GPL(lib_ring_buffer_put_subbuf);
1427
1428 /*
1429  * cons_offset is an iterator on all subbuffer offsets between the reader
1430  * position and the writer position. (inclusive)
1431  */
1432 static
1433 void lib_ring_buffer_print_subbuffer_errors(struct lib_ring_buffer *buf,
1434                                             struct channel *chan,
1435                                             unsigned long cons_offset,
1436                                             int cpu)
1437 {
1438         const struct lib_ring_buffer_config *config = &chan->backend.config;
1439         unsigned long cons_idx, commit_count, commit_count_sb;
1440
1441         cons_idx = subbuf_index(cons_offset, chan);
1442         commit_count = v_read(config, &buf->commit_hot[cons_idx].cc);
1443         commit_count_sb = v_read(config, &buf->commit_cold[cons_idx].cc_sb);
1444
1445         if (subbuf_offset(commit_count, chan) != 0)
1446                 printk(KERN_WARNING
1447                        "ring buffer %s, cpu %d: "
1448                        "commit count in subbuffer %lu,\n"
1449                        "expecting multiples of %lu bytes\n"
1450                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
1451                        chan->backend.name, cpu, cons_idx,
1452                        chan->backend.subbuf_size,
1453                        commit_count, commit_count_sb);
1454
1455         printk(KERN_DEBUG "ring buffer: %s, cpu %d: %lu bytes committed\n",
1456                chan->backend.name, cpu, commit_count);
1457 }
1458
1459 static
1460 void lib_ring_buffer_print_buffer_errors(struct lib_ring_buffer *buf,
1461                                          struct channel *chan,
1462                                          void *priv, int cpu)
1463 {
1464         const struct lib_ring_buffer_config *config = &chan->backend.config;
1465         unsigned long write_offset, cons_offset;
1466
1467         /*
1468          * No need to order commit_count, write_offset and cons_offset reads
1469          * because we execute at teardown when no more writer nor reader
1470          * references are left.
1471          */
1472         write_offset = v_read(config, &buf->offset);
1473         cons_offset = atomic_long_read(&buf->consumed);
1474         if (write_offset != cons_offset)
1475                 printk(KERN_DEBUG
1476                        "ring buffer %s, cpu %d: "
1477                        "non-consumed data\n"
1478                        "  [ %lu bytes written, %lu bytes read ]\n",
1479                        chan->backend.name, cpu, write_offset, cons_offset);
1480
1481         for (cons_offset = atomic_long_read(&buf->consumed);
1482              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
1483                                   chan)
1484                      - cons_offset) > 0;
1485              cons_offset = subbuf_align(cons_offset, chan))
1486                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
1487                                                        cpu);
1488 }
1489
1490 static
1491 void lib_ring_buffer_print_errors(struct channel *chan,
1492                                   struct lib_ring_buffer *buf, int cpu)
1493 {
1494         const struct lib_ring_buffer_config *config = &chan->backend.config;
1495         void *priv = chan->backend.priv;
1496
1497         if (!strcmp(chan->backend.name, "relay-metadata")) {
1498                 printk(KERN_DEBUG "ring buffer %s: %lu records written, "
1499                         "%lu records overrun\n",
1500                         chan->backend.name,
1501                         v_read(config, &buf->records_count),
1502                         v_read(config, &buf->records_overrun));
1503         } else {
1504                 printk(KERN_DEBUG "ring buffer %s, cpu %d: %lu records written, "
1505                         "%lu records overrun\n",
1506                         chan->backend.name, cpu,
1507                         v_read(config, &buf->records_count),
1508                         v_read(config, &buf->records_overrun));
1509
1510                 if (v_read(config, &buf->records_lost_full)
1511                     || v_read(config, &buf->records_lost_wrap)
1512                     || v_read(config, &buf->records_lost_big))
1513                         printk(KERN_WARNING
1514                                 "ring buffer %s, cpu %d: records were lost. Caused by:\n"
1515                                 "  [ %lu buffer full, %lu nest buffer wrap-around, "
1516                                 "%lu event too big ]\n",
1517                                 chan->backend.name, cpu,
1518                                 v_read(config, &buf->records_lost_full),
1519                                 v_read(config, &buf->records_lost_wrap),
1520                                 v_read(config, &buf->records_lost_big));
1521         }
1522         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu);
1523 }
1524
1525 /*
1526  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1527  *
1528  * Only executed when the buffer is finalized, in SWITCH_FLUSH.
1529  */
1530 static
1531 void lib_ring_buffer_switch_old_start(struct lib_ring_buffer *buf,
1532                                       struct channel *chan,
1533                                       struct switch_offsets *offsets,
1534                                       u64 tsc)
1535 {
1536         const struct lib_ring_buffer_config *config = &chan->backend.config;
1537         unsigned long oldidx = subbuf_index(offsets->old, chan);
1538         unsigned long commit_count;
1539         struct commit_counters_hot *cc_hot;
1540
1541         config->cb.buffer_begin(buf, tsc, oldidx);
1542
1543         /*
1544          * Order all writes to buffer before the commit count update that will
1545          * determine that the subbuffer is full.
1546          */
1547         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1548                 /*
1549                  * Must write slot data before incrementing commit count.  This
1550                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1551                  * by get_subbuf().
1552                  */
1553                 barrier();
1554         } else
1555                 smp_wmb();
1556         cc_hot = &buf->commit_hot[oldidx];
1557         v_add(config, config->cb.subbuffer_header_size(), &cc_hot->cc);
1558         commit_count = v_read(config, &cc_hot->cc);
1559         /* Check if the written buffer has to be delivered */
1560         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1561                                       commit_count, oldidx, tsc);
1562         lib_ring_buffer_write_commit_counter(config, buf, chan,
1563                         offsets->old + config->cb.subbuffer_header_size(),
1564                         commit_count, cc_hot);
1565 }
1566
1567 /*
1568  * lib_ring_buffer_switch_old_end: switch old subbuffer
1569  *
1570  * Note : offset_old should never be 0 here. It is ok, because we never perform
1571  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1572  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1573  * subbuffer.
1574  */
1575 static
1576 void lib_ring_buffer_switch_old_end(struct lib_ring_buffer *buf,
1577                                     struct channel *chan,
1578                                     struct switch_offsets *offsets,
1579                                     u64 tsc)
1580 {
1581         const struct lib_ring_buffer_config *config = &chan->backend.config;
1582         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1583         unsigned long commit_count, padding_size, data_size;
1584         struct commit_counters_hot *cc_hot;
1585
1586         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1587         padding_size = chan->backend.subbuf_size - data_size;
1588         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size);
1589
1590         /*
1591          * Order all writes to buffer before the commit count update that will
1592          * determine that the subbuffer is full.
1593          */
1594         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1595                 /*
1596                  * Must write slot data before incrementing commit count.  This
1597                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1598                  * by get_subbuf().
1599                  */
1600                 barrier();
1601         } else
1602                 smp_wmb();
1603         cc_hot = &buf->commit_hot[oldidx];
1604         v_add(config, padding_size, &cc_hot->cc);
1605         commit_count = v_read(config, &cc_hot->cc);
1606         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1607                                       commit_count, oldidx, tsc);
1608         lib_ring_buffer_write_commit_counter(config, buf, chan,
1609                         offsets->old + padding_size, commit_count,
1610                         cc_hot);
1611 }
1612
1613 /*
1614  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1615  *
1616  * This code can be executed unordered : writers may already have written to the
1617  * sub-buffer before this code gets executed, caution.  The commit makes sure
1618  * that this code is executed before the deliver of this sub-buffer.
1619  */
1620 static
1621 void lib_ring_buffer_switch_new_start(struct lib_ring_buffer *buf,
1622                                       struct channel *chan,
1623                                       struct switch_offsets *offsets,
1624                                       u64 tsc)
1625 {
1626         const struct lib_ring_buffer_config *config = &chan->backend.config;
1627         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1628         unsigned long commit_count;
1629         struct commit_counters_hot *cc_hot;
1630
1631         config->cb.buffer_begin(buf, tsc, beginidx);
1632
1633         /*
1634          * Order all writes to buffer before the commit count update that will
1635          * determine that the subbuffer is full.
1636          */
1637         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1638                 /*
1639                  * Must write slot data before incrementing commit count.  This
1640                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1641                  * by get_subbuf().
1642                  */
1643                 barrier();
1644         } else
1645                 smp_wmb();
1646         cc_hot = &buf->commit_hot[beginidx];
1647         v_add(config, config->cb.subbuffer_header_size(), &cc_hot->cc);
1648         commit_count = v_read(config, &cc_hot->cc);
1649         /* Check if the written buffer has to be delivered */
1650         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1651                                       commit_count, beginidx, tsc);
1652         lib_ring_buffer_write_commit_counter(config, buf, chan,
1653                         offsets->begin + config->cb.subbuffer_header_size(),
1654                         commit_count, cc_hot);
1655 }
1656
1657 /*
1658  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1659  *
1660  * Calls subbuffer_set_data_size() to set the data size of the current
1661  * sub-buffer. We do not need to perform check_deliver nor commit here,
1662  * since this task will be done by the "commit" of the event for which
1663  * we are currently doing the space reservation.
1664  */
1665 static
1666 void lib_ring_buffer_switch_new_end(struct lib_ring_buffer *buf,
1667                                             struct channel *chan,
1668                                             struct switch_offsets *offsets,
1669                                             u64 tsc)
1670 {
1671         const struct lib_ring_buffer_config *config = &chan->backend.config;
1672         unsigned long endidx, data_size;
1673
1674         endidx = subbuf_index(offsets->end - 1, chan);
1675         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1676         subbuffer_set_data_size(config, &buf->backend, endidx, data_size);
1677 }
1678
1679 /*
1680  * Returns :
1681  * 0 if ok
1682  * !0 if execution must be aborted.
1683  */
1684 static
1685 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1686                                     struct lib_ring_buffer *buf,
1687                                     struct channel *chan,
1688                                     struct switch_offsets *offsets,
1689                                     u64 *tsc)
1690 {
1691         const struct lib_ring_buffer_config *config = &chan->backend.config;
1692         unsigned long off, reserve_commit_diff;
1693
1694         offsets->begin = v_read(config, &buf->offset);
1695         offsets->old = offsets->begin;
1696         offsets->switch_old_start = 0;
1697         off = subbuf_offset(offsets->begin, chan);
1698
1699         *tsc = config->cb.ring_buffer_clock_read(chan);
1700
1701         /*
1702          * Ensure we flush the header of an empty subbuffer when doing the
1703          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1704          * total data gathering duration even if there were no records saved
1705          * after the last buffer switch.
1706          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1707          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1708          * subbuffer header as appropriate.
1709          * The next record that reserves space will be responsible for
1710          * populating the following subbuffer header. We choose not to populate
1711          * the next subbuffer header here because we want to be able to use
1712          * SWITCH_ACTIVE for periodical buffer flush and CPU tick_nohz stop
1713          * buffer flush, which must guarantee that all the buffer content
1714          * (records and header timestamps) are visible to the reader. This is
1715          * required for quiescence guarantees for the fusion merge.
1716          */
1717         if (mode != SWITCH_FLUSH && !off)
1718                 return -1;      /* we do not have to switch : buffer is empty */
1719
1720         if (unlikely(off == 0)) {
1721                 unsigned long sb_index, commit_count;
1722
1723                 /*
1724                  * We are performing a SWITCH_FLUSH. At this stage, there are no
1725                  * concurrent writes into the buffer.
1726                  *
1727                  * The client does not save any header information.  Don't
1728                  * switch empty subbuffer on finalize, because it is invalid to
1729                  * deliver a completely empty subbuffer.
1730                  */
1731                 if (!config->cb.subbuffer_header_size())
1732                         return -1;
1733
1734                 /* Test new buffer integrity */
1735                 sb_index = subbuf_index(offsets->begin, chan);
1736                 commit_count = v_read(config,
1737                                 &buf->commit_cold[sb_index].cc_sb);
1738                 reserve_commit_diff =
1739                   (buf_trunc(offsets->begin, chan)
1740                    >> chan->backend.num_subbuf_order)
1741                   - (commit_count & chan->commit_count_mask);
1742                 if (likely(reserve_commit_diff == 0)) {
1743                         /* Next subbuffer not being written to. */
1744                         if (unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1745                                 subbuf_trunc(offsets->begin, chan)
1746                                  - subbuf_trunc((unsigned long)
1747                                      atomic_long_read(&buf->consumed), chan)
1748                                 >= chan->backend.buf_size)) {
1749                                 /*
1750                                  * We do not overwrite non consumed buffers
1751                                  * and we are full : don't switch.
1752                                  */
1753                                 return -1;
1754                         } else {
1755                                 /*
1756                                  * Next subbuffer not being written to, and we
1757                                  * are either in overwrite mode or the buffer is
1758                                  * not full. It's safe to write in this new
1759                                  * subbuffer.
1760                                  */
1761                         }
1762                 } else {
1763                         /*
1764                          * Next subbuffer reserve offset does not match the
1765                          * commit offset. Don't perform switch in
1766                          * producer-consumer and overwrite mode.  Caused by
1767                          * either a writer OOPS or too many nested writes over a
1768                          * reserve/commit pair.
1769                          */
1770                         return -1;
1771                 }
1772
1773                 /*
1774                  * Need to write the subbuffer start header on finalize.
1775                  */
1776                 offsets->switch_old_start = 1;
1777         }
1778         offsets->begin = subbuf_align(offsets->begin, chan);
1779         /* Note: old points to the next subbuf at offset 0 */
1780         offsets->end = offsets->begin;
1781         return 0;
1782 }
1783
1784 /*
1785  * Force a sub-buffer switch. This operation is completely reentrant : can be
1786  * called while tracing is active with absolutely no lock held.
1787  *
1788  * Note, however, that as a v_cmpxchg is used for some atomic
1789  * operations, this function must be called from the CPU which owns the buffer
1790  * for a ACTIVE flush.
1791  */
1792 void lib_ring_buffer_switch_slow(struct lib_ring_buffer *buf, enum switch_mode mode)
1793 {
1794         struct channel *chan = buf->backend.chan;
1795         const struct lib_ring_buffer_config *config = &chan->backend.config;
1796         struct switch_offsets offsets;
1797         unsigned long oldidx;
1798         u64 tsc;
1799
1800         offsets.size = 0;
1801
1802         /*
1803          * Perform retryable operations.
1804          */
1805         do {
1806                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
1807                                                     &tsc))
1808                         return; /* Switch not needed */
1809         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
1810                  != offsets.old);
1811
1812         /*
1813          * Atomically update last_tsc. This update races against concurrent
1814          * atomic updates, but the race will always cause supplementary full TSC
1815          * records, never the opposite (missing a full TSC record when it would
1816          * be needed).
1817          */
1818         save_last_tsc(config, buf, tsc);
1819
1820         /*
1821          * Push the reader if necessary
1822          */
1823         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
1824
1825         oldidx = subbuf_index(offsets.old, chan);
1826         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx);
1827
1828         /*
1829          * May need to populate header start on SWITCH_FLUSH.
1830          */
1831         if (offsets.switch_old_start) {
1832                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc);
1833                 offsets.old += config->cb.subbuffer_header_size();
1834         }
1835
1836         /*
1837          * Switch old subbuffer.
1838          */
1839         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc);
1840 }
1841 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_slow);
1842
1843 struct switch_param {
1844         struct lib_ring_buffer *buf;
1845         enum switch_mode mode;
1846 };
1847
1848 static void remote_switch(void *info)
1849 {
1850         struct switch_param *param = info;
1851         struct lib_ring_buffer *buf = param->buf;
1852
1853         lib_ring_buffer_switch_slow(buf, param->mode);
1854 }
1855
1856 static void _lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf,
1857                 enum switch_mode mode)
1858 {
1859         struct channel *chan = buf->backend.chan;
1860         const struct lib_ring_buffer_config *config = &chan->backend.config;
1861         int ret;
1862         struct switch_param param;
1863
1864         /*
1865          * With global synchronization we don't need to use the IPI scheme.
1866          */
1867         if (config->sync == RING_BUFFER_SYNC_GLOBAL) {
1868                 lib_ring_buffer_switch_slow(buf, mode);
1869                 return;
1870         }
1871
1872         /*
1873          * Disabling preemption ensures two things: first, that the
1874          * target cpu is not taken concurrently offline while we are within
1875          * smp_call_function_single(). Secondly, if it happens that the
1876          * CPU is not online, our own call to lib_ring_buffer_switch_slow()
1877          * needs to be protected from CPU hotplug handlers, which can
1878          * also perform a remote subbuffer switch.
1879          */
1880         preempt_disable();
1881         param.buf = buf;
1882         param.mode = mode;
1883         ret = smp_call_function_single(buf->backend.cpu,
1884                                  remote_switch, &param, 1);
1885         if (ret) {
1886                 /* Remote CPU is offline, do it ourself. */
1887                 lib_ring_buffer_switch_slow(buf, mode);
1888         }
1889         preempt_enable();
1890 }
1891
1892 /* Switch sub-buffer if current sub-buffer is non-empty. */
1893 void lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf)
1894 {
1895         _lib_ring_buffer_switch_remote(buf, SWITCH_ACTIVE);
1896 }
1897 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_remote);
1898
1899 /* Switch sub-buffer even if current sub-buffer is empty. */
1900 void lib_ring_buffer_switch_remote_empty(struct lib_ring_buffer *buf)
1901 {
1902         _lib_ring_buffer_switch_remote(buf, SWITCH_FLUSH);
1903 }
1904 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_remote_empty);
1905
1906 /*
1907  * Returns :
1908  * 0 if ok
1909  * -ENOSPC if event size is too large for packet.
1910  * -ENOBUFS if there is currently not enough space in buffer for the event.
1911  * -EIO if data cannot be written into the buffer for any other reason.
1912  */
1913 static
1914 int lib_ring_buffer_try_reserve_slow(struct lib_ring_buffer *buf,
1915                                      struct channel *chan,
1916                                      struct switch_offsets *offsets,
1917                                      struct lib_ring_buffer_ctx *ctx,
1918                                      void *client_ctx)
1919 {
1920         const struct lib_ring_buffer_config *config = &chan->backend.config;
1921         unsigned long reserve_commit_diff, offset_cmp;
1922
1923 retry:
1924         offsets->begin = offset_cmp = v_read(config, &buf->offset);
1925         offsets->old = offsets->begin;
1926         offsets->switch_new_start = 0;
1927         offsets->switch_new_end = 0;
1928         offsets->switch_old_end = 0;
1929         offsets->pre_header_padding = 0;
1930
1931         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
1932         if ((int64_t) ctx->tsc == -EIO)
1933                 return -EIO;
1934
1935         if (last_tsc_overflow(config, buf, ctx->tsc))
1936                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
1937
1938         if (unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
1939                 offsets->switch_new_start = 1;          /* For offsets->begin */
1940         } else {
1941                 offsets->size = config->cb.record_header_size(config, chan,
1942                                                 offsets->begin,
1943                                                 &offsets->pre_header_padding,
1944                                                 ctx, client_ctx);
1945                 offsets->size +=
1946                         lib_ring_buffer_align(offsets->begin + offsets->size,
1947                                               ctx->largest_align)
1948                         + ctx->data_size;
1949                 if (unlikely(subbuf_offset(offsets->begin, chan) +
1950                              offsets->size > chan->backend.subbuf_size)) {
1951                         offsets->switch_old_end = 1;    /* For offsets->old */
1952                         offsets->switch_new_start = 1;  /* For offsets->begin */
1953                 }
1954         }
1955         if (unlikely(offsets->switch_new_start)) {
1956                 unsigned long sb_index, commit_count;
1957
1958                 /*
1959                  * We are typically not filling the previous buffer completely.
1960                  */
1961                 if (likely(offsets->switch_old_end))
1962                         offsets->begin = subbuf_align(offsets->begin, chan);
1963                 offsets->begin = offsets->begin
1964                                  + config->cb.subbuffer_header_size();
1965                 /* Test new buffer integrity */
1966                 sb_index = subbuf_index(offsets->begin, chan);
1967                 /*
1968                  * Read buf->offset before buf->commit_cold[sb_index].cc_sb.
1969                  * lib_ring_buffer_check_deliver() has the matching
1970                  * memory barriers required around commit_cold cc_sb
1971                  * updates to ensure reserve and commit counter updates
1972                  * are not seen reordered when updated by another CPU.
1973                  */
1974                 smp_rmb();
1975                 commit_count = v_read(config,
1976                                 &buf->commit_cold[sb_index].cc_sb);
1977                 /* Read buf->commit_cold[sb_index].cc_sb before buf->offset. */
1978                 smp_rmb();
1979                 if (unlikely(offset_cmp != v_read(config, &buf->offset))) {
1980                         /*
1981                          * The reserve counter have been concurrently updated
1982                          * while we read the commit counter. This means the
1983                          * commit counter we read might not match buf->offset
1984                          * due to concurrent update. We therefore need to retry.
1985                          */
1986                         goto retry;
1987                 }
1988                 reserve_commit_diff =
1989                   (buf_trunc(offsets->begin, chan)
1990                    >> chan->backend.num_subbuf_order)
1991                   - (commit_count & chan->commit_count_mask);
1992                 if (likely(reserve_commit_diff == 0)) {
1993                         /* Next subbuffer not being written to. */
1994                         if (unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1995                                 subbuf_trunc(offsets->begin, chan)
1996                                  - subbuf_trunc((unsigned long)
1997                                      atomic_long_read(&buf->consumed), chan)
1998                                 >= chan->backend.buf_size)) {
1999                                 /*
2000                                  * We do not overwrite non consumed buffers
2001                                  * and we are full : record is lost.
2002                                  */
2003                                 v_inc(config, &buf->records_lost_full);
2004                                 return -ENOBUFS;
2005                         } else {
2006                                 /*
2007                                  * Next subbuffer not being written to, and we
2008                                  * are either in overwrite mode or the buffer is
2009                                  * not full. It's safe to write in this new
2010                                  * subbuffer.
2011                                  */
2012                         }
2013                 } else {
2014                         /*
2015                          * Next subbuffer reserve offset does not match the
2016                          * commit offset, and this did not involve update to the
2017                          * reserve counter. Drop record in producer-consumer and
2018                          * overwrite mode.  Caused by either a writer OOPS or
2019                          * too many nested writes over a reserve/commit pair.
2020                          */
2021                         v_inc(config, &buf->records_lost_wrap);
2022                         return -EIO;
2023                 }
2024                 offsets->size =
2025                         config->cb.record_header_size(config, chan,
2026                                                 offsets->begin,
2027                                                 &offsets->pre_header_padding,
2028                                                 ctx, client_ctx);
2029                 offsets->size +=
2030                         lib_ring_buffer_align(offsets->begin + offsets->size,
2031                                               ctx->largest_align)
2032                         + ctx->data_size;
2033                 if (unlikely(subbuf_offset(offsets->begin, chan)
2034                              + offsets->size > chan->backend.subbuf_size)) {
2035                         /*
2036                          * Record too big for subbuffers, report error, don't
2037                          * complete the sub-buffer switch.
2038                          */
2039                         v_inc(config, &buf->records_lost_big);
2040                         return -ENOSPC;
2041                 } else {
2042                         /*
2043                          * We just made a successful buffer switch and the
2044                          * record fits in the new subbuffer. Let's write.
2045                          */
2046                 }
2047         } else {
2048                 /*
2049                  * Record fits in the current buffer and we are not on a switch
2050                  * boundary. It's safe to write.
2051                  */
2052         }
2053         offsets->end = offsets->begin + offsets->size;
2054
2055         if (unlikely(subbuf_offset(offsets->end, chan) == 0)) {
2056                 /*
2057                  * The offset_end will fall at the very beginning of the next
2058                  * subbuffer.
2059                  */
2060                 offsets->switch_new_end = 1;    /* For offsets->begin */
2061         }
2062         return 0;
2063 }
2064
2065 static struct lib_ring_buffer *get_current_buf(struct channel *chan, int cpu)
2066 {
2067         const struct lib_ring_buffer_config *config = &chan->backend.config;
2068
2069         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
2070                 return per_cpu_ptr(chan->backend.buf, cpu);
2071         else
2072                 return chan->backend.buf;
2073 }
2074
2075 void lib_ring_buffer_lost_event_too_big(struct channel *chan)
2076 {
2077         const struct lib_ring_buffer_config *config = &chan->backend.config;
2078         struct lib_ring_buffer *buf = get_current_buf(chan, smp_processor_id());
2079
2080         v_inc(config, &buf->records_lost_big);
2081 }
2082 EXPORT_SYMBOL_GPL(lib_ring_buffer_lost_event_too_big);
2083
2084 /**
2085  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
2086  * @ctx: ring buffer context.
2087  *
2088  * Return : -NOBUFS if not enough space, -ENOSPC if event size too large,
2089  * -EIO for other errors, else returns 0.
2090  * It will take care of sub-buffer switching.
2091  */
2092 int lib_ring_buffer_reserve_slow(struct lib_ring_buffer_ctx *ctx,
2093                 void *client_ctx)
2094 {
2095         struct channel *chan = ctx->chan;
2096         const struct lib_ring_buffer_config *config = &chan->backend.config;
2097         struct lib_ring_buffer *buf;
2098         struct switch_offsets offsets;
2099         int ret;
2100
2101         ctx->buf = buf = get_current_buf(chan, ctx->cpu);
2102         offsets.size = 0;
2103
2104         do {
2105                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
2106                                                        ctx, client_ctx);
2107                 if (unlikely(ret))
2108                         return ret;
2109         } while (unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
2110                                     offsets.end)
2111                           != offsets.old));
2112
2113         /*
2114          * Atomically update last_tsc. This update races against concurrent
2115          * atomic updates, but the race will always cause supplementary full TSC
2116          * records, never the opposite (missing a full TSC record when it would
2117          * be needed).
2118          */
2119         save_last_tsc(config, buf, ctx->tsc);
2120
2121         /*
2122          * Push the reader if necessary
2123          */
2124         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
2125
2126         /*
2127          * Clear noref flag for this subbuffer.
2128          */
2129         lib_ring_buffer_clear_noref(config, &buf->backend,
2130                                     subbuf_index(offsets.end - 1, chan));
2131
2132         /*
2133          * Switch old subbuffer if needed.
2134          */
2135         if (unlikely(offsets.switch_old_end)) {
2136                 lib_ring_buffer_clear_noref(config, &buf->backend,
2137                                             subbuf_index(offsets.old - 1, chan));
2138                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc);
2139         }
2140
2141         /*
2142          * Populate new subbuffer.
2143          */
2144         if (unlikely(offsets.switch_new_start))
2145                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc);
2146
2147         if (unlikely(offsets.switch_new_end))
2148                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc);
2149
2150         ctx->slot_size = offsets.size;
2151         ctx->pre_offset = offsets.begin;
2152         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
2153         return 0;
2154 }
2155 EXPORT_SYMBOL_GPL(lib_ring_buffer_reserve_slow);
2156
2157 static
2158 void lib_ring_buffer_vmcore_check_deliver(const struct lib_ring_buffer_config *config,
2159                                           struct lib_ring_buffer *buf,
2160                                           unsigned long commit_count,
2161                                           unsigned long idx)
2162 {
2163         if (config->oops == RING_BUFFER_OOPS_CONSISTENCY)
2164                 v_set(config, &buf->commit_hot[idx].seq, commit_count);
2165 }
2166
/*
 * deliver_count_events: account the records of sub-buffer @idx.
 *
 * The ring buffer can count events recorded and overwritten per buffer,
 * but this is disabled by default due to its performance overhead. Unless
 * LTTNG_RING_BUFFER_COUNT_EVENTS is defined, this function is a no-op.
 */
static
void deliver_count_events(const struct lib_ring_buffer_config *config,
		struct lib_ring_buffer *buf,
		unsigned long idx)
{
#ifdef LTTNG_RING_BUFFER_COUNT_EVENTS
	/* Add the sub-buffer's record count to the buffer total. */
	v_add(config,
	      subbuffer_get_records_count(config, &buf->backend, idx),
	      &buf->records_count);
	/* Add the sub-buffer's overrun count to the buffer total. */
	v_add(config,
	      subbuffer_count_records_overrun(config, &buf->backend, idx),
	      &buf->records_overrun);
#endif /* LTTNG_RING_BUFFER_COUNT_EVENTS */
}
2192
2193
/*
 * lib_ring_buffer_check_deliver_slow - try to deliver sub-buffer @idx.
 *
 * @config: ring buffer configuration; passed to the v_*() accessors and
 *          helpers, and provides the buffer_end callback and the wakeup
 *          setting read here.
 * @buf: buffer whose commit_cold[idx].cc_sb counter is updated.
 * @chan: channel; supplies backend.subbuf_size and the channel-wide
 *        read_wait queue.
 * @offset: buffer offset; its buf_trunc_val() is stored together with the
 *          noref flag of the sub-buffer.
 * @commit_count: value published into commit_cold[idx].cc_sb on delivery.
 * @idx: index of the sub-buffer to deliver.
 * @tsc: timestamp forwarded to the buffer_end callback.
 *
 * Delivery is performed only if the cmpxchg of cc_sb from
 * (commit_count - subbuf_size) to that value plus one succeeds. If the
 * cmpxchg fails, nothing further is done.
 */
2194 void lib_ring_buffer_check_deliver_slow(const struct lib_ring_buffer_config *config,
2195                                    struct lib_ring_buffer *buf,
2196                                    struct channel *chan,
2197                                    unsigned long offset,
2198                                    unsigned long commit_count,
2199                                    unsigned long idx,
2200                                    u64 tsc)
2201 {
             /*
              * Value cc_sb is expected to hold before this delivery: exactly
              * one sub-buffer size below the commit count being delivered.
              */
2202         unsigned long old_commit_count = commit_count
2203                                          - chan->backend.subbuf_size;
2204
2205         /*
2206          * If we succeeded at updating cc_sb below, we are the subbuffer
2207          * writer delivering the subbuffer. Deals with concurrent
2208          * updates of the "cc" value without adding a add_return atomic
2209          * operation to the fast path.
2210          *
2211          * We are doing the delivery in two steps:
2212          * - First, we cmpxchg() cc_sb to the new value
2213          *   old_commit_count + 1. This ensures that we are the only
2214          *   subbuffer user successfully filling the subbuffer, but we
2215          *   do _not_ set the cc_sb value to "commit_count" yet.
2216          *   Therefore, other writers that would wrap around the ring
2217          *   buffer and try to start writing to our subbuffer would
2218          *   have to drop records, because it would appear as
2219          *   non-filled.
2220          *   We therefore have exclusive access to the subbuffer control
2221          *   structures.  This mutual exclusion with other writers is
2222          *   crucially important to perform record overruns count in
2223          *   flight recorder mode locklessly.
2224          * - When we are ready to release the subbuffer (either for
2225          *   reading or for overrun by other writers), we simply set the
2226          *   cc_sb value to "commit_count" and perform delivery.
2227          *
2228          * The subbuffer size is at least 2 bytes (minimum size: 1 page).
2229          * This guarantees that old_commit_count + 1 != commit_count.
2230          */
2231
2232         /*
2233          * Order prior updates to reserve count prior to the
2234          * commit_cold cc_sb update.
2235          */
2236         smp_wmb();
2237         if (likely(v_cmpxchg(config, &buf->commit_cold[idx].cc_sb,
2238                                  old_commit_count, old_commit_count + 1)
2239                    == old_commit_count)) {
2240                 /*
2241                  * Start of exclusive subbuffer access. We are
2242                  * guaranteed to be the last writer in this subbuffer
2243                  * and any other writer trying to access this subbuffer
2244                  * in this state is required to drop records.
2245                  */
2246                 deliver_count_events(config, buf, idx);
2247                 config->cb.buffer_end(buf, tsc, idx,
2248                                       lib_ring_buffer_get_data_size(config,
2249                                                                 buf,
2250                                                                 idx));
2251
2252                 /*
2253                  * Increment the packet counter while we have exclusive
2254                  * access.
2255                  */
2256                 subbuffer_inc_packet_count(config, &buf->backend, idx);
2257
2258                 /*
2259                  * Set noref flag and offset for this subbuffer id.
2260                  * Contains a memory barrier that ensures counter stores
2261                  * are ordered before set noref and offset.
2262                  */
2263                 lib_ring_buffer_set_noref_offset(config, &buf->backend, idx,
2264                                                  buf_trunc_val(offset, chan));
2265
2266                 /*
2267                  * Order set_noref and record counter updates before the
2268                  * end of subbuffer exclusive access. Orders with
2269                  * respect to writers coming into the subbuffer after
2270                  * wrap around, and also order wrt concurrent readers.
2271                  */
2272                 smp_mb();
2273                 /* End of exclusive subbuffer access */
2274                 v_set(config, &buf->commit_cold[idx].cc_sb,
2275                       commit_count);
2276                 /*
2277                  * Order later updates to reserve count after
2278                  * the commit_cold cc_sb update.
2279                  */
2280                 smp_wmb();
2281                 lib_ring_buffer_vmcore_check_deliver(config, buf,
2282                                                  commit_count, idx);
2283
2284                 /*
2285                  * RING_BUFFER_WAKEUP_BY_WRITER wakeup is not lock-free.
2286                  */
2287                 if (config->wakeup == RING_BUFFER_WAKEUP_BY_WRITER
2288                     && atomic_long_read(&buf->active_readers)
2289                     && lib_ring_buffer_poll_deliver(config, buf, chan)) {
2290                         wake_up_interruptible(&buf->read_wait);
2291                         wake_up_interruptible(&chan->read_wait);
2292                 }
2293
2294         }
2295 }
2296 EXPORT_SYMBOL_GPL(lib_ring_buffer_check_deliver_slow);
2297
2298 int __init init_lib_ring_buffer_frontend(void)
2299 {
2300         int cpu;
2301
2302         for_each_possible_cpu(cpu)
2303                 spin_lock_init(&per_cpu(ring_buffer_nohz_lock, cpu));
2304         return 0;
2305 }
2306
2307 module_init(init_lib_ring_buffer_frontend);
2308
2309 void __exit exit_lib_ring_buffer_frontend(void)
2310 {
2311 }
2312
2313 module_exit(exit_lib_ring_buffer_frontend);