lib/ringbuffer/ring_buffer_frontend.c

   1 /* SPDX-License-Identifier: (GPL-2.0-only OR LGPL-2.1-only)
   2  *
   3  * ring_buffer_frontend.c
   4  *
   5  * Copyright (C) 2005-2012 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   6  *
   7  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
   8  * recorder (overwrite) modes. See thesis:
   9  *
  10  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  11  * dissertation, Ecole Polytechnique de Montreal.
  12  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  13  *
  14  * - Algorithm presentation in Chapter 5:
  15  *     "Lockless Multi-Core High-Throughput Buffering".
  16  * - Algorithm formal verification in Section 8.6:
  17  *     "Formal verification of LTTng"
  18  *
  19  * Author:
  20  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  21  *
  22  * Inspired from LTT and RelayFS:
  23  *  Karim Yaghmour <karim@opersys.com>
  24  *  Tom Zanussi <zanussi@us.ibm.com>
  25  *  Bob Wisniewski <bob@watson.ibm.com>
  26  * And from K42 :
  27  *  Bob Wisniewski <bob@watson.ibm.com>
  28  *
  29  * Buffer reader semantic :
  30  *
  31  * - get_subbuf_size
  32  * while buffer is not finalized and empty
  33  *   - get_subbuf
  34  *     - if return value != 0, continue
  35  *   - splice one subbuffer worth of data to a pipe
  36  *   - splice the data from pipe to disk/network
  37  *   - put_subbuf
  38  */
  39
  40 #include <linux/delay.h>
  41 #include <linux/module.h>
  42 #include <linux/percpu.h>
  43 #include <linux/kref.h>
  44 #include <linux/percpu-defs.h>
  45 #include <linux/timer.h>
  46 #include <asm/cacheflush.h>
  47
  48 #include <wrapper/ringbuffer/config.h>
  49 #include <wrapper/ringbuffer/backend.h>
  50 #include <wrapper/ringbuffer/frontend.h>
  51 #include <wrapper/ringbuffer/iterator.h>
  52 #include <wrapper/ringbuffer/nohz.h>
  53
  54 /*
  55  * Internal structure representing offsets to use at a sub-buffer switch.
  56  */
  57 struct switch_offsets {
  58         unsigned long begin, end, old;
  59         size_t pre_header_padding, size;
  60         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
  61                      switch_old_end:1;
  62 };
  63
  64 #ifdef CONFIG_NO_HZ
  65 enum tick_nohz_val {
  66         TICK_NOHZ_STOP,
  67         TICK_NOHZ_FLUSH,
  68         TICK_NOHZ_RESTART,
  69 };
  70
  71 static ATOMIC_NOTIFIER_HEAD(tick_nohz_notifier);
  72 #endif /* CONFIG_NO_HZ */
  73
  74 static DEFINE_PER_CPU(spinlock_t, ring_buffer_nohz_lock);
  75
  76 DEFINE_PER_CPU(unsigned int, lib_ring_buffer_nesting);
  77 EXPORT_PER_CPU_SYMBOL(lib_ring_buffer_nesting);
  78
  79 static
  80 void lib_ring_buffer_print_errors(struct channel *chan,
  81                                   struct lib_ring_buffer *buf, int cpu);
  82 static
  83 void _lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf,
  84                 enum switch_mode mode);
  85
  86 static
  87 int lib_ring_buffer_poll_deliver(const struct lib_ring_buffer_config *config,
  88                                  struct lib_ring_buffer *buf,
  89                                  struct channel *chan)
  90 {
  91         unsigned long consumed_old, consumed_idx, commit_count, write_offset;
  92
  93         consumed_old = atomic_long_read(&buf->consumed);
  94         consumed_idx = subbuf_index(consumed_old, chan);
  95         commit_count = v_read(config, &buf->commit_cold[consumed_idx].cc_sb);
  96         /*
  97          * No memory barrier here, since we are only interested
  98          * in a statistically correct polling result. The next poll will
  99          * get the data is we are racing. The mb() that ensures correct
 100          * memory order is in get_subbuf.
 101          */
 102         write_offset = v_read(config, &buf->offset);
 103
 104         /*
 105          * Check that the subbuffer we are trying to consume has been
 106          * already fully committed.
 107          */
 108
 109         if (((commit_count - chan->backend.subbuf_size)
 110              & chan->commit_count_mask)
 111             - (buf_trunc(consumed_old, chan)
 112                >> chan->backend.num_subbuf_order)
 113             != 0)
 114                 return 0;
 115
 116         /*
 117          * Check that we are not about to read the same subbuffer in
 118          * which the writer head is.
 119          */
 120         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_old, chan)
 121             == 0)
 122                 return 0;
 123
 124         return 1;
 125 }
 126
 127 /*
 128  * Must be called under cpu hotplug protection.
 129  */
 130 void lib_ring_buffer_free(struct lib_ring_buffer *buf)
 131 {
 132         struct channel *chan = buf->backend.chan;
 133
 134         lib_ring_buffer_print_errors(chan, buf, buf->backend.cpu);
 135         kvfree(buf->commit_hot);
 136         kvfree(buf->commit_cold);
 137         kvfree(buf->ts_end);
 138
 139         lib_ring_buffer_backend_free(&buf->backend);
 140 }
 141
 142 /**
 143  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 144  * @buf: Ring buffer.
 145  *
 146  * Effectively empty the ring buffer. Should be called when the buffer is not
 147  * used for writing. The ring buffer can be opened for reading, but the reader
 148  * should not be using the iterator concurrently with reset. The previous
 149  * current iterator record is reset.
 150  */
 151 void lib_ring_buffer_reset(struct lib_ring_buffer *buf)
 152 {
 153         struct channel *chan = buf->backend.chan;
 154         const struct lib_ring_buffer_config *config = &chan->backend.config;
 155         unsigned int i;
 156
 157         /*
 158          * Reset iterator first. It will put the subbuffer if it currently holds
 159          * it.
 160          */
 161         lib_ring_buffer_iterator_reset(buf);
 162         v_set(config, &buf->offset, 0);
 163         for (i = 0; i < chan->backend.num_subbuf; i++) {
 164                 v_set(config, &buf->commit_hot[i].cc, 0);
 165                 v_set(config, &buf->commit_hot[i].seq, 0);
 166                 v_set(config, &buf->commit_cold[i].cc_sb, 0);
 167                 buf->ts_end[i] = 0;
 168         }
 169         atomic_long_set(&buf->consumed, 0);
 170         atomic_set(&buf->record_disabled, 0);
 171         v_set(config, &buf->last_tsc, 0);
 172         lib_ring_buffer_backend_reset(&buf->backend);
 173         /* Don't reset number of active readers */
 174         v_set(config, &buf->records_lost_full, 0);
 175         v_set(config, &buf->records_lost_wrap, 0);
 176         v_set(config, &buf->records_lost_big, 0);
 177         v_set(config, &buf->records_count, 0);
 178         v_set(config, &buf->records_overrun, 0);
 179         buf->finalized = 0;
 180 }
 181 EXPORT_SYMBOL_GPL(lib_ring_buffer_reset);
 182
 183 /**
 184  * channel_reset - Reset channel to initial values.
 185  * @chan: Channel.
 186  *
 187  * Effectively empty the channel. Should be called when the channel is not used
 188  * for writing. The channel can be opened for reading, but the reader should not
 189  * be using the iterator concurrently with reset. The previous current iterator
 190  * record is reset.
 191  */
 192 void channel_reset(struct channel *chan)
 193 {
 194         /*
 195          * Reset iterators first. Will put the subbuffer if held for reading.
 196          */
 197         channel_iterator_reset(chan);
 198         atomic_set(&chan->record_disabled, 0);
 199         /* Don't reset commit_count_mask, still valid */
 200         channel_backend_reset(&chan->backend);
 201         /* Don't reset switch/read timer interval */
 202         /* Don't reset notifiers and notifier enable bits */
 203         /* Don't reset reader reference count */
 204 }
 205 EXPORT_SYMBOL_GPL(channel_reset);
 206
 207 /*
 208  * Must be called under cpu hotplug protection.
 209  */
 210 int lib_ring_buffer_create(struct lib_ring_buffer *buf,
 211                            struct channel_backend *chanb, int cpu)
 212 {
 213         const struct lib_ring_buffer_config *config = &chanb->config;
 214         struct channel *chan = container_of(chanb, struct channel, backend);
 215         void *priv = chanb->priv;
 216         size_t subbuf_header_size;
 217         u64 tsc;
 218         int ret;
 219
 220         /* Test for cpu hotplug */
 221         if (buf->backend.allocated)
 222                 return 0;
 223
 224         /*
 225          * Paranoia: per cpu dynamic allocation is not officially documented as
 226          * zeroing the memory, so let's do it here too, just in case.
 227          */
 228         memset(buf, 0, sizeof(*buf));
 229
 230         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend, cpu);
 231         if (ret)
 232                 return ret;
 233
 234         buf->commit_hot =
 235                 kvzalloc_node(ALIGN(sizeof(*buf->commit_hot)
 236                                    * chan->backend.num_subbuf,
 237                                    1 << INTERNODE_CACHE_SHIFT),
 238                         GFP_KERNEL | __GFP_NOWARN,
 239                         cpu_to_node(max(cpu, 0)));
 240         if (!buf->commit_hot) {
 241                 ret = -ENOMEM;
 242                 goto free_chanbuf;
 243         }
 244
 245         buf->commit_cold =
 246                 kvzalloc_node(ALIGN(sizeof(*buf->commit_cold)
 247                                    * chan->backend.num_subbuf,
 248                                    1 << INTERNODE_CACHE_SHIFT),
 249                         GFP_KERNEL | __GFP_NOWARN,
 250                         cpu_to_node(max(cpu, 0)));
 251         if (!buf->commit_cold) {
 252                 ret = -ENOMEM;
 253                 goto free_commit;
 254         }
 255
 256         buf->ts_end =
 257                 kvzalloc_node(ALIGN(sizeof(*buf->ts_end)
 258                                    * chan->backend.num_subbuf,
 259                                    1 << INTERNODE_CACHE_SHIFT),
 260                         GFP_KERNEL | __GFP_NOWARN,
 261                         cpu_to_node(max(cpu, 0)));
 262         if (!buf->ts_end) {
 263                 ret = -ENOMEM;
 264                 goto free_commit_cold;
 265         }
 266
 267         init_waitqueue_head(&buf->read_wait);
 268         init_waitqueue_head(&buf->write_wait);
 269         raw_spin_lock_init(&buf->raw_tick_nohz_spinlock);
 270
 271         /*
 272          * Write the subbuffer header for first subbuffer so we know the total
 273          * duration of data gathering.
 274          */
 275         subbuf_header_size = config->cb.subbuffer_header_size();
 276         v_set(config, &buf->offset, subbuf_header_size);
 277         subbuffer_id_clear_noref(config, &buf->backend.buf_wsb[0].id);
 278         tsc = config->cb.ring_buffer_clock_read(buf->backend.chan);
 279         config->cb.buffer_begin(buf, tsc, 0);
 280         v_add(config, subbuf_header_size, &buf->commit_hot[0].cc);
 281
 282         if (config->cb.buffer_create) {
 283                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name);
 284                 if (ret)
 285                         goto free_init;
 286         }
 287
 288         /*
 289          * Ensure the buffer is ready before setting it to allocated and setting
 290          * the cpumask.
 291          * Used for cpu hotplug vs cpumask iteration.
 292          */
 293         smp_wmb();
 294         buf->backend.allocated = 1;
 295
 296         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 297                 CHAN_WARN_ON(chan, cpumask_test_cpu(cpu,
 298                              chan->backend.cpumask));
 299                 cpumask_set_cpu(cpu, chan->backend.cpumask);
 300         }
 301         return 0;
 302
 303         /* Error handling */
 304 free_init:
 305         kvfree(buf->ts_end);
 306 free_commit_cold:
 307         kvfree(buf->commit_cold);
 308 free_commit:
 309         kvfree(buf->commit_hot);
 310 free_chanbuf:
 311         lib_ring_buffer_backend_free(&buf->backend);
 312         return ret;
 313 }
 314
 315 static void switch_buffer_timer(struct timer_list *t)
 316 {
 317         struct lib_ring_buffer *buf = from_timer(buf, t, switch_timer);
 318         struct channel *chan = buf->backend.chan;
 319
 320         /*
 321          * Only flush buffers periodically if readers are active.
 322          */
 323         if (atomic_long_read(&buf->active_readers))
 324                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 325
 326         mod_timer(&buf->switch_timer,
 327                   jiffies + chan->switch_timer_interval);
 328 }
 329
 330 /*
 331  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 332  */
 333 static void lib_ring_buffer_start_switch_timer(struct lib_ring_buffer *buf)
 334 {
 335         struct channel *chan = buf->backend.chan;
 336         const struct lib_ring_buffer_config *config = &chan->backend.config;
 337         unsigned int flags = 0;
 338
 339         if (!chan->switch_timer_interval || buf->switch_timer_enabled)
 340                 return;
 341
 342         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 343                 flags = TIMER_PINNED;
 344
 345         timer_setup(&buf->switch_timer, switch_buffer_timer, flags);
 346         buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
 347
 348         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 349                 add_timer_on(&buf->switch_timer, buf->backend.cpu);
 350         else
 351                 add_timer(&buf->switch_timer);
 352
 353         buf->switch_timer_enabled = 1;
 354 }
 355
 356 /*
 357  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 358  */
 359 static void lib_ring_buffer_stop_switch_timer(struct lib_ring_buffer *buf)
 360 {
 361         struct channel *chan = buf->backend.chan;
 362
 363         if (!chan->switch_timer_interval || !buf->switch_timer_enabled)
 364                 return;
 365
 366         del_timer_sync(&buf->switch_timer);
 367         buf->switch_timer_enabled = 0;
 368 }
 369
 370 /*
 371  * Polling timer to check the channels for data.
 372  */
 373 static void read_buffer_timer(struct timer_list *t)
 374 {
 375         struct lib_ring_buffer *buf = from_timer(buf, t, read_timer);
 376         struct channel *chan = buf->backend.chan;
 377         const struct lib_ring_buffer_config *config = &chan->backend.config;
 378
 379         CHAN_WARN_ON(chan, !buf->backend.allocated);
 380
 381         if (atomic_long_read(&buf->active_readers)
 382             && lib_ring_buffer_poll_deliver(config, buf, chan)) {
 383                 wake_up_interruptible(&buf->read_wait);
 384                 wake_up_interruptible(&chan->read_wait);
 385         }
 386
 387         mod_timer(&buf->read_timer,
 388                   jiffies + chan->read_timer_interval);
 389 }
 390
 391 /*
 392  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 393  */
 394 static void lib_ring_buffer_start_read_timer(struct lib_ring_buffer *buf)
 395 {
 396         struct channel *chan = buf->backend.chan;
 397         const struct lib_ring_buffer_config *config = &chan->backend.config;
 398         unsigned int flags = 0;
 399
 400         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 401             || !chan->read_timer_interval
 402             || buf->read_timer_enabled)
 403                 return;
 404
 405         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 406                 flags = TIMER_PINNED;
 407
 408         timer_setup(&buf->read_timer, read_buffer_timer, flags);
 409         buf->read_timer.expires = jiffies + chan->read_timer_interval;
 410
 411         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 412                 add_timer_on(&buf->read_timer, buf->backend.cpu);
 413         else
 414                 add_timer(&buf->read_timer);
 415
 416         buf->read_timer_enabled = 1;
 417 }
 418
 419 /*
 420  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 421  */
 422 static void lib_ring_buffer_stop_read_timer(struct lib_ring_buffer *buf)
 423 {
 424         struct channel *chan = buf->backend.chan;
 425         const struct lib_ring_buffer_config *config = &chan->backend.config;
 426
 427         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 428             || !chan->read_timer_interval
 429             || !buf->read_timer_enabled)
 430                 return;
 431
 432         del_timer_sync(&buf->read_timer);
 433         /*
 434          * do one more check to catch data that has been written in the last
 435          * timer period.
 436          */
 437         if (lib_ring_buffer_poll_deliver(config, buf, chan)) {
 438                 wake_up_interruptible(&buf->read_wait);
 439                 wake_up_interruptible(&chan->read_wait);
 440         }
 441         buf->read_timer_enabled = 0;
 442 }
 443
 444 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
 445
 446 enum cpuhp_state lttng_rb_hp_prepare;
 447 enum cpuhp_state lttng_rb_hp_online;
 448
 449 void lttng_rb_set_hp_prepare(enum cpuhp_state val)
 450 {
 451         lttng_rb_hp_prepare = val;
 452 }
 453 EXPORT_SYMBOL_GPL(lttng_rb_set_hp_prepare);
 454
 455 void lttng_rb_set_hp_online(enum cpuhp_state val)
 456 {
 457         lttng_rb_hp_online = val;
 458 }
 459 EXPORT_SYMBOL_GPL(lttng_rb_set_hp_online);
 460
 461 int lttng_cpuhp_rb_frontend_dead(unsigned int cpu,
 462                 struct lttng_cpuhp_node *node)
 463 {
 464         struct channel *chan = container_of(node, struct channel,
 465                                             cpuhp_prepare);
 466         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 467         const struct lib_ring_buffer_config *config = &chan->backend.config;
 468
 469         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 470
 471         /*
 472          * Performing a buffer switch on a remote CPU. Performed by
 473          * the CPU responsible for doing the hotunplug after the target
 474          * CPU stopped running completely. Ensures that all data
 475          * from that remote CPU is flushed.
 476          */
 477         lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 478         return 0;
 479 }
 480 EXPORT_SYMBOL_GPL(lttng_cpuhp_rb_frontend_dead);
 481
 482 int lttng_cpuhp_rb_frontend_online(unsigned int cpu,
 483                 struct lttng_cpuhp_node *node)
 484 {
 485         struct channel *chan = container_of(node, struct channel,
 486                                             cpuhp_online);
 487         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 488         const struct lib_ring_buffer_config *config = &chan->backend.config;
 489
 490         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 491
 492         wake_up_interruptible(&chan->hp_wait);
 493         lib_ring_buffer_start_switch_timer(buf);
 494         lib_ring_buffer_start_read_timer(buf);
 495         return 0;
 496 }
 497 EXPORT_SYMBOL_GPL(lttng_cpuhp_rb_frontend_online);
 498
 499 int lttng_cpuhp_rb_frontend_offline(unsigned int cpu,
 500                 struct lttng_cpuhp_node *node)
 501 {
 502         struct channel *chan = container_of(node, struct channel,
 503                                             cpuhp_online);
 504         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 505         const struct lib_ring_buffer_config *config = &chan->backend.config;
 506
 507         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 508
 509         lib_ring_buffer_stop_switch_timer(buf);
 510         lib_ring_buffer_stop_read_timer(buf);
 511         return 0;
 512 }
 513 EXPORT_SYMBOL_GPL(lttng_cpuhp_rb_frontend_offline);
 514
 515 #else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 516
 517 #ifdef CONFIG_HOTPLUG_CPU
 518
 519 /**
 520  *      lib_ring_buffer_cpu_hp_callback - CPU hotplug callback
 521  *      @nb: notifier block
 522  *      @action: hotplug action to take
 523  *      @hcpu: CPU number
 524  *
 525  *      Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
 526  */
 527 static
 528 int lib_ring_buffer_cpu_hp_callback(struct notifier_block *nb,
 529                                               unsigned long action,
 530                                               void *hcpu)
 531 {
 532         unsigned int cpu = (unsigned long)hcpu;
 533         struct channel *chan = container_of(nb, struct channel,
 534                                             cpu_hp_notifier);
 535         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 536         const struct lib_ring_buffer_config *config = &chan->backend.config;
 537
 538         if (!chan->cpu_hp_enable)
 539                 return NOTIFY_DONE;
 540
 541         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 542
 543         switch (action) {
 544         case CPU_DOWN_FAILED:
 545         case CPU_DOWN_FAILED_FROZEN:
 546         case CPU_ONLINE:
 547         case CPU_ONLINE_FROZEN:
 548                 wake_up_interruptible(&chan->hp_wait);
 549                 lib_ring_buffer_start_switch_timer(buf);
 550                 lib_ring_buffer_start_read_timer(buf);
 551                 return NOTIFY_OK;
 552
 553         case CPU_DOWN_PREPARE:
 554         case CPU_DOWN_PREPARE_FROZEN:
 555                 lib_ring_buffer_stop_switch_timer(buf);
 556                 lib_ring_buffer_stop_read_timer(buf);
 557                 return NOTIFY_OK;
 558
 559         case CPU_DEAD:
 560         case CPU_DEAD_FROZEN:
 561                 /*
 562                  * Performing a buffer switch on a remote CPU. Performed by
 563                  * the CPU responsible for doing the hotunplug after the target
 564                  * CPU stopped running completely. Ensures that all data
 565                  * from that remote CPU is flushed.
 566                  */
 567                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 568                 return NOTIFY_OK;
 569
 570         default:
 571                 return NOTIFY_DONE;
 572         }
 573 }
 574
 575 #endif
 576
 577 #endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 578
 579 #if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER)
 580 /*
 581  * For per-cpu buffers, call the reader wakeups before switching the buffer, so
 582  * that wake-up-tracing generated events are flushed before going idle (in
 583  * tick_nohz). We test if the spinlock is locked to deal with the race where
 584  * readers try to sample the ring buffer before we perform the switch. We let
 585  * the readers retry in that case. If there is data in the buffer, the wake up
 586  * is going to forbid the CPU running the reader thread from going idle.
 587  */
 588 static int notrace ring_buffer_tick_nohz_callback(struct notifier_block *nb,
 589                                                   unsigned long val,
 590                                                   void *data)
 591 {
 592         struct channel *chan = container_of(nb, struct channel,
 593                                             tick_nohz_notifier);
 594         const struct lib_ring_buffer_config *config = &chan->backend.config;
 595         struct lib_ring_buffer *buf;
 596         int cpu = smp_processor_id();
 597
 598         if (config->alloc != RING_BUFFER_ALLOC_PER_CPU) {
 599                 /*
 600                  * We don't support keeping the system idle with global buffers
 601                  * and streaming active. In order to do so, we would need to
 602                  * sample a non-nohz-cpumask racelessly with the nohz updates
 603                  * without adding synchronization overhead to nohz. Leave this
 604                  * use-case out for now.
 605                  */
 606                 return 0;
 607         }
 608
 609         buf = channel_get_ring_buffer(config, chan, cpu);
 610         switch (val) {
 611         case TICK_NOHZ_FLUSH:
 612                 raw_spin_lock(&buf->raw_tick_nohz_spinlock);
 613                 if (config->wakeup == RING_BUFFER_WAKEUP_BY_TIMER
 614                     && chan->read_timer_interval
 615                     && atomic_long_read(&buf->active_readers)
 616                     && (lib_ring_buffer_poll_deliver(config, buf, chan)
 617                         || lib_ring_buffer_pending_data(config, buf, chan))) {
 618                         wake_up_interruptible(&buf->read_wait);
 619                         wake_up_interruptible(&chan->read_wait);
 620                 }
 621                 if (chan->switch_timer_interval)
 622                         lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 623                 raw_spin_unlock(&buf->raw_tick_nohz_spinlock);
 624                 break;
 625         case TICK_NOHZ_STOP:
 626                 spin_lock(this_cpu_ptr(&ring_buffer_nohz_lock));
 627                 lib_ring_buffer_stop_switch_timer(buf);
 628                 lib_ring_buffer_stop_read_timer(buf);
 629                 spin_unlock(this_cpu_ptr(&ring_buffer_nohz_lock));
 630                 break;
 631         case TICK_NOHZ_RESTART:
 632                 spin_lock(this_cpu_ptr(&ring_buffer_nohz_lock));
 633                 lib_ring_buffer_start_read_timer(buf);
 634                 lib_ring_buffer_start_switch_timer(buf);
 635                 spin_unlock(this_cpu_ptr(&ring_buffer_nohz_lock));
 636                 break;
 637         }
 638
 639         return 0;
 640 }
 641
 642 void notrace lib_ring_buffer_tick_nohz_flush(void)
 643 {
 644         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_FLUSH,
 645                                    NULL);
 646 }
 647
 648 void notrace lib_ring_buffer_tick_nohz_stop(void)
 649 {
 650         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_STOP,
 651                                    NULL);
 652 }
 653
 654 void notrace lib_ring_buffer_tick_nohz_restart(void)
 655 {
 656         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_RESTART,
 657                                    NULL);
 658 }
 659 #endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */
 660
 661 /*
 662  * Holds CPU hotplug.
 663  */
 664 static void channel_unregister_notifiers(struct channel *chan)
 665 {
 666         const struct lib_ring_buffer_config *config = &chan->backend.config;
 667
 668         channel_iterator_unregister_notifiers(chan);
 669         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 670 #ifdef CONFIG_NO_HZ
 671                 /*
 672                  * Remove the nohz notifier first, so we are certain we stop
 673                  * the timers.
 674                  */
 675                 atomic_notifier_chain_unregister(&tick_nohz_notifier,
 676                                                  &chan->tick_nohz_notifier);
 677                 /*
 678                  * ring_buffer_nohz_lock will not be needed below, because
 679                  * we just removed the notifiers, which were the only source of
 680                  * concurrency.
 681                  */
 682 #endif /* CONFIG_NO_HZ */
 683 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
 684                 {
 685                         int ret;
 686
 687                         ret = cpuhp_state_remove_instance(lttng_rb_hp_online,
 688                                 &chan->cpuhp_online.node);
 689                         WARN_ON(ret);
 690                         ret = cpuhp_state_remove_instance_nocalls(lttng_rb_hp_prepare,
 691                                 &chan->cpuhp_prepare.node);
 692                         WARN_ON(ret);
 693                 }
 694 #else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 695                 {
 696                         int cpu;
 697
 698 #ifdef CONFIG_HOTPLUG_CPU
 699                         get_online_cpus();
 700                         chan->cpu_hp_enable = 0;
 701                         for_each_online_cpu(cpu) {
 702                                 struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 703                                                                       cpu);
 704                                 lib_ring_buffer_stop_switch_timer(buf);
 705                                 lib_ring_buffer_stop_read_timer(buf);
 706                         }
 707                         put_online_cpus();
 708                         unregister_cpu_notifier(&chan->cpu_hp_notifier);
 709 #else
 710                         for_each_possible_cpu(cpu) {
 711                                 struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 712                                                                       cpu);
 713                                 lib_ring_buffer_stop_switch_timer(buf);
 714                                 lib_ring_buffer_stop_read_timer(buf);
 715                         }
 716 #endif
 717                 }
 718 #endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 719         } else {
 720                 struct lib_ring_buffer *buf = chan->backend.buf;
 721
 722                 lib_ring_buffer_stop_switch_timer(buf);
 723                 lib_ring_buffer_stop_read_timer(buf);
 724         }
 725         channel_backend_unregister_notifiers(&chan->backend);
 726 }
 727
 728 static void lib_ring_buffer_set_quiescent(struct lib_ring_buffer *buf)
 729 {
 730         if (!buf->quiescent) {
 731                 buf->quiescent = true;
 732                 _lib_ring_buffer_switch_remote(buf, SWITCH_FLUSH);
 733         }
 734 }
 735
 736 static void lib_ring_buffer_clear_quiescent(struct lib_ring_buffer *buf)
 737 {
 738         buf->quiescent = false;
 739 }
 740
 741 void lib_ring_buffer_set_quiescent_channel(struct channel *chan)
 742 {
 743         int cpu;
 744         const struct lib_ring_buffer_config *config = &chan->backend.config;
 745
 746         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 747                 get_online_cpus();
 748                 for_each_channel_cpu(cpu, chan) {
 749                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 750                                                               cpu);
 751
 752                         lib_ring_buffer_set_quiescent(buf);
 753                 }
 754                 put_online_cpus();
 755         } else {
 756                 struct lib_ring_buffer *buf = chan->backend.buf;
 757
 758                 lib_ring_buffer_set_quiescent(buf);
 759         }
 760 }
 761 EXPORT_SYMBOL_GPL(lib_ring_buffer_set_quiescent_channel);
 762
 763 void lib_ring_buffer_clear_quiescent_channel(struct channel *chan)
 764 {
 765         int cpu;
 766         const struct lib_ring_buffer_config *config = &chan->backend.config;
 767
 768         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 769                 get_online_cpus();
 770                 for_each_channel_cpu(cpu, chan) {
 771                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 772                                                               cpu);
 773
 774                         lib_ring_buffer_clear_quiescent(buf);
 775                 }
 776                 put_online_cpus();
 777         } else {
 778                 struct lib_ring_buffer *buf = chan->backend.buf;
 779
 780                 lib_ring_buffer_clear_quiescent(buf);
 781         }
 782 }
 783 EXPORT_SYMBOL_GPL(lib_ring_buffer_clear_quiescent_channel);
 784
 785 static void channel_free(struct channel *chan)
 786 {
 787         if (chan->backend.release_priv_ops) {
 788                 chan->backend.release_priv_ops(chan->backend.priv_ops);
 789         }
 790         channel_iterator_free(chan);
 791         channel_backend_free(&chan->backend);
 792         kfree(chan);
 793 }
 794
 795 /**
 796  * channel_create - Create channel.
 797  * @config: ring buffer instance configuration
 798  * @name: name of the channel
 799  * @priv: ring buffer client private data
 800  * @buf_addr: pointer the the beginning of the preallocated buffer contiguous
 801  *            address mapping. It is used only by RING_BUFFER_STATIC
 802  *            configuration. It can be set to NULL for other backends.
 803  * @subbuf_size: subbuffer size
 804  * @num_subbuf: number of subbuffers
 805  * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 806  *                         padding to let readers get those sub-buffers.
 807  *                         Used for live streaming.
 808  * @read_timer_interval: Time interval (in us) to wake up pending readers.
 809  *
 810  * Holds cpu hotplug.
 811  * Returns NULL on failure.
 812  */
 813 struct channel *channel_create(const struct lib_ring_buffer_config *config,
 814                    const char *name, void *priv, void *buf_addr,
 815                    size_t subbuf_size,
 816                    size_t num_subbuf, unsigned int switch_timer_interval,
 817                    unsigned int read_timer_interval)
 818 {
 819         int ret;
 820         struct channel *chan;
 821
 822         if (lib_ring_buffer_check_config(config, switch_timer_interval,
 823                                          read_timer_interval))
 824                 return NULL;
 825
 826         chan = kzalloc(sizeof(struct channel), GFP_KERNEL);
 827         if (!chan)
 828                 return NULL;
 829
 830         ret = channel_backend_init(&chan->backend, name, config, priv,
 831                                    subbuf_size, num_subbuf);
 832         if (ret)
 833                 goto error;
 834
 835         ret = channel_iterator_init(chan);
 836         if (ret)
 837                 goto error_free_backend;
 838
 839         chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
 840         chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval);
 841         chan->read_timer_interval = usecs_to_jiffies(read_timer_interval);
 842         kref_init(&chan->ref);
 843         init_waitqueue_head(&chan->read_wait);
 844         init_waitqueue_head(&chan->hp_wait);
 845
 846         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 847 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
 848                 chan->cpuhp_prepare.component = LTTNG_RING_BUFFER_FRONTEND;
 849                 ret = cpuhp_state_add_instance_nocalls(lttng_rb_hp_prepare,
 850                         &chan->cpuhp_prepare.node);
 851                 if (ret)
 852                         goto cpuhp_prepare_error;
 853
 854                 chan->cpuhp_online.component = LTTNG_RING_BUFFER_FRONTEND;
 855                 ret = cpuhp_state_add_instance(lttng_rb_hp_online,
 856                         &chan->cpuhp_online.node);
 857                 if (ret)
 858                         goto cpuhp_online_error;
 859 #else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 860                 {
 861                         int cpu;
 862                         /*
 863                          * In case of non-hotplug cpu, if the ring-buffer is allocated
 864                          * in early initcall, it will not be notified of secondary cpus.
 865                          * In that off case, we need to allocate for all possible cpus.
 866                          */
 867 #ifdef CONFIG_HOTPLUG_CPU
 868                         chan->cpu_hp_notifier.notifier_call =
 869                                         lib_ring_buffer_cpu_hp_callback;
 870                         chan->cpu_hp_notifier.priority = 6;
 871                         register_cpu_notifier(&chan->cpu_hp_notifier);
 872
 873                         get_online_cpus();
 874                         for_each_online_cpu(cpu) {
 875                                 struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 876                                                                        cpu);
 877                                 spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu));
 878                                 lib_ring_buffer_start_switch_timer(buf);
 879                                 lib_ring_buffer_start_read_timer(buf);
 880                                 spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu));
 881                         }
 882                         chan->cpu_hp_enable = 1;
 883                         put_online_cpus();
 884 #else
 885                         for_each_possible_cpu(cpu) {
 886                                 struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 887                                                                       cpu);
 888                                 spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu));
 889                                 lib_ring_buffer_start_switch_timer(buf);
 890                                 lib_ring_buffer_start_read_timer(buf);
 891                                 spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu));
 892                         }
 893 #endif
 894                 }
 895 #endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 896
 897 #if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER)
 898                 /* Only benefit from NO_HZ idle with per-cpu buffers for now. */
 899                 chan->tick_nohz_notifier.notifier_call =
 900                         ring_buffer_tick_nohz_callback;
 901                 chan->tick_nohz_notifier.priority = ~0U;
 902                 atomic_notifier_chain_register(&tick_nohz_notifier,
 903                                        &chan->tick_nohz_notifier);
 904 #endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */
 905
 906         } else {
 907                 struct lib_ring_buffer *buf = chan->backend.buf;
 908
 909                 lib_ring_buffer_start_switch_timer(buf);
 910                 lib_ring_buffer_start_read_timer(buf);
 911         }
 912
 913         return chan;
 914
 915 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0))
 916 cpuhp_online_error:
 917         ret = cpuhp_state_remove_instance_nocalls(lttng_rb_hp_prepare,
 918                         &chan->cpuhp_prepare.node);
 919         WARN_ON(ret);
 920 cpuhp_prepare_error:
 921 #endif /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
 922 error_free_backend:
 923         channel_backend_free(&chan->backend);
 924 error:
 925         kfree(chan);
 926         return NULL;
 927 }
 928 EXPORT_SYMBOL_GPL(channel_create);
 929
 930 static
 931 void channel_release(struct kref *kref)
 932 {
 933         struct channel *chan = container_of(kref, struct channel, ref);
 934         channel_free(chan);
 935 }
 936
 937 /**
 938  * channel_destroy - Finalize, wait for q.s. and destroy channel.
 939  * @chan: channel to destroy
 940  *
 941  * Holds cpu hotplug.
 942  * Call "destroy" callback, finalize channels, and then decrement the
 943  * channel reference count.  Note that when readers have completed data
 944  * consumption of finalized channels, get_subbuf() will return -ENODATA.
 945  * They should release their handle at that point.  Returns the private
 946  * data pointer.
 947  */
 948 void *channel_destroy(struct channel *chan)
 949 {
 950         int cpu;
 951         const struct lib_ring_buffer_config *config = &chan->backend.config;
 952         void *priv;
 953
 954         channel_unregister_notifiers(chan);
 955
 956         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 957                 /*
 958                  * No need to hold cpu hotplug, because all notifiers have been
 959                  * unregistered.
 960                  */
 961                 for_each_channel_cpu(cpu, chan) {
 962                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 963                                                               cpu);
 964
 965                         if (config->cb.buffer_finalize)
 966                                 config->cb.buffer_finalize(buf,
 967                                                            chan->backend.priv,
 968                                                            cpu);
 969                         /*
 970                          * Perform flush before writing to finalized.
 971                          */
 972                         smp_wmb();
 973                         WRITE_ONCE(buf->finalized, 1);
 974                         wake_up_interruptible(&buf->read_wait);
 975                 }
 976         } else {
 977                 struct lib_ring_buffer *buf = chan->backend.buf;
 978
 979                 if (config->cb.buffer_finalize)
 980                         config->cb.buffer_finalize(buf, chan->backend.priv, -1);
 981                 /*
 982                  * Perform flush before writing to finalized.
 983                  */
 984                 smp_wmb();
 985                 WRITE_ONCE(buf->finalized, 1);
 986                 wake_up_interruptible(&buf->read_wait);
 987         }
 988         WRITE_ONCE(chan->finalized, 1);
 989         wake_up_interruptible(&chan->hp_wait);
 990         wake_up_interruptible(&chan->read_wait);
 991         priv = chan->backend.priv;
 992         kref_put(&chan->ref, channel_release);
 993         return priv;
 994 }
 995 EXPORT_SYMBOL_GPL(channel_destroy);
 996
 997 struct lib_ring_buffer *channel_get_ring_buffer(
 998                                         const struct lib_ring_buffer_config *config,
 999                                         struct channel *chan, int cpu)
1000 {
1001         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL)
1002                 return chan->backend.buf;
1003         else
1004                 return per_cpu_ptr(chan->backend.buf, cpu);
1005 }
1006 EXPORT_SYMBOL_GPL(channel_get_ring_buffer);
1007
1008 int lib_ring_buffer_open_read(struct lib_ring_buffer *buf)
1009 {
1010         struct channel *chan = buf->backend.chan;
1011
1012         if (!atomic_long_add_unless(&buf->active_readers, 1, 1))
1013                 return -EBUSY;
1014         kref_get(&chan->ref);
1015         smp_mb__after_atomic();
1016         return 0;
1017 }
1018 EXPORT_SYMBOL_GPL(lib_ring_buffer_open_read);
1019
1020 void lib_ring_buffer_release_read(struct lib_ring_buffer *buf)
1021 {
1022         struct channel *chan = buf->backend.chan;
1023
1024         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
1025         smp_mb__before_atomic();
1026         atomic_long_dec(&buf->active_readers);
1027         kref_put(&chan->ref, channel_release);
1028 }
1029 EXPORT_SYMBOL_GPL(lib_ring_buffer_release_read);
1030
1031 /*
1032  * Promote compiler barrier to a smp_mb().
1033  * For the specific ring buffer case, this IPI call should be removed if the
1034  * architecture does not reorder writes.  This should eventually be provided by
1035  * a separate architecture-specific infrastructure.
1036  */
1037 static void remote_mb(void *info)
1038 {
1039         smp_mb();
1040 }
1041
1042 /**
1043  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
1044  * @buf: ring buffer
1045  * @consumed: consumed count indicating the position where to read
1046  * @produced: produced count, indicates position when to stop reading
1047  *
1048  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
1049  * data to read at consumed position, or 0 if the get operation succeeds.
1050  * Busy-loop trying to get data if the tick_nohz sequence lock is held.
1051  */
1052
1053 int lib_ring_buffer_snapshot(struct lib_ring_buffer *buf,
1054                              unsigned long *consumed, unsigned long *produced)
1055 {
1056         struct channel *chan = buf->backend.chan;
1057         const struct lib_ring_buffer_config *config = &chan->backend.config;
1058         unsigned long consumed_cur, write_offset;
1059         int finalized;
1060
1061 retry:
1062         finalized = READ_ONCE(buf->finalized);
1063         /*
1064          * Read finalized before counters.
1065          */
1066         smp_rmb();
1067         consumed_cur = atomic_long_read(&buf->consumed);
1068         /*
1069          * No need to issue a memory barrier between consumed count read and
1070          * write offset read, because consumed count can only change
1071          * concurrently in overwrite mode, and we keep a sequence counter
1072          * identifier derived from the write offset to check we are getting
1073          * the same sub-buffer we are expecting (the sub-buffers are atomically
1074          * "tagged" upon writes, tags are checked upon read).
1075          */
1076         write_offset = v_read(config, &buf->offset);
1077
1078         /*
1079          * Check that we are not about to read the same subbuffer in
1080          * which the writer head is.
1081          */
1082         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
1083             == 0)
1084                 goto nodata;
1085
1086         *consumed = consumed_cur;
1087         *produced = subbuf_trunc(write_offset, chan);
1088
1089         return 0;
1090
1091 nodata:
1092         /*
1093          * The memory barriers __wait_event()/wake_up_interruptible() take care
1094          * of "raw_spin_is_locked" memory ordering.
1095          */
1096         if (finalized)
1097                 return -ENODATA;
1098         else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock))
1099                 goto retry;
1100         else
1101                 return -EAGAIN;
1102 }
1103 EXPORT_SYMBOL_GPL(lib_ring_buffer_snapshot);
1104
1105 /**
1106  * Performs the same function as lib_ring_buffer_snapshot(), but the positions
1107  * are saved regardless of whether the consumed and produced positions are
1108  * in the same subbuffer.
1109  * @buf: ring buffer
1110  * @consumed: consumed byte count indicating the last position read
1111  * @produced: produced byte count indicating the last position written
1112  *
1113  * This function is meant to provide information on the exact producer and
1114  * consumer positions without regard for the "snapshot" feature.
1115  */
1116 int lib_ring_buffer_snapshot_sample_positions(struct lib_ring_buffer *buf,
1117                 unsigned long *consumed, unsigned long *produced)
1118 {
1119         struct channel *chan = buf->backend.chan;
1120         const struct lib_ring_buffer_config *config = &chan->backend.config;
1121
1122         smp_rmb();
1123         *consumed = atomic_long_read(&buf->consumed);
1124         /*
1125          * No need to issue a memory barrier between consumed count read and
1126          * write offset read, because consumed count can only change
1127          * concurrently in overwrite mode, and we keep a sequence counter
1128          * identifier derived from the write offset to check we are getting
1129          * the same sub-buffer we are expecting (the sub-buffers are atomically
1130          * "tagged" upon writes, tags are checked upon read).
1131          */
1132         *produced = v_read(config, &buf->offset);
1133         return 0;
1134 }
1135
1136 /**
1137  * lib_ring_buffer_put_snapshot - move consumed counter forward
1138  *
1139  * Should only be called from consumer context.
1140  * @buf: ring buffer
1141  * @consumed_new: new consumed count value
1142  */
1143 void lib_ring_buffer_move_consumer(struct lib_ring_buffer *buf,
1144                                    unsigned long consumed_new)
1145 {
1146         struct lib_ring_buffer_backend *bufb = &buf->backend;
1147         struct channel *chan = bufb->chan;
1148         unsigned long consumed;
1149
1150         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
1151
1152         /*
1153          * Only push the consumed value forward.
1154          * If the consumed cmpxchg fails, this is because we have been pushed by
1155          * the writer in flight recorder mode.
1156          */
1157         consumed = atomic_long_read(&buf->consumed);
1158         while ((long) consumed - (long) consumed_new < 0)
1159                 consumed = atomic_long_cmpxchg(&buf->consumed, consumed,
1160                                                consumed_new);
1161         /* Wake-up the metadata producer */
1162         wake_up_interruptible(&buf->write_wait);
1163 }
1164 EXPORT_SYMBOL_GPL(lib_ring_buffer_move_consumer);
1165
1166 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1167 static void lib_ring_buffer_flush_read_subbuf_dcache(
1168                 const struct lib_ring_buffer_config *config,
1169                 struct channel *chan,
1170                 struct lib_ring_buffer *buf)
1171 {
1172         struct lib_ring_buffer_backend_pages *pages;
1173         unsigned long sb_bindex, id, i, nr_pages;
1174
1175         if (config->output != RING_BUFFER_MMAP)
1176                 return;
1177
1178         /*
1179          * Architectures with caches aliased on virtual addresses may
1180          * use different cache lines for the linear mapping vs
1181          * user-space memory mapping. Given that the ring buffer is
1182          * based on the kernel linear mapping, aligning it with the
1183          * user-space mapping is not straightforward, and would require
1184          * extra TLB entries. Therefore, simply flush the dcache for the
1185          * entire sub-buffer before reading it.
1186          */
1187         id = buf->backend.buf_rsb.id;
1188         sb_bindex = subbuffer_id_get_index(config, id);
1189         pages = buf->backend.array[sb_bindex];
1190         nr_pages = buf->backend.num_pages_per_subbuf;
1191         for (i = 0; i < nr_pages; i++) {
1192                 struct lib_ring_buffer_backend_page *backend_page;
1193
1194                 backend_page = &pages->p[i];
1195                 flush_dcache_page(pfn_to_page(backend_page->pfn));
1196         }
1197 }
1198 #else
1199 static void lib_ring_buffer_flush_read_subbuf_dcache(
1200                 const struct lib_ring_buffer_config *config,
1201                 struct channel *chan,
1202                 struct lib_ring_buffer *buf)
1203 {
1204 }
1205 #endif
1206
1207 /**
1208  * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
1209  * @buf: ring buffer
1210  * @consumed: consumed count indicating the position where to read
1211  *
1212  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
1213  * data to read at consumed position, or 0 if the get operation succeeds.
1214  * Busy-loop trying to get data if the tick_nohz sequence lock is held.
1215  */
1216 int lib_ring_buffer_get_subbuf(struct lib_ring_buffer *buf,
1217                                unsigned long consumed)
1218 {
1219         struct channel *chan = buf->backend.chan;
1220         const struct lib_ring_buffer_config *config = &chan->backend.config;
1221         unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
1222         int ret;
1223         int finalized;
1224
1225         if (buf->get_subbuf) {
1226                 /*
1227                  * Reader is trying to get a subbuffer twice.
1228                  */
1229                 CHAN_WARN_ON(chan, 1);
1230                 return -EBUSY;
1231         }
1232 retry:
1233         finalized = READ_ONCE(buf->finalized);
1234         /*
1235          * Read finalized before counters.
1236          */
1237         smp_rmb();
1238         consumed_cur = atomic_long_read(&buf->consumed);
1239         consumed_idx = subbuf_index(consumed, chan);
1240         commit_count = v_read(config, &buf->commit_cold[consumed_idx].cc_sb);
1241         /*
1242          * Make sure we read the commit count before reading the buffer
1243          * data and the write offset. Correct consumed offset ordering
1244          * wrt commit count is insured by the use of cmpxchg to update
1245          * the consumed offset.
1246          * smp_call_function_single can fail if the remote CPU is offline,
1247          * this is OK because then there is no wmb to execute there.
1248          * If our thread is executing on the same CPU as the on the buffers
1249          * belongs to, we don't have to synchronize it at all. If we are
1250          * migrated, the scheduler will take care of the memory barriers.
1251          * Normally, smp_call_function_single() should ensure program order when
1252          * executing the remote function, which implies that it surrounds the
1253          * function execution with :
1254          * smp_mb()
1255          * send IPI
1256          * csd_lock_wait
1257          *                recv IPI
1258          *                smp_mb()
1259          *                exec. function
1260          *                smp_mb()
1261          *                csd unlock
1262          * smp_mb()
1263          *
1264          * However, smp_call_function_single() does not seem to clearly execute
1265          * such barriers. It depends on spinlock semantic to provide the barrier
1266          * before executing the IPI and, when busy-looping, csd_lock_wait only
1267          * executes smp_mb() when it has to wait for the other CPU.
1268          *
1269          * I don't trust this code. Therefore, let's add the smp_mb() sequence
1270          * required ourself, even if duplicated. It has no performance impact
1271          * anyway.
1272          *
1273          * smp_mb() is needed because smp_rmb() and smp_wmb() only order read vs
1274          * read and write vs write. They do not ensure core synchronization. We
1275          * really have to ensure total order between the 3 barriers running on
1276          * the 2 CPUs.
1277          */
1278         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1279                 if (config->sync == RING_BUFFER_SYNC_PER_CPU
1280                     && config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
1281                         if (raw_smp_processor_id() != buf->backend.cpu) {
1282                                 /* Total order with IPI handler smp_mb() */
1283                                 smp_mb();
1284                                 smp_call_function_single(buf->backend.cpu,
1285                                                          remote_mb, NULL, 1);
1286                                 /* Total order with IPI handler smp_mb() */
1287                                 smp_mb();
1288                         }
1289                 } else {
1290                         /* Total order with IPI handler smp_mb() */
1291                         smp_mb();
1292                         smp_call_function(remote_mb, NULL, 1);
1293                         /* Total order with IPI handler smp_mb() */
1294                         smp_mb();
1295                 }
1296         } else {
1297                 /*
1298                  * Local rmb to match the remote wmb to read the commit count
1299                  * before the buffer data and the write offset.
1300                  */
1301                 smp_rmb();
1302         }
1303
1304         write_offset = v_read(config, &buf->offset);
1305
1306         /*
1307          * Check that the buffer we are getting is after or at consumed_cur
1308          * position.
1309          */
1310         if ((long) subbuf_trunc(consumed, chan)
1311             - (long) subbuf_trunc(consumed_cur, chan) < 0)
1312                 goto nodata;
1313
1314         /*
1315          * Check that the subbuffer we are trying to consume has been
1316          * already fully committed.
1317          */
1318         if (((commit_count - chan->backend.subbuf_size)
1319              & chan->commit_count_mask)
1320             - (buf_trunc(consumed, chan)
1321                >> chan->backend.num_subbuf_order)
1322             != 0)
1323                 goto nodata;
1324
1325         /*
1326          * Check that we are not about to read the same subbuffer in
1327          * which the writer head is.
1328          */
1329         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed, chan)
1330             == 0)
1331                 goto nodata;
1332
1333         /*
1334          * Failure to get the subbuffer causes a busy-loop retry without going
1335          * to a wait queue. These are caused by short-lived race windows where
1336          * the writer is getting access to a subbuffer we were trying to get
1337          * access to. Also checks that the "consumed" buffer count we are
1338          * looking for matches the one contained in the subbuffer id.
1339          */
1340         ret = update_read_sb_index(config, &buf->backend, &chan->backend,
1341                                    consumed_idx, buf_trunc_val(consumed, chan));
1342         if (ret)
1343                 goto retry;
1344         subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);
1345
1346         buf->get_subbuf_consumed = consumed;
1347         buf->get_subbuf = 1;
1348
1349         lib_ring_buffer_flush_read_subbuf_dcache(config, chan, buf);
1350
1351         return 0;
1352
1353 nodata:
1354         /*
1355          * The memory barriers __wait_event()/wake_up_interruptible() take care
1356          * of "raw_spin_is_locked" memory ordering.
1357          */
1358         if (finalized)
1359                 return -ENODATA;
1360         else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock))
1361                 goto retry;
1362         else
1363                 return -EAGAIN;
1364 }
1365 EXPORT_SYMBOL_GPL(lib_ring_buffer_get_subbuf);
1366
1367 /**
1368  * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
1369  * @buf: ring buffer
1370  */
1371 void lib_ring_buffer_put_subbuf(struct lib_ring_buffer *buf)
1372 {
1373         struct lib_ring_buffer_backend *bufb = &buf->backend;
1374         struct channel *chan = bufb->chan;
1375         const struct lib_ring_buffer_config *config = &chan->backend.config;
1376         unsigned long read_sb_bindex, consumed_idx, consumed;
1377
1378         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
1379
1380         if (!buf->get_subbuf) {
1381                 /*
1382                  * Reader puts a subbuffer it did not get.
1383                  */
1384                 CHAN_WARN_ON(chan, 1);
1385                 return;
1386         }
1387         consumed = buf->get_subbuf_consumed;
1388         buf->get_subbuf = 0;
1389
1390         /*
1391          * Clear the records_unread counter. (overruns counter)
1392          * Can still be non-zero if a file reader simply grabbed the data
1393          * without using iterators.
1394          * Can be below zero if an iterator is used on a snapshot more than
1395          * once.
1396          */
1397         read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
1398         v_add(config, v_read(config,
1399                              &bufb->array[read_sb_bindex]->records_unread),
1400               &bufb->records_read);
1401         v_set(config, &bufb->array[read_sb_bindex]->records_unread, 0);
1402         CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
1403                      && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
1404         subbuffer_id_set_noref(config, &bufb->buf_rsb.id);
1405
1406         /*
1407          * Exchange the reader subbuffer with the one we put in its place in the
1408          * writer subbuffer table. Expect the original consumed count. If
1409          * update_read_sb_index fails, this is because the writer updated the
1410          * subbuffer concurrently. We should therefore keep the subbuffer we
1411          * currently have: it has become invalid to try reading this sub-buffer
1412          * consumed count value anyway.
1413          */
1414         consumed_idx = subbuf_index(consumed, chan);
1415         update_read_sb_index(config, &buf->backend, &chan->backend,
1416                              consumed_idx, buf_trunc_val(consumed, chan));
1417         /*
1418          * update_read_sb_index return value ignored. Don't exchange sub-buffer
1419          * if the writer concurrently updated it.
1420          */
1421 }
1422 EXPORT_SYMBOL_GPL(lib_ring_buffer_put_subbuf);
1423
1424 /*
1425  * cons_offset is an iterator on all subbuffer offsets between the reader
1426  * position and the writer position. (inclusive)
1427  */
1428 static
1429 void lib_ring_buffer_print_subbuffer_errors(struct lib_ring_buffer *buf,
1430                                             struct channel *chan,
1431                                             unsigned long cons_offset,
1432                                             int cpu)
1433 {
1434         const struct lib_ring_buffer_config *config = &chan->backend.config;
1435         unsigned long cons_idx, commit_count, commit_count_sb;
1436
1437         cons_idx = subbuf_index(cons_offset, chan);
1438         commit_count = v_read(config, &buf->commit_hot[cons_idx].cc);
1439         commit_count_sb = v_read(config, &buf->commit_cold[cons_idx].cc_sb);
1440
1441         if (subbuf_offset(commit_count, chan) != 0)
1442                 printk(KERN_WARNING
1443                        "ring buffer %s, cpu %d: "
1444                        "commit count in subbuffer %lu,\n"
1445                        "expecting multiples of %lu bytes\n"
1446                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
1447                        chan->backend.name, cpu, cons_idx,
1448                        chan->backend.subbuf_size,
1449                        commit_count, commit_count_sb);
1450
1451         printk(KERN_DEBUG "ring buffer: %s, cpu %d: %lu bytes committed\n",
1452                chan->backend.name, cpu, commit_count);
1453 }
1454
1455 static
1456 void lib_ring_buffer_print_buffer_errors(struct lib_ring_buffer *buf,
1457                                          struct channel *chan,
1458                                          void *priv, int cpu)
1459 {
1460         const struct lib_ring_buffer_config *config = &chan->backend.config;
1461         unsigned long write_offset, cons_offset;
1462
1463         /*
1464          * No need to order commit_count, write_offset and cons_offset reads
1465          * because we execute at teardown when no more writer nor reader
1466          * references are left.
1467          */
1468         write_offset = v_read(config, &buf->offset);
1469         cons_offset = atomic_long_read(&buf->consumed);
1470         if (write_offset != cons_offset)
1471                 printk(KERN_DEBUG
1472                        "ring buffer %s, cpu %d: "
1473                        "non-consumed data\n"
1474                        "  [ %lu bytes written, %lu bytes read ]\n",
1475                        chan->backend.name, cpu, write_offset, cons_offset);
1476
1477         for (cons_offset = atomic_long_read(&buf->consumed);
1478              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
1479                                   chan)
1480                      - cons_offset) > 0;
1481              cons_offset = subbuf_align(cons_offset, chan))
1482                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
1483                                                        cpu);
1484 }
1485
1486 #ifdef LTTNG_RING_BUFFER_COUNT_EVENTS
1487 static
1488 void lib_ring_buffer_print_records_count(struct channel *chan,
1489                                          struct lib_ring_buffer *buf,
1490                                          int cpu)
1491 {
1492         const struct lib_ring_buffer_config *config = &chan->backend.config;
1493
1494         if (!strcmp(chan->backend.name, "relay-metadata")) {
1495                 printk(KERN_DEBUG "ring buffer %s: %lu records written, "
1496                         "%lu records overrun\n",
1497                         chan->backend.name,
1498                         v_read(config, &buf->records_count),
1499                         v_read(config, &buf->records_overrun));
1500         } else {
1501                 printk(KERN_DEBUG "ring buffer %s, cpu %d: %lu records written, "
1502                         "%lu records overrun\n",
1503                         chan->backend.name, cpu,
1504                         v_read(config, &buf->records_count),
1505                         v_read(config, &buf->records_overrun));
1506         }
1507 }
1508 #else
1509 static
1510 void lib_ring_buffer_print_records_count(struct channel *chan,
1511                                          struct lib_ring_buffer *buf,
1512                                          int cpu)
1513 {
1514 }
1515 #endif
1516
1517 static
1518 void lib_ring_buffer_print_errors(struct channel *chan,
1519                                   struct lib_ring_buffer *buf, int cpu)
1520 {
1521         const struct lib_ring_buffer_config *config = &chan->backend.config;
1522         void *priv = chan->backend.priv;
1523
1524         lib_ring_buffer_print_records_count(chan, buf, cpu);
1525         if (strcmp(chan->backend.name, "relay-metadata")) {
1526                 if (v_read(config, &buf->records_lost_full)
1527                     || v_read(config, &buf->records_lost_wrap)
1528                     || v_read(config, &buf->records_lost_big))
1529                         printk(KERN_WARNING
1530                                 "ring buffer %s, cpu %d: records were lost. Caused by:\n"
1531                                 "  [ %lu buffer full, %lu nest buffer wrap-around, "
1532                                 "%lu event too big ]\n",
1533                                 chan->backend.name, cpu,
1534                                 v_read(config, &buf->records_lost_full),
1535                                 v_read(config, &buf->records_lost_wrap),
1536                                 v_read(config, &buf->records_lost_big));
1537         }
1538         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu);
1539 }
1540
1541 /*
1542  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1543  *
1544  * Only executed when the buffer is finalized, in SWITCH_FLUSH.
1545  */
1546 static
1547 void lib_ring_buffer_switch_old_start(struct lib_ring_buffer *buf,
1548                                       struct channel *chan,
1549                                       struct switch_offsets *offsets,
1550                                       u64 tsc)
1551 {
1552         const struct lib_ring_buffer_config *config = &chan->backend.config;
1553         unsigned long oldidx = subbuf_index(offsets->old, chan);
1554         unsigned long commit_count;
1555         struct commit_counters_hot *cc_hot;
1556
1557         config->cb.buffer_begin(buf, tsc, oldidx);
1558
1559         /*
1560          * Order all writes to buffer before the commit count update that will
1561          * determine that the subbuffer is full.
1562          */
1563         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1564                 /*
1565                  * Must write slot data before incrementing commit count.  This
1566                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1567                  * by get_subbuf().
1568                  */
1569                 barrier();
1570         } else
1571                 smp_wmb();
1572         cc_hot = &buf->commit_hot[oldidx];
1573         v_add(config, config->cb.subbuffer_header_size(), &cc_hot->cc);
1574         commit_count = v_read(config, &cc_hot->cc);
1575         /* Check if the written buffer has to be delivered */
1576         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1577                                       commit_count, oldidx, tsc);
1578         lib_ring_buffer_write_commit_counter(config, buf, chan,
1579                         offsets->old + config->cb.subbuffer_header_size(),
1580                         commit_count, cc_hot);
1581 }
1582
1583 /*
1584  * lib_ring_buffer_switch_old_end: switch old subbuffer
1585  *
1586  * Note : offset_old should never be 0 here. It is ok, because we never perform
1587  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1588  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1589  * subbuffer.
1590  */
1591 static
1592 void lib_ring_buffer_switch_old_end(struct lib_ring_buffer *buf,
1593                                     struct channel *chan,
1594                                     struct switch_offsets *offsets,
1595                                     u64 tsc)
1596 {
1597         const struct lib_ring_buffer_config *config = &chan->backend.config;
1598         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1599         unsigned long commit_count, padding_size, data_size;
1600         struct commit_counters_hot *cc_hot;
1601         u64 *ts_end;
1602
1603         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1604         padding_size = chan->backend.subbuf_size - data_size;
1605         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size);
1606
1607         ts_end = &buf->ts_end[oldidx];
1608         /*
1609          * This is the last space reservation in that sub-buffer before
1610          * it gets delivered. This provides exclusive access to write to
1611          * this sub-buffer's ts_end. There are also no concurrent
1612          * readers of that ts_end because delivery of that sub-buffer is
1613          * postponed until the commit counter is incremented for the
1614          * current space reservation.
1615          */
1616         *ts_end = tsc;
1617
1618         /*
1619          * Order all writes to buffer and store to ts_end before the commit
1620          * count update that will determine that the subbuffer is full.
1621          */
1622         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1623                 /*
1624                  * Must write slot data before incrementing commit count.  This
1625                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1626                  * by get_subbuf().
1627                  */
1628                 barrier();
1629         } else
1630                 smp_wmb();
1631         cc_hot = &buf->commit_hot[oldidx];
1632         v_add(config, padding_size, &cc_hot->cc);
1633         commit_count = v_read(config, &cc_hot->cc);
1634         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1635                                       commit_count, oldidx, tsc);
1636         lib_ring_buffer_write_commit_counter(config, buf, chan,
1637                         offsets->old + padding_size, commit_count,
1638                         cc_hot);
1639 }
1640
1641 /*
1642  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1643  *
1644  * This code can be executed unordered : writers may already have written to the
1645  * sub-buffer before this code gets executed, caution.  The commit makes sure
1646  * that this code is executed before the deliver of this sub-buffer.
1647  */
1648 static
1649 void lib_ring_buffer_switch_new_start(struct lib_ring_buffer *buf,
1650                                       struct channel *chan,
1651                                       struct switch_offsets *offsets,
1652                                       u64 tsc)
1653 {
1654         const struct lib_ring_buffer_config *config = &chan->backend.config;
1655         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1656         unsigned long commit_count;
1657         struct commit_counters_hot *cc_hot;
1658
1659         config->cb.buffer_begin(buf, tsc, beginidx);
1660
1661         /*
1662          * Order all writes to buffer before the commit count update that will
1663          * determine that the subbuffer is full.
1664          */
1665         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1666                 /*
1667                  * Must write slot data before incrementing commit count.  This
1668                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1669                  * by get_subbuf().
1670                  */
1671                 barrier();
1672         } else
1673                 smp_wmb();
1674         cc_hot = &buf->commit_hot[beginidx];
1675         v_add(config, config->cb.subbuffer_header_size(), &cc_hot->cc);
1676         commit_count = v_read(config, &cc_hot->cc);
1677         /* Check if the written buffer has to be delivered */
1678         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1679                                       commit_count, beginidx, tsc);
1680         lib_ring_buffer_write_commit_counter(config, buf, chan,
1681                         offsets->begin + config->cb.subbuffer_header_size(),
1682                         commit_count, cc_hot);
1683 }
1684
1685 /*
1686  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1687  *
1688  * Calls subbuffer_set_data_size() to set the data size of the current
1689  * sub-buffer. We do not need to perform check_deliver nor commit here,
1690  * since this task will be done by the "commit" of the event for which
1691  * we are currently doing the space reservation.
1692  */
1693 static
1694 void lib_ring_buffer_switch_new_end(struct lib_ring_buffer *buf,
1695                                             struct channel *chan,
1696                                             struct switch_offsets *offsets,
1697                                             u64 tsc)
1698 {
1699         const struct lib_ring_buffer_config *config = &chan->backend.config;
1700         unsigned long endidx, data_size;
1701         u64 *ts_end;
1702
1703         endidx = subbuf_index(offsets->end - 1, chan);
1704         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1705         subbuffer_set_data_size(config, &buf->backend, endidx, data_size);
1706         ts_end = &buf->ts_end[endidx];
1707         /*
1708          * This is the last space reservation in that sub-buffer before
1709          * it gets delivered. This provides exclusive access to write to
1710          * this sub-buffer's ts_end. There are also no concurrent
1711          * readers of that ts_end because delivery of that sub-buffer is
1712          * postponed until the commit counter is incremented for the
1713          * current space reservation.
1714          */
1715         *ts_end = tsc;
1716 }
1717
1718 /*
1719  * Returns :
1720  * 0 if ok
1721  * !0 if execution must be aborted.
1722  */
1723 static
1724 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1725                                     struct lib_ring_buffer *buf,
1726                                     struct channel *chan,
1727                                     struct switch_offsets *offsets,
1728                                     u64 *tsc)
1729 {
1730         const struct lib_ring_buffer_config *config = &chan->backend.config;
1731         unsigned long off, reserve_commit_diff;
1732
1733         offsets->begin = v_read(config, &buf->offset);
1734         offsets->old = offsets->begin;
1735         offsets->switch_old_start = 0;
1736         off = subbuf_offset(offsets->begin, chan);
1737
1738         *tsc = config->cb.ring_buffer_clock_read(chan);
1739
1740         /*
1741          * Ensure we flush the header of an empty subbuffer when doing the
1742          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1743          * total data gathering duration even if there were no records saved
1744          * after the last buffer switch.
1745          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1746          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1747          * subbuffer header as appropriate.
1748          * The next record that reserves space will be responsible for
1749          * populating the following subbuffer header. We choose not to populate
1750          * the next subbuffer header here because we want to be able to use
1751          * SWITCH_ACTIVE for periodical buffer flush and CPU tick_nohz stop
1752          * buffer flush, which must guarantee that all the buffer content
1753          * (records and header timestamps) are visible to the reader. This is
1754          * required for quiescence guarantees for the fusion merge.
1755          */
1756         if (mode != SWITCH_FLUSH && !off)
1757                 return -1;      /* we do not have to switch : buffer is empty */
1758
1759         if (unlikely(off == 0)) {
1760                 unsigned long sb_index, commit_count;
1761
1762                 /*
1763                  * We are performing a SWITCH_FLUSH. At this stage, there are no
1764                  * concurrent writes into the buffer.
1765                  *
1766                  * The client does not save any header information.  Don't
1767                  * switch empty subbuffer on finalize, because it is invalid to
1768                  * deliver a completely empty subbuffer.
1769                  */
1770                 if (!config->cb.subbuffer_header_size())
1771                         return -1;
1772
1773                 /* Test new buffer integrity */
1774                 sb_index = subbuf_index(offsets->begin, chan);
1775                 commit_count = v_read(config,
1776                                 &buf->commit_cold[sb_index].cc_sb);
1777                 reserve_commit_diff =
1778                   (buf_trunc(offsets->begin, chan)
1779                    >> chan->backend.num_subbuf_order)
1780                   - (commit_count & chan->commit_count_mask);
1781                 if (likely(reserve_commit_diff == 0)) {
1782                         /* Next subbuffer not being written to. */
1783                         if (unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1784                                 subbuf_trunc(offsets->begin, chan)
1785                                  - subbuf_trunc((unsigned long)
1786                                      atomic_long_read(&buf->consumed), chan)
1787                                 >= chan->backend.buf_size)) {
1788                                 /*
1789                                  * We do not overwrite non consumed buffers
1790                                  * and we are full : don't switch.
1791                                  */
1792                                 return -1;
1793                         } else {
1794                                 /*
1795                                  * Next subbuffer not being written to, and we
1796                                  * are either in overwrite mode or the buffer is
1797                                  * not full. It's safe to write in this new
1798                                  * subbuffer.
1799                                  */
1800                         }
1801                 } else {
1802                         /*
1803                          * Next subbuffer reserve offset does not match the
1804                          * commit offset. Don't perform switch in
1805                          * producer-consumer and overwrite mode.  Caused by
1806                          * either a writer OOPS or too many nested writes over a
1807                          * reserve/commit pair.
1808                          */
1809                         return -1;
1810                 }
1811
1812                 /*
1813                  * Need to write the subbuffer start header on finalize.
1814                  */
1815                 offsets->switch_old_start = 1;
1816         }
1817         offsets->begin = subbuf_align(offsets->begin, chan);
1818         /* Note: old points to the next subbuf at offset 0 */
1819         offsets->end = offsets->begin;
1820         return 0;
1821 }
1822
1823 /*
1824  * Force a sub-buffer switch. This operation is completely reentrant : can be
1825  * called while tracing is active with absolutely no lock held.
1826  *
1827  * Note, however, that as a v_cmpxchg is used for some atomic
1828  * operations, this function must be called from the CPU which owns the buffer
1829  * for a ACTIVE flush.
1830  */
1831 void lib_ring_buffer_switch_slow(struct lib_ring_buffer *buf, enum switch_mode mode)
1832 {
1833         struct channel *chan = buf->backend.chan;
1834         const struct lib_ring_buffer_config *config = &chan->backend.config;
1835         struct switch_offsets offsets;
1836         unsigned long oldidx;
1837         u64 tsc;
1838
1839         offsets.size = 0;
1840
1841         /*
1842          * Perform retryable operations.
1843          */
1844         do {
1845                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
1846                                                     &tsc))
1847                         return; /* Switch not needed */
1848         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
1849                  != offsets.old);
1850
1851         /*
1852          * Atomically update last_tsc. This update races against concurrent
1853          * atomic updates, but the race will always cause supplementary full TSC
1854          * records, never the opposite (missing a full TSC record when it would
1855          * be needed).
1856          */
1857         save_last_tsc(config, buf, tsc);
1858
1859         /*
1860          * Push the reader if necessary
1861          */
1862         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
1863
1864         oldidx = subbuf_index(offsets.old, chan);
1865         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx);
1866
1867         /*
1868          * May need to populate header start on SWITCH_FLUSH.
1869          */
1870         if (offsets.switch_old_start) {
1871                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc);
1872                 offsets.old += config->cb.subbuffer_header_size();
1873         }
1874
1875         /*
1876          * Switch old subbuffer.
1877          */
1878         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc);
1879 }
1880 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_slow);
1881
1882 struct switch_param {
1883         struct lib_ring_buffer *buf;
1884         enum switch_mode mode;
1885 };
1886
1887 static void remote_switch(void *info)
1888 {
1889         struct switch_param *param = info;
1890         struct lib_ring_buffer *buf = param->buf;
1891
1892         lib_ring_buffer_switch_slow(buf, param->mode);
1893 }
1894
1895 static void _lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf,
1896                 enum switch_mode mode)
1897 {
1898         struct channel *chan = buf->backend.chan;
1899         const struct lib_ring_buffer_config *config = &chan->backend.config;
1900         int ret;
1901         struct switch_param param;
1902
1903         /*
1904          * With global synchronization we don't need to use the IPI scheme.
1905          */
1906         if (config->sync == RING_BUFFER_SYNC_GLOBAL) {
1907                 lib_ring_buffer_switch_slow(buf, mode);
1908                 return;
1909         }
1910
1911         /*
1912          * Disabling preemption ensures two things: first, that the
1913          * target cpu is not taken concurrently offline while we are within
1914          * smp_call_function_single(). Secondly, if it happens that the
1915          * CPU is not online, our own call to lib_ring_buffer_switch_slow()
1916          * needs to be protected from CPU hotplug handlers, which can
1917          * also perform a remote subbuffer switch.
1918          */
1919         preempt_disable();
1920         param.buf = buf;
1921         param.mode = mode;
1922         ret = smp_call_function_single(buf->backend.cpu,
1923                                  remote_switch, &param, 1);
1924         if (ret) {
1925                 /* Remote CPU is offline, do it ourself. */
1926                 lib_ring_buffer_switch_slow(buf, mode);
1927         }
1928         preempt_enable();
1929 }
1930
1931 /* Switch sub-buffer if current sub-buffer is non-empty. */
1932 void lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf)
1933 {
1934         _lib_ring_buffer_switch_remote(buf, SWITCH_ACTIVE);
1935 }
1936 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_remote);
1937
1938 /* Switch sub-buffer even if current sub-buffer is empty. */
1939 void lib_ring_buffer_switch_remote_empty(struct lib_ring_buffer *buf)
1940 {
1941         _lib_ring_buffer_switch_remote(buf, SWITCH_FLUSH);
1942 }
1943 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_remote_empty);
1944
1945 void lib_ring_buffer_clear(struct lib_ring_buffer *buf)
1946 {
1947         struct lib_ring_buffer_backend *bufb = &buf->backend;
1948         struct channel *chan = bufb->chan;
1949
1950         lib_ring_buffer_switch_remote(buf);
1951         lib_ring_buffer_clear_reader(buf, chan);
1952 }
1953 EXPORT_SYMBOL_GPL(lib_ring_buffer_clear);
1954
1955 /*
1956  * Returns :
1957  * 0 if ok
1958  * -ENOSPC if event size is too large for packet.
1959  * -ENOBUFS if there is currently not enough space in buffer for the event.
1960  * -EIO if data cannot be written into the buffer for any other reason.
1961  */
1962 static
1963 int lib_ring_buffer_try_reserve_slow(struct lib_ring_buffer *buf,
1964                                      struct channel *chan,
1965                                      struct switch_offsets *offsets,
1966                                      struct lib_ring_buffer_ctx *ctx,
1967                                      void *client_ctx)
1968 {
1969         const struct lib_ring_buffer_config *config = &chan->backend.config;
1970         unsigned long reserve_commit_diff, offset_cmp;
1971
1972 retry:
1973         offsets->begin = offset_cmp = v_read(config, &buf->offset);
1974         offsets->old = offsets->begin;
1975         offsets->switch_new_start = 0;
1976         offsets->switch_new_end = 0;
1977         offsets->switch_old_end = 0;
1978         offsets->pre_header_padding = 0;
1979
1980         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
1981         if ((int64_t) ctx->tsc == -EIO)
1982                 return -EIO;
1983
1984         if (last_tsc_overflow(config, buf, ctx->tsc))
1985                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
1986
1987         if (unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
1988                 offsets->switch_new_start = 1;          /* For offsets->begin */
1989         } else {
1990                 offsets->size = config->cb.record_header_size(config, chan,
1991                                                 offsets->begin,
1992                                                 &offsets->pre_header_padding,
1993                                                 ctx, client_ctx);
1994                 offsets->size +=
1995                         lib_ring_buffer_align(offsets->begin + offsets->size,
1996                                               ctx->largest_align)
1997                         + ctx->data_size;
1998                 if (unlikely(subbuf_offset(offsets->begin, chan) +
1999                              offsets->size > chan->backend.subbuf_size)) {
2000                         offsets->switch_old_end = 1;    /* For offsets->old */
2001                         offsets->switch_new_start = 1;  /* For offsets->begin */
2002                 }
2003         }
2004         if (unlikely(offsets->switch_new_start)) {
2005                 unsigned long sb_index, commit_count;
2006
2007                 /*
2008                  * We are typically not filling the previous buffer completely.
2009                  */
2010                 if (likely(offsets->switch_old_end))
2011                         offsets->begin = subbuf_align(offsets->begin, chan);
2012                 offsets->begin = offsets->begin
2013                                  + config->cb.subbuffer_header_size();
2014                 /* Test new buffer integrity */
2015                 sb_index = subbuf_index(offsets->begin, chan);
2016                 /*
2017                  * Read buf->offset before buf->commit_cold[sb_index].cc_sb.
2018                  * lib_ring_buffer_check_deliver() has the matching
2019                  * memory barriers required around commit_cold cc_sb
2020                  * updates to ensure reserve and commit counter updates
2021                  * are not seen reordered when updated by another CPU.
2022                  */
2023                 smp_rmb();
2024                 commit_count = v_read(config,
2025                                 &buf->commit_cold[sb_index].cc_sb);
2026                 /* Read buf->commit_cold[sb_index].cc_sb before buf->offset. */
2027                 smp_rmb();
2028                 if (unlikely(offset_cmp != v_read(config, &buf->offset))) {
2029                         /*
2030                          * The reserve counter have been concurrently updated
2031                          * while we read the commit counter. This means the
2032                          * commit counter we read might not match buf->offset
2033                          * due to concurrent update. We therefore need to retry.
2034                          */
2035                         goto retry;
2036                 }
2037                 reserve_commit_diff =
2038                   (buf_trunc(offsets->begin, chan)
2039                    >> chan->backend.num_subbuf_order)
2040                   - (commit_count & chan->commit_count_mask);
2041                 if (likely(reserve_commit_diff == 0)) {
2042                         /* Next subbuffer not being written to. */
2043                         if (unlikely(config->mode != RING_BUFFER_OVERWRITE &&
2044                                 subbuf_trunc(offsets->begin, chan)
2045                                  - subbuf_trunc((unsigned long)
2046                                      atomic_long_read(&buf->consumed), chan)
2047                                 >= chan->backend.buf_size)) {
2048                                 /*
2049                                  * We do not overwrite non consumed buffers
2050                                  * and we are full : record is lost.
2051                                  */
2052                                 v_inc(config, &buf->records_lost_full);
2053                                 return -ENOBUFS;
2054                         } else {
2055                                 /*
2056                                  * Next subbuffer not being written to, and we
2057                                  * are either in overwrite mode or the buffer is
2058                                  * not full. It's safe to write in this new
2059                                  * subbuffer.
2060                                  */
2061                         }
2062                 } else {
2063                         /*
2064                          * Next subbuffer reserve offset does not match the
2065                          * commit offset, and this did not involve update to the
2066                          * reserve counter. Drop record in producer-consumer and
2067                          * overwrite mode.  Caused by either a writer OOPS or
2068                          * too many nested writes over a reserve/commit pair.
2069                          */
2070                         v_inc(config, &buf->records_lost_wrap);
2071                         return -EIO;
2072                 }
2073                 offsets->size =
2074                         config->cb.record_header_size(config, chan,
2075                                                 offsets->begin,
2076                                                 &offsets->pre_header_padding,
2077                                                 ctx, client_ctx);
2078                 offsets->size +=
2079                         lib_ring_buffer_align(offsets->begin + offsets->size,
2080                                               ctx->largest_align)
2081                         + ctx->data_size;
2082                 if (unlikely(subbuf_offset(offsets->begin, chan)
2083                              + offsets->size > chan->backend.subbuf_size)) {
2084                         /*
2085                          * Record too big for subbuffers, report error, don't
2086                          * complete the sub-buffer switch.
2087                          */
2088                         v_inc(config, &buf->records_lost_big);
2089                         return -ENOSPC;
2090                 } else {
2091                         /*
2092                          * We just made a successful buffer switch and the
2093                          * record fits in the new subbuffer. Let's write.
2094                          */
2095                 }
2096         } else {
2097                 /*
2098                  * Record fits in the current buffer and we are not on a switch
2099                  * boundary. It's safe to write.
2100                  */
2101         }
2102         offsets->end = offsets->begin + offsets->size;
2103
2104         if (unlikely(subbuf_offset(offsets->end, chan) == 0)) {
2105                 /*
2106                  * The offset_end will fall at the very beginning of the next
2107                  * subbuffer.
2108                  */
2109                 offsets->switch_new_end = 1;    /* For offsets->begin */
2110         }
2111         return 0;
2112 }
2113
2114 static struct lib_ring_buffer *get_current_buf(struct channel *chan, int cpu)
2115 {
2116         const struct lib_ring_buffer_config *config = &chan->backend.config;
2117
2118         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
2119                 return per_cpu_ptr(chan->backend.buf, cpu);
2120         else
2121                 return chan->backend.buf;
2122 }
2123
2124 void lib_ring_buffer_lost_event_too_big(struct channel *chan)
2125 {
2126         const struct lib_ring_buffer_config *config = &chan->backend.config;
2127         struct lib_ring_buffer *buf = get_current_buf(chan, smp_processor_id());
2128
2129         v_inc(config, &buf->records_lost_big);
2130 }
2131 EXPORT_SYMBOL_GPL(lib_ring_buffer_lost_event_too_big);
2132
2133 /**
2134  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
2135  * @ctx: ring buffer context.
2136  *
2137  * Return : -NOBUFS if not enough space, -ENOSPC if event size too large,
2138  * -EIO for other errors, else returns 0.
2139  * It will take care of sub-buffer switching.
2140  */
2141 int lib_ring_buffer_reserve_slow(struct lib_ring_buffer_ctx *ctx,
2142                 void *client_ctx)
2143 {
2144         struct channel *chan = ctx->chan;
2145         const struct lib_ring_buffer_config *config = &chan->backend.config;
2146         struct lib_ring_buffer *buf;
2147         struct switch_offsets offsets;
2148         int ret;
2149
2150         ctx->buf = buf = get_current_buf(chan, ctx->cpu);
2151         offsets.size = 0;
2152
2153         do {
2154                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
2155                                                        ctx, client_ctx);
2156                 if (unlikely(ret))
2157                         return ret;
2158         } while (unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
2159                                     offsets.end)
2160                           != offsets.old));
2161
2162         /*
2163          * Atomically update last_tsc. This update races against concurrent
2164          * atomic updates, but the race will always cause supplementary full TSC
2165          * records, never the opposite (missing a full TSC record when it would
2166          * be needed).
2167          */
2168         save_last_tsc(config, buf, ctx->tsc);
2169
2170         /*
2171          * Push the reader if necessary
2172          */
2173         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
2174
2175         /*
2176          * Clear noref flag for this subbuffer.
2177          */
2178         lib_ring_buffer_clear_noref(config, &buf->backend,
2179                                     subbuf_index(offsets.end - 1, chan));
2180
2181         /*
2182          * Switch old subbuffer if needed.
2183          */
2184         if (unlikely(offsets.switch_old_end)) {
2185                 lib_ring_buffer_clear_noref(config, &buf->backend,
2186                                             subbuf_index(offsets.old - 1, chan));
2187                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc);
2188         }
2189
2190         /*
2191          * Populate new subbuffer.
2192          */
2193         if (unlikely(offsets.switch_new_start))
2194                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc);
2195
2196         if (unlikely(offsets.switch_new_end))
2197                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc);
2198
2199         ctx->slot_size = offsets.size;
2200         ctx->pre_offset = offsets.begin;
2201         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
2202         return 0;
2203 }
2204 EXPORT_SYMBOL_GPL(lib_ring_buffer_reserve_slow);
2205
2206 static
2207 void lib_ring_buffer_vmcore_check_deliver(const struct lib_ring_buffer_config *config,
2208                                           struct lib_ring_buffer *buf,
2209                                           unsigned long commit_count,
2210                                           unsigned long idx)
2211 {
2212         if (config->oops == RING_BUFFER_OOPS_CONSISTENCY)
2213                 v_set(config, &buf->commit_hot[idx].seq, commit_count);
2214 }
2215
2216 /*
2217  * The ring buffer can count events recorded and overwritten per buffer,
2218  * but it is disabled by default due to its performance overhead.
2219  */
2220 #ifdef LTTNG_RING_BUFFER_COUNT_EVENTS
2221 static
2222 void deliver_count_events(const struct lib_ring_buffer_config *config,
2223                 struct lib_ring_buffer *buf,
2224                 unsigned long idx)
2225 {
2226         v_add(config, subbuffer_get_records_count(config,
2227                         &buf->backend, idx),
2228                 &buf->records_count);
2229         v_add(config, subbuffer_count_records_overrun(config,
2230                         &buf->backend, idx),
2231                 &buf->records_overrun);
2232 }
2233 #else /* LTTNG_RING_BUFFER_COUNT_EVENTS */
2234 static
2235 void deliver_count_events(const struct lib_ring_buffer_config *config,
2236                 struct lib_ring_buffer *buf,
2237                 unsigned long idx)
2238 {
2239 }
2240 #endif /* #else LTTNG_RING_BUFFER_COUNT_EVENTS */
2241
2242
2243 void lib_ring_buffer_check_deliver_slow(const struct lib_ring_buffer_config *config,
2244                                    struct lib_ring_buffer *buf,
2245                                    struct channel *chan,
2246                                    unsigned long offset,
2247                                    unsigned long commit_count,
2248                                    unsigned long idx,
2249                                    u64 tsc)
2250 {
2251         unsigned long old_commit_count = commit_count
2252                                          - chan->backend.subbuf_size;
2253
2254         /*
2255          * If we succeeded at updating cc_sb below, we are the subbuffer
2256          * writer delivering the subbuffer. Deals with concurrent
2257          * updates of the "cc" value without adding a add_return atomic
2258          * operation to the fast path.
2259          *
2260          * We are doing the delivery in two steps:
2261          * - First, we cmpxchg() cc_sb to the new value
2262          *   old_commit_count + 1. This ensures that we are the only
2263          *   subbuffer user successfully filling the subbuffer, but we
2264          *   do _not_ set the cc_sb value to "commit_count" yet.
2265          *   Therefore, other writers that would wrap around the ring
2266          *   buffer and try to start writing to our subbuffer would
2267          *   have to drop records, because it would appear as
2268          *   non-filled.
2269          *   We therefore have exclusive access to the subbuffer control
2270          *   structures.  This mutual exclusion with other writers is
2271          *   crucially important to perform record overruns count in
2272          *   flight recorder mode locklessly.
2273          * - When we are ready to release the subbuffer (either for
2274          *   reading or for overrun by other writers), we simply set the
2275          *   cc_sb value to "commit_count" and perform delivery.
2276          *
2277          * The subbuffer size is least 2 bytes (minimum size: 1 page).
2278          * This guarantees that old_commit_count + 1 != commit_count.
2279          */
2280
2281         /*
2282          * Order prior updates to reserve count prior to the
2283          * commit_cold cc_sb update.
2284          */
2285         smp_wmb();
2286         if (likely(v_cmpxchg(config, &buf->commit_cold[idx].cc_sb,
2287                                  old_commit_count, old_commit_count + 1)
2288                    == old_commit_count)) {
2289                 u64 *ts_end;
2290
2291                 /*
2292                  * Start of exclusive subbuffer access. We are
2293                  * guaranteed to be the last writer in this subbuffer
2294                  * and any other writer trying to access this subbuffer
2295                  * in this state is required to drop records.
2296                  *
2297                  * We can read the ts_end for the current sub-buffer
2298                  * which has been saved by the very last space
2299                  * reservation for the current sub-buffer.
2300                  *
2301                  * Order increment of commit counter before reading ts_end.
2302                  */
2303                 smp_mb();
2304                 ts_end = &buf->ts_end[idx];
2305                 deliver_count_events(config, buf, idx);
2306                 config->cb.buffer_end(buf, *ts_end, idx,
2307                                       lib_ring_buffer_get_data_size(config,
2308                                                                 buf,
2309                                                                 idx));
2310
2311                 /*
2312                  * Increment the packet counter while we have exclusive
2313                  * access.
2314                  */
2315                 subbuffer_inc_packet_count(config, &buf->backend, idx);
2316
2317                 /*
2318                  * Set noref flag and offset for this subbuffer id.
2319                  * Contains a memory barrier that ensures counter stores
2320                  * are ordered before set noref and offset.
2321                  */
2322                 lib_ring_buffer_set_noref_offset(config, &buf->backend, idx,
2323                                                  buf_trunc_val(offset, chan));
2324
2325                 /*
2326                  * Order set_noref and record counter updates before the
2327                  * end of subbuffer exclusive access. Orders with
2328                  * respect to writers coming into the subbuffer after
2329                  * wrap around, and also order wrt concurrent readers.
2330                  */
2331                 smp_mb();
2332                 /* End of exclusive subbuffer access */
2333                 v_set(config, &buf->commit_cold[idx].cc_sb,
2334                       commit_count);
2335                 /*
2336                  * Order later updates to reserve count after
2337                  * the commit_cold cc_sb update.
2338                  */
2339                 smp_wmb();
2340                 lib_ring_buffer_vmcore_check_deliver(config, buf,
2341                                                  commit_count, idx);
2342
2343                 /*
2344                  * RING_BUFFER_WAKEUP_BY_WRITER wakeup is not lock-free.
2345                  */
2346                 if (config->wakeup == RING_BUFFER_WAKEUP_BY_WRITER
2347                     && atomic_long_read(&buf->active_readers)
2348                     && lib_ring_buffer_poll_deliver(config, buf, chan)) {
2349                         wake_up_interruptible(&buf->read_wait);
2350                         wake_up_interruptible(&chan->read_wait);
2351                 }
2352
2353         }
2354 }
2355 EXPORT_SYMBOL_GPL(lib_ring_buffer_check_deliver_slow);
2356
2357 int __init init_lib_ring_buffer_frontend(void)
2358 {
2359         int cpu;
2360
2361         for_each_possible_cpu(cpu)
2362                 spin_lock_init(&per_cpu(ring_buffer_nohz_lock, cpu));
2363         return 0;
2364 }
2365
2366 module_init(init_lib_ring_buffer_frontend);
2367
2368 void __exit exit_lib_ring_buffer_frontend(void)
2369 {
2370 }
2371
2372 module_exit(exit_lib_ring_buffer_frontend);