lib/ringbuffer/ring_buffer_frontend.c

   1 /*
   2  * ring_buffer_frontend.c
   3  *
   4  * Copyright (C) 2005-2012 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; only
   9  * version 2.1 of the License.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  *
  21  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
  22  * recorder (overwrite) modes. See thesis:
  23  *
  24  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  25  * dissertation, Ecole Polytechnique de Montreal.
  26  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  27  *
  28  * - Algorithm presentation in Chapter 5:
  29  *     "Lockless Multi-Core High-Throughput Buffering".
  30  * - Algorithm formal verification in Section 8.6:
  31  *     "Formal verification of LTTng"
  32  *
  33  * Author:
  34  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  35  *
  36  * Inspired from LTT and RelayFS:
  37  *  Karim Yaghmour <karim@opersys.com>
  38  *  Tom Zanussi <zanussi@us.ibm.com>
  39  *  Bob Wisniewski <bob@watson.ibm.com>
  40  * And from K42 :
  41  *  Bob Wisniewski <bob@watson.ibm.com>
  42  *
  43  * Buffer reader semantic :
  44  *
  45  * - get_subbuf_size
  46  * while buffer is not finalized and empty
  47  *   - get_subbuf
  48  *     - if return value != 0, continue
  49  *   - splice one subbuffer worth of data to a pipe
  50  *   - splice the data from pipe to disk/network
  51  *   - put_subbuf
  52  */
  53
  54 #include <linux/delay.h>
  55 #include <linux/module.h>
  56 #include <linux/percpu.h>
  57
  58 #include <wrapper/ringbuffer/config.h>
  59 #include <wrapper/ringbuffer/backend.h>
  60 #include <wrapper/ringbuffer/frontend.h>
  61 #include <wrapper/ringbuffer/iterator.h>
  62 #include <wrapper/ringbuffer/nohz.h>
  63 #include <wrapper/atomic.h>
  64 #include <wrapper/kref.h>
  65 #include <wrapper/percpu-defs.h>
  66 #include <wrapper/timer.h>
  67
  68 /*
  69  * Internal structure representing offsets to use at a sub-buffer switch.
  70  */
  71 struct switch_offsets {
  72         unsigned long begin, end, old;
  73         size_t pre_header_padding, size;
  74         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
  75                      switch_old_end:1;
  76 };
  77
  78 #ifdef CONFIG_NO_HZ
  79 enum tick_nohz_val {
  80         TICK_NOHZ_STOP,
  81         TICK_NOHZ_FLUSH,
  82         TICK_NOHZ_RESTART,
  83 };
  84
  85 static ATOMIC_NOTIFIER_HEAD(tick_nohz_notifier);
  86 #endif /* CONFIG_NO_HZ */
  87
  88 static DEFINE_PER_CPU(spinlock_t, ring_buffer_nohz_lock);
  89
  90 DEFINE_PER_CPU(unsigned int, lib_ring_buffer_nesting);
  91 EXPORT_PER_CPU_SYMBOL(lib_ring_buffer_nesting);
  92
  93 static
  94 void lib_ring_buffer_print_errors(struct channel *chan,
  95                                   struct lib_ring_buffer *buf, int cpu);
  96 static
  97 void _lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf,
  98                 enum switch_mode mode);
  99
 100 /*
 101  * Must be called under cpu hotplug protection.
 102  */
 103 void lib_ring_buffer_free(struct lib_ring_buffer *buf)
 104 {
 105         struct channel *chan = buf->backend.chan;
 106
 107         lib_ring_buffer_print_errors(chan, buf, buf->backend.cpu);
 108         kfree(buf->commit_hot);
 109         kfree(buf->commit_cold);
 110
 111         lib_ring_buffer_backend_free(&buf->backend);
 112 }
 113
 114 /**
 115  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 116  * @buf: Ring buffer.
 117  *
 118  * Effectively empty the ring buffer. Should be called when the buffer is not
 119  * used for writing. The ring buffer can be opened for reading, but the reader
 120  * should not be using the iterator concurrently with reset. The previous
 121  * current iterator record is reset.
 122  */
 123 void lib_ring_buffer_reset(struct lib_ring_buffer *buf)
 124 {
 125         struct channel *chan = buf->backend.chan;
 126         const struct lib_ring_buffer_config *config = &chan->backend.config;
 127         unsigned int i;
 128
 129         /*
 130          * Reset iterator first. It will put the subbuffer if it currently holds
 131          * it.
 132          */
 133         lib_ring_buffer_iterator_reset(buf);
 134         v_set(config, &buf->offset, 0);
 135         for (i = 0; i < chan->backend.num_subbuf; i++) {
 136                 v_set(config, &buf->commit_hot[i].cc, 0);
 137                 v_set(config, &buf->commit_hot[i].seq, 0);
 138                 v_set(config, &buf->commit_cold[i].cc_sb, 0);
 139         }
 140         atomic_long_set(&buf->consumed, 0);
 141         atomic_set(&buf->record_disabled, 0);
 142         v_set(config, &buf->last_tsc, 0);
 143         lib_ring_buffer_backend_reset(&buf->backend);
 144         /* Don't reset number of active readers */
 145         v_set(config, &buf->records_lost_full, 0);
 146         v_set(config, &buf->records_lost_wrap, 0);
 147         v_set(config, &buf->records_lost_big, 0);
 148         v_set(config, &buf->records_count, 0);
 149         v_set(config, &buf->records_overrun, 0);
 150         buf->finalized = 0;
 151 }
 152 EXPORT_SYMBOL_GPL(lib_ring_buffer_reset);
 153
 154 /**
 155  * channel_reset - Reset channel to initial values.
 156  * @chan: Channel.
 157  *
 158  * Effectively empty the channel. Should be called when the channel is not used
 159  * for writing. The channel can be opened for reading, but the reader should not
 160  * be using the iterator concurrently with reset. The previous current iterator
 161  * record is reset.
 162  */
 163 void channel_reset(struct channel *chan)
 164 {
 165         /*
 166          * Reset iterators first. Will put the subbuffer if held for reading.
 167          */
 168         channel_iterator_reset(chan);
 169         atomic_set(&chan->record_disabled, 0);
 170         /* Don't reset commit_count_mask, still valid */
 171         channel_backend_reset(&chan->backend);
 172         /* Don't reset switch/read timer interval */
 173         /* Don't reset notifiers and notifier enable bits */
 174         /* Don't reset reader reference count */
 175 }
 176 EXPORT_SYMBOL_GPL(channel_reset);
 177
 178 /*
 179  * Must be called under cpu hotplug protection.
 180  */
 181 int lib_ring_buffer_create(struct lib_ring_buffer *buf,
 182                            struct channel_backend *chanb, int cpu)
 183 {
 184         const struct lib_ring_buffer_config *config = &chanb->config;
 185         struct channel *chan = container_of(chanb, struct channel, backend);
 186         void *priv = chanb->priv;
 187         size_t subbuf_header_size;
 188         u64 tsc;
 189         int ret;
 190
 191         /* Test for cpu hotplug */
 192         if (buf->backend.allocated)
 193                 return 0;
 194
 195         /*
 196          * Paranoia: per cpu dynamic allocation is not officially documented as
 197          * zeroing the memory, so let's do it here too, just in case.
 198          */
 199         memset(buf, 0, sizeof(*buf));
 200
 201         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend, cpu);
 202         if (ret)
 203                 return ret;
 204
 205         buf->commit_hot =
 206                 kzalloc_node(ALIGN(sizeof(*buf->commit_hot)
 207                                    * chan->backend.num_subbuf,
 208                                    1 << INTERNODE_CACHE_SHIFT),
 209                         GFP_KERNEL, cpu_to_node(max(cpu, 0)));
 210         if (!buf->commit_hot) {
 211                 ret = -ENOMEM;
 212                 goto free_chanbuf;
 213         }
 214
 215         buf->commit_cold =
 216                 kzalloc_node(ALIGN(sizeof(*buf->commit_cold)
 217                                    * chan->backend.num_subbuf,
 218                                    1 << INTERNODE_CACHE_SHIFT),
 219                         GFP_KERNEL, cpu_to_node(max(cpu, 0)));
 220         if (!buf->commit_cold) {
 221                 ret = -ENOMEM;
 222                 goto free_commit;
 223         }
 224
 225         init_waitqueue_head(&buf->read_wait);
 226         init_waitqueue_head(&buf->write_wait);
 227         raw_spin_lock_init(&buf->raw_tick_nohz_spinlock);
 228
 229         /*
 230          * Write the subbuffer header for first subbuffer so we know the total
 231          * duration of data gathering.
 232          */
 233         subbuf_header_size = config->cb.subbuffer_header_size();
 234         v_set(config, &buf->offset, subbuf_header_size);
 235         subbuffer_id_clear_noref(config, &buf->backend.buf_wsb[0].id);
 236         tsc = config->cb.ring_buffer_clock_read(buf->backend.chan);
 237         config->cb.buffer_begin(buf, tsc, 0);
 238         v_add(config, subbuf_header_size, &buf->commit_hot[0].cc);
 239
 240         if (config->cb.buffer_create) {
 241                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name);
 242                 if (ret)
 243                         goto free_init;
 244         }
 245
 246         /*
 247          * Ensure the buffer is ready before setting it to allocated and setting
 248          * the cpumask.
 249          * Used for cpu hotplug vs cpumask iteration.
 250          */
 251         smp_wmb();
 252         buf->backend.allocated = 1;
 253
 254         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 255                 CHAN_WARN_ON(chan, cpumask_test_cpu(cpu,
 256                              chan->backend.cpumask));
 257                 cpumask_set_cpu(cpu, chan->backend.cpumask);
 258         }
 259
 260         return 0;
 261
 262         /* Error handling */
 263 free_init:
 264         kfree(buf->commit_cold);
 265 free_commit:
 266         kfree(buf->commit_hot);
 267 free_chanbuf:
 268         lib_ring_buffer_backend_free(&buf->backend);
 269         return ret;
 270 }
 271
 272 static void switch_buffer_timer(unsigned long data)
 273 {
 274         struct lib_ring_buffer *buf = (struct lib_ring_buffer *)data;
 275         struct channel *chan = buf->backend.chan;
 276         const struct lib_ring_buffer_config *config = &chan->backend.config;
 277
 278         /*
 279          * Only flush buffers periodically if readers are active.
 280          */
 281         if (atomic_long_read(&buf->active_readers))
 282                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 283
 284         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 285                 lttng_mod_timer_pinned(&buf->switch_timer,
 286                                  jiffies + chan->switch_timer_interval);
 287         else
 288                 mod_timer(&buf->switch_timer,
 289                           jiffies + chan->switch_timer_interval);
 290 }
 291
 292 /*
 293  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 294  */
 295 static void lib_ring_buffer_start_switch_timer(struct lib_ring_buffer *buf)
 296 {
 297         struct channel *chan = buf->backend.chan;
 298         const struct lib_ring_buffer_config *config = &chan->backend.config;
 299
 300         if (!chan->switch_timer_interval || buf->switch_timer_enabled)
 301                 return;
 302
 303         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 304                 lttng_init_timer_pinned(&buf->switch_timer);
 305         else
 306                 init_timer(&buf->switch_timer);
 307
 308         buf->switch_timer.function = switch_buffer_timer;
 309         buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
 310         buf->switch_timer.data = (unsigned long)buf;
 311         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 312                 add_timer_on(&buf->switch_timer, buf->backend.cpu);
 313         else
 314                 add_timer(&buf->switch_timer);
 315         buf->switch_timer_enabled = 1;
 316 }
 317
 318 /*
 319  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 320  */
 321 static void lib_ring_buffer_stop_switch_timer(struct lib_ring_buffer *buf)
 322 {
 323         struct channel *chan = buf->backend.chan;
 324
 325         if (!chan->switch_timer_interval || !buf->switch_timer_enabled)
 326                 return;
 327
 328         del_timer_sync(&buf->switch_timer);
 329         buf->switch_timer_enabled = 0;
 330 }
 331
 332 /*
 333  * Polling timer to check the channels for data.
 334  */
 335 static void read_buffer_timer(unsigned long data)
 336 {
 337         struct lib_ring_buffer *buf = (struct lib_ring_buffer *)data;
 338         struct channel *chan = buf->backend.chan;
 339         const struct lib_ring_buffer_config *config = &chan->backend.config;
 340
 341         CHAN_WARN_ON(chan, !buf->backend.allocated);
 342
 343         if (atomic_long_read(&buf->active_readers)
 344             && lib_ring_buffer_poll_deliver(config, buf, chan)) {
 345                 wake_up_interruptible(&buf->read_wait);
 346                 wake_up_interruptible(&chan->read_wait);
 347         }
 348
 349         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 350                 lttng_mod_timer_pinned(&buf->read_timer,
 351                                  jiffies + chan->read_timer_interval);
 352         else
 353                 mod_timer(&buf->read_timer,
 354                           jiffies + chan->read_timer_interval);
 355 }
 356
 357 /*
 358  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 359  */
 360 static void lib_ring_buffer_start_read_timer(struct lib_ring_buffer *buf)
 361 {
 362         struct channel *chan = buf->backend.chan;
 363         const struct lib_ring_buffer_config *config = &chan->backend.config;
 364
 365         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 366             || !chan->read_timer_interval
 367             || buf->read_timer_enabled)
 368                 return;
 369
 370         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 371                 lttng_init_timer_pinned(&buf->read_timer);
 372         else
 373                 init_timer(&buf->read_timer);
 374
 375         buf->read_timer.function = read_buffer_timer;
 376         buf->read_timer.expires = jiffies + chan->read_timer_interval;
 377         buf->read_timer.data = (unsigned long)buf;
 378
 379         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 380                 add_timer_on(&buf->read_timer, buf->backend.cpu);
 381         else
 382                 add_timer(&buf->read_timer);
 383         buf->read_timer_enabled = 1;
 384 }
 385
 386 /*
 387  * Called with ring_buffer_nohz_lock held for per-cpu buffers.
 388  */
 389 static void lib_ring_buffer_stop_read_timer(struct lib_ring_buffer *buf)
 390 {
 391         struct channel *chan = buf->backend.chan;
 392         const struct lib_ring_buffer_config *config = &chan->backend.config;
 393
 394         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 395             || !chan->read_timer_interval
 396             || !buf->read_timer_enabled)
 397                 return;
 398
 399         del_timer_sync(&buf->read_timer);
 400         /*
 401          * do one more check to catch data that has been written in the last
 402          * timer period.
 403          */
 404         if (lib_ring_buffer_poll_deliver(config, buf, chan)) {
 405                 wake_up_interruptible(&buf->read_wait);
 406                 wake_up_interruptible(&chan->read_wait);
 407         }
 408         buf->read_timer_enabled = 0;
 409 }
 410
 411 #ifdef CONFIG_HOTPLUG_CPU
 412 /**
 413  *      lib_ring_buffer_cpu_hp_callback - CPU hotplug callback
 414  *      @nb: notifier block
 415  *      @action: hotplug action to take
 416  *      @hcpu: CPU number
 417  *
 418  *      Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
 419  */
 420 static
 421 int lib_ring_buffer_cpu_hp_callback(struct notifier_block *nb,
 422                                               unsigned long action,
 423                                               void *hcpu)
 424 {
 425         unsigned int cpu = (unsigned long)hcpu;
 426         struct channel *chan = container_of(nb, struct channel,
 427                                             cpu_hp_notifier);
 428         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
 429         const struct lib_ring_buffer_config *config = &chan->backend.config;
 430
 431         if (!chan->cpu_hp_enable)
 432                 return NOTIFY_DONE;
 433
 434         CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);
 435
 436         switch (action) {
 437         case CPU_DOWN_FAILED:
 438         case CPU_DOWN_FAILED_FROZEN:
 439         case CPU_ONLINE:
 440         case CPU_ONLINE_FROZEN:
 441                 wake_up_interruptible(&chan->hp_wait);
 442                 lib_ring_buffer_start_switch_timer(buf);
 443                 lib_ring_buffer_start_read_timer(buf);
 444                 return NOTIFY_OK;
 445
 446         case CPU_DOWN_PREPARE:
 447         case CPU_DOWN_PREPARE_FROZEN:
 448                 lib_ring_buffer_stop_switch_timer(buf);
 449                 lib_ring_buffer_stop_read_timer(buf);
 450                 return NOTIFY_OK;
 451
 452         case CPU_DEAD:
 453         case CPU_DEAD_FROZEN:
 454                 /*
 455                  * Performing a buffer switch on a remote CPU. Performed by
 456                  * the CPU responsible for doing the hotunplug after the target
 457                  * CPU stopped running completely. Ensures that all data
 458                  * from that remote CPU is flushed.
 459                  */
 460                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 461                 return NOTIFY_OK;
 462
 463         default:
 464                 return NOTIFY_DONE;
 465         }
 466 }
 467 #endif
 468
 469 #if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER)
 470 /*
 471  * For per-cpu buffers, call the reader wakeups before switching the buffer, so
 472  * that wake-up-tracing generated events are flushed before going idle (in
 473  * tick_nohz). We test if the spinlock is locked to deal with the race where
 474  * readers try to sample the ring buffer before we perform the switch. We let
 475  * the readers retry in that case. If there is data in the buffer, the wake up
 476  * is going to forbid the CPU running the reader thread from going idle.
 477  */
 478 static int notrace ring_buffer_tick_nohz_callback(struct notifier_block *nb,
 479                                                   unsigned long val,
 480                                                   void *data)
 481 {
 482         struct channel *chan = container_of(nb, struct channel,
 483                                             tick_nohz_notifier);
 484         const struct lib_ring_buffer_config *config = &chan->backend.config;
 485         struct lib_ring_buffer *buf;
 486         int cpu = smp_processor_id();
 487
 488         if (config->alloc != RING_BUFFER_ALLOC_PER_CPU) {
 489                 /*
 490                  * We don't support keeping the system idle with global buffers
 491                  * and streaming active. In order to do so, we would need to
 492                  * sample a non-nohz-cpumask racelessly with the nohz updates
 493                  * without adding synchronization overhead to nohz. Leave this
 494                  * use-case out for now.
 495                  */
 496                 return 0;
 497         }
 498
 499         buf = channel_get_ring_buffer(config, chan, cpu);
 500         switch (val) {
 501         case TICK_NOHZ_FLUSH:
 502                 raw_spin_lock(&buf->raw_tick_nohz_spinlock);
 503                 if (config->wakeup == RING_BUFFER_WAKEUP_BY_TIMER
 504                     && chan->read_timer_interval
 505                     && atomic_long_read(&buf->active_readers)
 506                     && (lib_ring_buffer_poll_deliver(config, buf, chan)
 507                         || lib_ring_buffer_pending_data(config, buf, chan))) {
 508                         wake_up_interruptible(&buf->read_wait);
 509                         wake_up_interruptible(&chan->read_wait);
 510                 }
 511                 if (chan->switch_timer_interval)
 512                         lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
 513                 raw_spin_unlock(&buf->raw_tick_nohz_spinlock);
 514                 break;
 515         case TICK_NOHZ_STOP:
 516                 spin_lock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
 517                 lib_ring_buffer_stop_switch_timer(buf);
 518                 lib_ring_buffer_stop_read_timer(buf);
 519                 spin_unlock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
 520                 break;
 521         case TICK_NOHZ_RESTART:
 522                 spin_lock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
 523                 lib_ring_buffer_start_read_timer(buf);
 524                 lib_ring_buffer_start_switch_timer(buf);
 525                 spin_unlock(lttng_this_cpu_ptr(&ring_buffer_nohz_lock));
 526                 break;
 527         }
 528
 529         return 0;
 530 }
 531
 532 void notrace lib_ring_buffer_tick_nohz_flush(void)
 533 {
 534         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_FLUSH,
 535                                    NULL);
 536 }
 537
 538 void notrace lib_ring_buffer_tick_nohz_stop(void)
 539 {
 540         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_STOP,
 541                                    NULL);
 542 }
 543
 544 void notrace lib_ring_buffer_tick_nohz_restart(void)
 545 {
 546         atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_RESTART,
 547                                    NULL);
 548 }
 549 #endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */
 550
 551 /*
 552  * Holds CPU hotplug.
 553  */
 554 static void channel_unregister_notifiers(struct channel *chan)
 555 {
 556         const struct lib_ring_buffer_config *config = &chan->backend.config;
 557         int cpu;
 558
 559         channel_iterator_unregister_notifiers(chan);
 560         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 561 #ifdef CONFIG_NO_HZ
 562                 /*
 563                  * Remove the nohz notifier first, so we are certain we stop
 564                  * the timers.
 565                  */
 566                 atomic_notifier_chain_unregister(&tick_nohz_notifier,
 567                                                  &chan->tick_nohz_notifier);
 568                 /*
 569                  * ring_buffer_nohz_lock will not be needed below, because
 570                  * we just removed the notifiers, which were the only source of
 571                  * concurrency.
 572                  */
 573 #endif /* CONFIG_NO_HZ */
 574 #ifdef CONFIG_HOTPLUG_CPU
 575                 get_online_cpus();
 576                 chan->cpu_hp_enable = 0;
 577                 for_each_online_cpu(cpu) {
 578                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 579                                                               cpu);
 580                         lib_ring_buffer_stop_switch_timer(buf);
 581                         lib_ring_buffer_stop_read_timer(buf);
 582                 }
 583                 put_online_cpus();
 584                 unregister_cpu_notifier(&chan->cpu_hp_notifier);
 585 #else
 586                 for_each_possible_cpu(cpu) {
 587                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 588                                                               cpu);
 589                         lib_ring_buffer_stop_switch_timer(buf);
 590                         lib_ring_buffer_stop_read_timer(buf);
 591                 }
 592 #endif
 593         } else {
 594                 struct lib_ring_buffer *buf = chan->backend.buf;
 595
 596                 lib_ring_buffer_stop_switch_timer(buf);
 597                 lib_ring_buffer_stop_read_timer(buf);
 598         }
 599         channel_backend_unregister_notifiers(&chan->backend);
 600 }
 601
 602 static void lib_ring_buffer_set_quiescent(struct lib_ring_buffer *buf)
 603 {
 604         if (!buf->quiescent) {
 605                 buf->quiescent = true;
 606                 _lib_ring_buffer_switch_remote(buf, SWITCH_FLUSH);
 607         }
 608 }
 609
 610 static void lib_ring_buffer_clear_quiescent(struct lib_ring_buffer *buf)
 611 {
 612         buf->quiescent = false;
 613 }
 614
 615 void lib_ring_buffer_set_quiescent_channel(struct channel *chan)
 616 {
 617         int cpu;
 618         const struct lib_ring_buffer_config *config = &chan->backend.config;
 619
 620         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 621                 get_online_cpus();
 622                 for_each_channel_cpu(cpu, chan) {
 623                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 624                                                               cpu);
 625
 626                         lib_ring_buffer_set_quiescent(buf);
 627                 }
 628                 put_online_cpus();
 629         } else {
 630                 struct lib_ring_buffer *buf = chan->backend.buf;
 631
 632                 lib_ring_buffer_set_quiescent(buf);
 633         }
 634 }
 635 EXPORT_SYMBOL_GPL(lib_ring_buffer_set_quiescent_channel);
 636
 637 void lib_ring_buffer_clear_quiescent_channel(struct channel *chan)
 638 {
 639         int cpu;
 640         const struct lib_ring_buffer_config *config = &chan->backend.config;
 641
 642         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 643                 get_online_cpus();
 644                 for_each_channel_cpu(cpu, chan) {
 645                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 646                                                               cpu);
 647
 648                         lib_ring_buffer_clear_quiescent(buf);
 649                 }
 650                 put_online_cpus();
 651         } else {
 652                 struct lib_ring_buffer *buf = chan->backend.buf;
 653
 654                 lib_ring_buffer_clear_quiescent(buf);
 655         }
 656 }
 657 EXPORT_SYMBOL_GPL(lib_ring_buffer_clear_quiescent_channel);
 658
 659 static void channel_free(struct channel *chan)
 660 {
 661         if (chan->backend.release_priv_ops) {
 662                 chan->backend.release_priv_ops(chan->backend.priv_ops);
 663         }
 664         channel_iterator_free(chan);
 665         channel_backend_free(&chan->backend);
 666         kfree(chan);
 667 }
 668
 669 /**
 670  * channel_create - Create channel.
 671  * @config: ring buffer instance configuration
 672  * @name: name of the channel
 673  * @priv: ring buffer client private data
 674  * @buf_addr: pointer the the beginning of the preallocated buffer contiguous
 675  *            address mapping. It is used only by RING_BUFFER_STATIC
 676  *            configuration. It can be set to NULL for other backends.
 677  * @subbuf_size: subbuffer size
 678  * @num_subbuf: number of subbuffers
 679  * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 680  *                         padding to let readers get those sub-buffers.
 681  *                         Used for live streaming.
 682  * @read_timer_interval: Time interval (in us) to wake up pending readers.
 683  *
 684  * Holds cpu hotplug.
 685  * Returns NULL on failure.
 686  */
 687 struct channel *channel_create(const struct lib_ring_buffer_config *config,
 688                    const char *name, void *priv, void *buf_addr,
 689                    size_t subbuf_size,
 690                    size_t num_subbuf, unsigned int switch_timer_interval,
 691                    unsigned int read_timer_interval)
 692 {
 693         int ret, cpu;
 694         struct channel *chan;
 695
 696         if (lib_ring_buffer_check_config(config, switch_timer_interval,
 697                                          read_timer_interval))
 698                 return NULL;
 699
 700         chan = kzalloc(sizeof(struct channel), GFP_KERNEL);
 701         if (!chan)
 702                 return NULL;
 703
 704         ret = channel_backend_init(&chan->backend, name, config, priv,
 705                                    subbuf_size, num_subbuf);
 706         if (ret)
 707                 goto error;
 708
 709         ret = channel_iterator_init(chan);
 710         if (ret)
 711                 goto error_free_backend;
 712
 713         chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
 714         chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval);
 715         chan->read_timer_interval = usecs_to_jiffies(read_timer_interval);
 716         kref_init(&chan->ref);
 717         init_waitqueue_head(&chan->read_wait);
 718         init_waitqueue_head(&chan->hp_wait);
 719
 720         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 721 #if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER)
 722                 /* Only benefit from NO_HZ idle with per-cpu buffers for now. */
 723                 chan->tick_nohz_notifier.notifier_call =
 724                         ring_buffer_tick_nohz_callback;
 725                 chan->tick_nohz_notifier.priority = ~0U;
 726                 atomic_notifier_chain_register(&tick_nohz_notifier,
 727                                        &chan->tick_nohz_notifier);
 728 #endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */
 729
 730                 /*
 731                  * In case of non-hotplug cpu, if the ring-buffer is allocated
 732                  * in early initcall, it will not be notified of secondary cpus.
 733                  * In that off case, we need to allocate for all possible cpus.
 734                  */
 735 #ifdef CONFIG_HOTPLUG_CPU
 736                 chan->cpu_hp_notifier.notifier_call =
 737                                 lib_ring_buffer_cpu_hp_callback;
 738                 chan->cpu_hp_notifier.priority = 6;
 739                 register_cpu_notifier(&chan->cpu_hp_notifier);
 740
 741                 get_online_cpus();
 742                 for_each_online_cpu(cpu) {
 743                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 744                                                                cpu);
 745                         spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu));
 746                         lib_ring_buffer_start_switch_timer(buf);
 747                         lib_ring_buffer_start_read_timer(buf);
 748                         spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu));
 749                 }
 750                 chan->cpu_hp_enable = 1;
 751                 put_online_cpus();
 752 #else
 753                 for_each_possible_cpu(cpu) {
 754                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 755                                                               cpu);
 756                         spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu));
 757                         lib_ring_buffer_start_switch_timer(buf);
 758                         lib_ring_buffer_start_read_timer(buf);
 759                         spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu));
 760                 }
 761 #endif
 762         } else {
 763                 struct lib_ring_buffer *buf = chan->backend.buf;
 764
 765                 lib_ring_buffer_start_switch_timer(buf);
 766                 lib_ring_buffer_start_read_timer(buf);
 767         }
 768
 769         return chan;
 770
 771 error_free_backend:
 772         channel_backend_free(&chan->backend);
 773 error:
 774         kfree(chan);
 775         return NULL;
 776 }
 777 EXPORT_SYMBOL_GPL(channel_create);
 778
 779 static
 780 void channel_release(struct kref *kref)
 781 {
 782         struct channel *chan = container_of(kref, struct channel, ref);
 783         channel_free(chan);
 784 }
 785
 786 /**
 787  * channel_destroy - Finalize, wait for q.s. and destroy channel.
 788  * @chan: channel to destroy
 789  *
 790  * Holds cpu hotplug.
 791  * Call "destroy" callback, finalize channels, and then decrement the
 792  * channel reference count.  Note that when readers have completed data
 793  * consumption of finalized channels, get_subbuf() will return -ENODATA.
 794  * They should release their handle at that point.  Returns the private
 795  * data pointer.
 796  */
 797 void *channel_destroy(struct channel *chan)
 798 {
 799         int cpu;
 800         const struct lib_ring_buffer_config *config = &chan->backend.config;
 801         void *priv;
 802
 803         channel_unregister_notifiers(chan);
 804
 805         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 806                 /*
 807                  * No need to hold cpu hotplug, because all notifiers have been
 808                  * unregistered.
 809                  */
 810                 for_each_channel_cpu(cpu, chan) {
 811                         struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf,
 812                                                               cpu);
 813
 814                         if (config->cb.buffer_finalize)
 815                                 config->cb.buffer_finalize(buf,
 816                                                            chan->backend.priv,
 817                                                            cpu);
 818                         if (buf->backend.allocated)
 819                                 lib_ring_buffer_set_quiescent(buf);
 820                         /*
 821                          * Perform flush before writing to finalized.
 822                          */
 823                         smp_wmb();
 824                         ACCESS_ONCE(buf->finalized) = 1;
 825                         wake_up_interruptible(&buf->read_wait);
 826                 }
 827         } else {
 828                 struct lib_ring_buffer *buf = chan->backend.buf;
 829
 830                 if (config->cb.buffer_finalize)
 831                         config->cb.buffer_finalize(buf, chan->backend.priv, -1);
 832                 if (buf->backend.allocated)
 833                         lib_ring_buffer_set_quiescent(buf);
 834                 /*
 835                  * Perform flush before writing to finalized.
 836                  */
 837                 smp_wmb();
 838                 ACCESS_ONCE(buf->finalized) = 1;
 839                 wake_up_interruptible(&buf->read_wait);
 840         }
 841         ACCESS_ONCE(chan->finalized) = 1;
 842         wake_up_interruptible(&chan->hp_wait);
 843         wake_up_interruptible(&chan->read_wait);
 844         priv = chan->backend.priv;
 845         kref_put(&chan->ref, channel_release);
 846         return priv;
 847 }
 848 EXPORT_SYMBOL_GPL(channel_destroy);
 849
 850 struct lib_ring_buffer *channel_get_ring_buffer(
 851                                         const struct lib_ring_buffer_config *config,
 852                                         struct channel *chan, int cpu)
 853 {
 854         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL)
 855                 return chan->backend.buf;
 856         else
 857                 return per_cpu_ptr(chan->backend.buf, cpu);
 858 }
 859 EXPORT_SYMBOL_GPL(channel_get_ring_buffer);
 860
 861 int lib_ring_buffer_open_read(struct lib_ring_buffer *buf)
 862 {
 863         struct channel *chan = buf->backend.chan;
 864
 865         if (!atomic_long_add_unless(&buf->active_readers, 1, 1))
 866                 return -EBUSY;
 867         if (!lttng_kref_get(&chan->ref)) {
 868                 atomic_long_dec(&buf->active_readers);
 869                 return -EOVERFLOW;
 870         }
 871         lttng_smp_mb__after_atomic();
 872         return 0;
 873 }
 874 EXPORT_SYMBOL_GPL(lib_ring_buffer_open_read);
 875
 876 void lib_ring_buffer_release_read(struct lib_ring_buffer *buf)
 877 {
 878         struct channel *chan = buf->backend.chan;
 879
 880         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
 881         lttng_smp_mb__before_atomic();
 882         atomic_long_dec(&buf->active_readers);
 883         kref_put(&chan->ref, channel_release);
 884 }
 885 EXPORT_SYMBOL_GPL(lib_ring_buffer_release_read);
 886
/*
 * remote_mb - IPI handler issuing a full memory barrier.
 * @info: unused
 *
 * Promote compiler barrier to a smp_mb().
 * Run on remote CPUs through smp_call_function_single() /
 * smp_call_function() by lib_ring_buffer_get_subbuf() when the configuration
 * uses RING_BUFFER_IPI_BARRIER.
 * For the specific ring buffer case, this IPI call should be removed if the
 * architecture does not reorder writes.  This should eventually be provided by
 * a separate architecture-specific infrastructure.
 */
static void remote_mb(void *info)
{
        smp_mb();
}
 897
 898 /**
 899  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
 900  * @buf: ring buffer
 901  * @consumed: consumed count indicating the position where to read
 902  * @produced: produced count, indicates position when to stop reading
 903  *
 904  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 905  * data to read at consumed position, or 0 if the get operation succeeds.
 906  * Busy-loop trying to get data if the tick_nohz sequence lock is held.
 907  */
 908
 909 int lib_ring_buffer_snapshot(struct lib_ring_buffer *buf,
 910                              unsigned long *consumed, unsigned long *produced)
 911 {
 912         struct channel *chan = buf->backend.chan;
 913         const struct lib_ring_buffer_config *config = &chan->backend.config;
 914         unsigned long consumed_cur, write_offset;
 915         int finalized;
 916
 917 retry:
 918         finalized = ACCESS_ONCE(buf->finalized);
 919         /*
 920          * Read finalized before counters.
 921          */
 922         smp_rmb();
 923         consumed_cur = atomic_long_read(&buf->consumed);
 924         /*
 925          * No need to issue a memory barrier between consumed count read and
 926          * write offset read, because consumed count can only change
 927          * concurrently in overwrite mode, and we keep a sequence counter
 928          * identifier derived from the write offset to check we are getting
 929          * the same sub-buffer we are expecting (the sub-buffers are atomically
 930          * "tagged" upon writes, tags are checked upon read).
 931          */
 932         write_offset = v_read(config, &buf->offset);
 933
 934         /*
 935          * Check that we are not about to read the same subbuffer in
 936          * which the writer head is.
 937          */
 938         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 939             == 0)
 940                 goto nodata;
 941
 942         *consumed = consumed_cur;
 943         *produced = subbuf_trunc(write_offset, chan);
 944
 945         return 0;
 946
 947 nodata:
 948         /*
 949          * The memory barriers __wait_event()/wake_up_interruptible() take care
 950          * of "raw_spin_is_locked" memory ordering.
 951          */
 952         if (finalized)
 953                 return -ENODATA;
 954         else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock))
 955                 goto retry;
 956         else
 957                 return -EAGAIN;
 958 }
 959 EXPORT_SYMBOL_GPL(lib_ring_buffer_snapshot);
 960
 961 /**
 962  * lib_ring_buffer_put_snapshot - move consumed counter forward
 963  *
 964  * Should only be called from consumer context.
 965  * @buf: ring buffer
 966  * @consumed_new: new consumed count value
 967  */
 968 void lib_ring_buffer_move_consumer(struct lib_ring_buffer *buf,
 969                                    unsigned long consumed_new)
 970 {
 971         struct lib_ring_buffer_backend *bufb = &buf->backend;
 972         struct channel *chan = bufb->chan;
 973         unsigned long consumed;
 974
 975         CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);
 976
 977         /*
 978          * Only push the consumed value forward.
 979          * If the consumed cmpxchg fails, this is because we have been pushed by
 980          * the writer in flight recorder mode.
 981          */
 982         consumed = atomic_long_read(&buf->consumed);
 983         while ((long) consumed - (long) consumed_new < 0)
 984                 consumed = atomic_long_cmpxchg(&buf->consumed, consumed,
 985                                                consumed_new);
 986         /* Wake-up the metadata producer */
 987         wake_up_interruptible(&buf->write_wait);
 988 }
 989 EXPORT_SYMBOL_GPL(lib_ring_buffer_move_consumer);
 990
/**
 * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
 * @buf: ring buffer
 * @consumed: consumed count indicating the position where to read
 *
 * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 * data to read at consumed position, or 0 if the get operation succeeds.
 * Returns -EBUSY (and warns) if the reader already holds a sub-buffer, i.e.
 * a previous successful get was not followed by lib_ring_buffer_put_subbuf().
 * Busy-loop trying to get data if the tick_nohz spinlock is held.
 *
 * On success, the position is recorded in buf->get_subbuf_consumed and
 * buf->get_subbuf is set.
 */
int lib_ring_buffer_get_subbuf(struct lib_ring_buffer *buf,
                               unsigned long consumed)
{
        struct channel *chan = buf->backend.chan;
        const struct lib_ring_buffer_config *config = &chan->backend.config;
        unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
        int ret;
        int finalized;

        if (buf->get_subbuf) {
                /*
                 * Reader is trying to get a subbuffer twice.
                 */
                CHAN_WARN_ON(chan, 1);
                return -EBUSY;
        }
retry:
        finalized = ACCESS_ONCE(buf->finalized);
        /*
         * Read finalized before counters.
         */
        smp_rmb();
        consumed_cur = atomic_long_read(&buf->consumed);
        consumed_idx = subbuf_index(consumed, chan);
        commit_count = v_read(config, &buf->commit_cold[consumed_idx].cc_sb);
        /*
         * Make sure we read the commit count before reading the buffer
         * data and the write offset. Correct consumed offset ordering
         * wrt commit count is ensured by the use of cmpxchg to update
         * the consumed offset.
         * smp_call_function_single can fail if the remote CPU is offline,
         * this is OK because then there is no wmb to execute there.
         * If our thread is executing on the same CPU as the one the buffer
         * belongs to, we don't have to synchronize it at all. If we are
         * migrated, the scheduler will take care of the memory barriers.
         * Normally, smp_call_function_single() should ensure program order when
         * executing the remote function, which implies that it surrounds the
         * function execution with :
         * smp_mb()
         * send IPI
         * csd_lock_wait
         *                recv IPI
         *                smp_mb()
         *                exec. function
         *                smp_mb()
         *                csd unlock
         * smp_mb()
         *
         * However, smp_call_function_single() does not seem to clearly execute
         * such barriers. It depends on spinlock semantic to provide the barrier
         * before executing the IPI and, when busy-looping, csd_lock_wait only
         * executes smp_mb() when it has to wait for the other CPU.
         *
         * I don't trust this code. Therefore, let's add the smp_mb() sequence
         * required ourselves, even if duplicated. It has no performance impact
         * anyway.
         *
         * smp_mb() is needed because smp_rmb() and smp_wmb() only order read vs
         * read and write vs write. They do not ensure core synchronization. We
         * really have to ensure total order between the 3 barriers running on
         * the 2 CPUs.
         */
        if (config->ipi == RING_BUFFER_IPI_BARRIER) {
                if (config->sync == RING_BUFFER_SYNC_PER_CPU
                    && config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
                        /*
                         * Per-cpu buffer with per-cpu synchronization: only
                         * the CPU owning the buffer needs the barrier, and
                         * only when we are not running on it.
                         */
                        if (raw_smp_processor_id() != buf->backend.cpu) {
                                /* Total order with IPI handler smp_mb() */
                                smp_mb();
                                smp_call_function_single(buf->backend.cpu,
                                                         remote_mb, NULL, 1);
                                /* Total order with IPI handler smp_mb() */
                                smp_mb();
                        }
                } else {
                        /*
                         * Otherwise, run the barrier through
                         * smp_call_function() rather than on a single CPU.
                         */
                        /* Total order with IPI handler smp_mb() */
                        smp_mb();
                        smp_call_function(remote_mb, NULL, 1);
                        /* Total order with IPI handler smp_mb() */
                        smp_mb();
                }
        } else {
                /*
                 * Local rmb to match the remote wmb to read the commit count
                 * before the buffer data and the write offset.
                 */
                smp_rmb();
        }

        write_offset = v_read(config, &buf->offset);

        /*
         * Check that the buffer we are getting is after or at consumed_cur
         * position.
         */
        if ((long) subbuf_trunc(consumed, chan)
            - (long) subbuf_trunc(consumed_cur, chan) < 0)
                goto nodata;

        /*
         * Check that the subbuffer we are trying to consume has been
         * already fully committed.
         */
        if (((commit_count - chan->backend.subbuf_size)
             & chan->commit_count_mask)
            - (buf_trunc(consumed, chan)
               >> chan->backend.num_subbuf_order)
            != 0)
                goto nodata;

        /*
         * Check that we are not about to read the same subbuffer in
         * which the writer head is.
         */
        if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed, chan)
            == 0)
                goto nodata;

        /*
         * Failure to get the subbuffer causes a busy-loop retry without going
         * to a wait queue. These are caused by short-lived race windows where
         * the writer is getting access to a subbuffer we were trying to get
         * access to. Also checks that the "consumed" buffer count we are
         * looking for matches the one contained in the subbuffer id.
         */
        ret = update_read_sb_index(config, &buf->backend, &chan->backend,
                                   consumed_idx, buf_trunc_val(consumed, chan));
        if (ret)
                goto retry;
        subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);

        /*
         * Remember which position is held so that
         * lib_ring_buffer_put_subbuf() can release it.
         */
        buf->get_subbuf_consumed = consumed;
        buf->get_subbuf = 1;

        return 0;

nodata:
        /*
         * "finalized" was sampled before the counters at the top of this
         * pass, see the smp_rmb() after retry.
         *
         * The memory barriers __wait_event()/wake_up_interruptible() take care
         * of "raw_spin_is_locked" memory ordering.
         */
        if (finalized)
                return -ENODATA;
        else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock))
                goto retry;
        else
                return -EAGAIN;
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_get_subbuf);
1148
/**
 * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
 * @buf: ring buffer
 *
 * Releases the sub-buffer obtained by a previous successful
 * lib_ring_buffer_get_subbuf(). Warns and returns without doing anything if
 * no sub-buffer is currently held. Also warns if the active reader count of
 * the buffer is not 1.
 */
void lib_ring_buffer_put_subbuf(struct lib_ring_buffer *buf)
{
        struct lib_ring_buffer_backend *bufb = &buf->backend;
        struct channel *chan = bufb->chan;
        const struct lib_ring_buffer_config *config = &chan->backend.config;
        unsigned long read_sb_bindex, consumed_idx, consumed;

        CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1);

        if (!buf->get_subbuf) {
                /*
                 * Reader puts a subbuffer it did not get.
                 */
                CHAN_WARN_ON(chan, 1);
                return;
        }
        /* Position recorded by lib_ring_buffer_get_subbuf(). */
        consumed = buf->get_subbuf_consumed;
        buf->get_subbuf = 0;

        /*
         * Clear the records_unread counter. (overruns counter)
         * Its remaining value is first added to the backend records_read
         * counter.
         * Can still be non-zero if a file reader simply grabbed the data
         * without using iterators.
         * Can be below zero if an iterator is used on a snapshot more than
         * once.
         */
        read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
        v_add(config, v_read(config,
                             &bufb->array[read_sb_bindex]->records_unread),
              &bufb->records_read);
        v_set(config, &bufb->array[read_sb_bindex]->records_unread, 0);
        /*
         * In overwrite mode, the reader sub-buffer id is not expected to
         * carry the noref flag at this point; set it before handing the
         * sub-buffer back.
         */
        CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
                     && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
        subbuffer_id_set_noref(config, &bufb->buf_rsb.id);

        /*
         * Exchange the reader subbuffer with the one we put in its place in the
         * writer subbuffer table. Expect the original consumed count. If
         * update_read_sb_index fails, this is because the writer updated the
         * subbuffer concurrently. We should therefore keep the subbuffer we
         * currently have: it has become invalid to try reading this sub-buffer
         * consumed count value anyway.
         */
        consumed_idx = subbuf_index(consumed, chan);
        update_read_sb_index(config, &buf->backend, &chan->backend,
                             consumed_idx, buf_trunc_val(consumed, chan));
        /*
         * update_read_sb_index return value ignored. Don't exchange sub-buffer
         * if the writer concurrently updated it.
         */
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_put_subbuf);
1205
1206 /*
1207  * cons_offset is an iterator on all subbuffer offsets between the reader
1208  * position and the writer position. (inclusive)
1209  */
1210 static
1211 void lib_ring_buffer_print_subbuffer_errors(struct lib_ring_buffer *buf,
1212                                             struct channel *chan,
1213                                             unsigned long cons_offset,
1214                                             int cpu)
1215 {
1216         const struct lib_ring_buffer_config *config = &chan->backend.config;
1217         unsigned long cons_idx, commit_count, commit_count_sb;
1218
1219         cons_idx = subbuf_index(cons_offset, chan);
1220         commit_count = v_read(config, &buf->commit_hot[cons_idx].cc);
1221         commit_count_sb = v_read(config, &buf->commit_cold[cons_idx].cc_sb);
1222
1223         if (subbuf_offset(commit_count, chan) != 0)
1224                 printk(KERN_WARNING
1225                        "ring buffer %s, cpu %d: "
1226                        "commit count in subbuffer %lu,\n"
1227                        "expecting multiples of %lu bytes\n"
1228                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
1229                        chan->backend.name, cpu, cons_idx,
1230                        chan->backend.subbuf_size,
1231                        commit_count, commit_count_sb);
1232
1233         printk(KERN_DEBUG "ring buffer: %s, cpu %d: %lu bytes committed\n",
1234                chan->backend.name, cpu, commit_count);
1235 }
1236
1237 static
1238 void lib_ring_buffer_print_buffer_errors(struct lib_ring_buffer *buf,
1239                                          struct channel *chan,
1240                                          void *priv, int cpu)
1241 {
1242         const struct lib_ring_buffer_config *config = &chan->backend.config;
1243         unsigned long write_offset, cons_offset;
1244
1245         /*
1246          * No need to order commit_count, write_offset and cons_offset reads
1247          * because we execute at teardown when no more writer nor reader
1248          * references are left.
1249          */
1250         write_offset = v_read(config, &buf->offset);
1251         cons_offset = atomic_long_read(&buf->consumed);
1252         if (write_offset != cons_offset)
1253                 printk(KERN_DEBUG
1254                        "ring buffer %s, cpu %d: "
1255                        "non-consumed data\n"
1256                        "  [ %lu bytes written, %lu bytes read ]\n",
1257                        chan->backend.name, cpu, write_offset, cons_offset);
1258
1259         for (cons_offset = atomic_long_read(&buf->consumed);
1260              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
1261                                   chan)
1262                      - cons_offset) > 0;
1263              cons_offset = subbuf_align(cons_offset, chan))
1264                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
1265                                                        cpu);
1266 }
1267
1268 static
1269 void lib_ring_buffer_print_errors(struct channel *chan,
1270                                   struct lib_ring_buffer *buf, int cpu)
1271 {
1272         const struct lib_ring_buffer_config *config = &chan->backend.config;
1273         void *priv = chan->backend.priv;
1274
1275         if (!strcmp(chan->backend.name, "relay-metadata")) {
1276                 printk(KERN_DEBUG "ring buffer %s: %lu records written, "
1277                         "%lu records overrun\n",
1278                         chan->backend.name,
1279                         v_read(config, &buf->records_count),
1280                         v_read(config, &buf->records_overrun));
1281         } else {
1282                 printk(KERN_DEBUG "ring buffer %s, cpu %d: %lu records written, "
1283                         "%lu records overrun\n",
1284                         chan->backend.name, cpu,
1285                         v_read(config, &buf->records_count),
1286                         v_read(config, &buf->records_overrun));
1287
1288                 if (v_read(config, &buf->records_lost_full)
1289                     || v_read(config, &buf->records_lost_wrap)
1290                     || v_read(config, &buf->records_lost_big))
1291                         printk(KERN_WARNING
1292                                 "ring buffer %s, cpu %d: records were lost. Caused by:\n"
1293                                 "  [ %lu buffer full, %lu nest buffer wrap-around, "
1294                                 "%lu event too big ]\n",
1295                                 chan->backend.name, cpu,
1296                                 v_read(config, &buf->records_lost_full),
1297                                 v_read(config, &buf->records_lost_wrap),
1298                                 v_read(config, &buf->records_lost_big));
1299         }
1300         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu);
1301 }
1302
1303 /*
1304  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1305  *
1306  * Only executed by SWITCH_FLUSH, which can be issued while tracing is active
1307  * or at buffer finalization (destroy).
1308  */
1309 static
1310 void lib_ring_buffer_switch_old_start(struct lib_ring_buffer *buf,
1311                                       struct channel *chan,
1312                                       struct switch_offsets *offsets,
1313                                       u64 tsc)
1314 {
1315         const struct lib_ring_buffer_config *config = &chan->backend.config;
1316         unsigned long oldidx = subbuf_index(offsets->old, chan);
1317         unsigned long commit_count;
1318
1319         config->cb.buffer_begin(buf, tsc, oldidx);
1320
1321         /*
1322          * Order all writes to buffer before the commit count update that will
1323          * determine that the subbuffer is full.
1324          */
1325         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1326                 /*
1327                  * Must write slot data before incrementing commit count.  This
1328                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1329                  * by get_subbuf().
1330                  */
1331                 barrier();
1332         } else
1333                 smp_wmb();
1334         v_add(config, config->cb.subbuffer_header_size(),
1335               &buf->commit_hot[oldidx].cc);
1336         commit_count = v_read(config, &buf->commit_hot[oldidx].cc);
1337         /* Check if the written buffer has to be delivered */
1338         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1339                                       commit_count, oldidx, tsc);
1340         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1341                         offsets->old + config->cb.subbuffer_header_size(),
1342                         commit_count);
1343 }
1344
1345 /*
1346  * lib_ring_buffer_switch_old_end: switch old subbuffer
1347  *
1348  * Note : offset_old should never be 0 here. It is ok, because we never perform
1349  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1350  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1351  * subbuffer.
1352  */
1353 static
1354 void lib_ring_buffer_switch_old_end(struct lib_ring_buffer *buf,
1355                                     struct channel *chan,
1356                                     struct switch_offsets *offsets,
1357                                     u64 tsc)
1358 {
1359         const struct lib_ring_buffer_config *config = &chan->backend.config;
1360         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1361         unsigned long commit_count, padding_size, data_size;
1362
1363         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1364         padding_size = chan->backend.subbuf_size - data_size;
1365         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size);
1366
1367         /*
1368          * Order all writes to buffer before the commit count update that will
1369          * determine that the subbuffer is full.
1370          */
1371         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1372                 /*
1373                  * Must write slot data before incrementing commit count.  This
1374                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1375                  * by get_subbuf().
1376                  */
1377                 barrier();
1378         } else
1379                 smp_wmb();
1380         v_add(config, padding_size, &buf->commit_hot[oldidx].cc);
1381         commit_count = v_read(config, &buf->commit_hot[oldidx].cc);
1382         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1383                                       commit_count, oldidx, tsc);
1384         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1385                         offsets->old + padding_size, commit_count);
1386 }
1387
1388 /*
1389  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1390  *
1391  * This code can be executed unordered : writers may already have written to the
1392  * sub-buffer before this code gets executed, caution.  The commit makes sure
1393  * that this code is executed before the deliver of this sub-buffer.
1394  */
1395 static
1396 void lib_ring_buffer_switch_new_start(struct lib_ring_buffer *buf,
1397                                       struct channel *chan,
1398                                       struct switch_offsets *offsets,
1399                                       u64 tsc)
1400 {
1401         const struct lib_ring_buffer_config *config = &chan->backend.config;
1402         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1403         unsigned long commit_count;
1404
1405         config->cb.buffer_begin(buf, tsc, beginidx);
1406
1407         /*
1408          * Order all writes to buffer before the commit count update that will
1409          * determine that the subbuffer is full.
1410          */
1411         if (config->ipi == RING_BUFFER_IPI_BARRIER) {
1412                 /*
1413                  * Must write slot data before incrementing commit count.  This
1414                  * compiler barrier is upgraded into a smp_mb() by the IPI sent
1415                  * by get_subbuf().
1416                  */
1417                 barrier();
1418         } else
1419                 smp_wmb();
1420         v_add(config, config->cb.subbuffer_header_size(),
1421               &buf->commit_hot[beginidx].cc);
1422         commit_count = v_read(config, &buf->commit_hot[beginidx].cc);
1423         /* Check if the written buffer has to be delivered */
1424         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1425                                       commit_count, beginidx, tsc);
1426         lib_ring_buffer_write_commit_counter(config, buf, chan, beginidx,
1427                         offsets->begin + config->cb.subbuffer_header_size(),
1428                         commit_count);
1429 }
1430
1431 /*
1432  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1433  *
1434  * Calls subbuffer_set_data_size() to set the data size of the current
1435  * sub-buffer. We do not need to perform check_deliver nor commit here,
1436  * since this task will be done by the "commit" of the event for which
1437  * we are currently doing the space reservation.
1438  */
1439 static
1440 void lib_ring_buffer_switch_new_end(struct lib_ring_buffer *buf,
1441                                             struct channel *chan,
1442                                             struct switch_offsets *offsets,
1443                                             u64 tsc)
1444 {
1445         const struct lib_ring_buffer_config *config = &chan->backend.config;
1446         unsigned long endidx, data_size;
1447
1448         endidx = subbuf_index(offsets->end - 1, chan);
1449         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1450         subbuffer_set_data_size(config, &buf->backend, endidx, data_size);
1451 }
1452
1453 /*
1454  * Returns :
1455  * 0 if ok
1456  * !0 if execution must be aborted.
1457  */
1458 static
1459 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1460                                     struct lib_ring_buffer *buf,
1461                                     struct channel *chan,
1462                                     struct switch_offsets *offsets,
1463                                     u64 *tsc)
1464 {
1465         const struct lib_ring_buffer_config *config = &chan->backend.config;
1466         unsigned long off, reserve_commit_diff;
1467
1468         offsets->begin = v_read(config, &buf->offset);
1469         offsets->old = offsets->begin;
1470         offsets->switch_old_start = 0;
1471         off = subbuf_offset(offsets->begin, chan);
1472
1473         *tsc = config->cb.ring_buffer_clock_read(chan);
1474
1475         /*
1476          * Ensure we flush the header of an empty subbuffer when doing the
1477          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1478          * total data gathering duration even if there were no records saved
1479          * after the last buffer switch.
1480          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1481          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1482          * subbuffer header as appropriate.
1483          * The next record that reserves space will be responsible for
1484          * populating the following subbuffer header. We choose not to populate
1485          * the next subbuffer header here because we want to be able to use
1486          * SWITCH_ACTIVE for periodical buffer flush and CPU tick_nohz stop
1487          * buffer flush, which must guarantee that all the buffer content
1488          * (records and header timestamps) are visible to the reader. This is
1489          * required for quiescence guarantees for the fusion merge.
1490          */
1491         if (mode != SWITCH_FLUSH && !off)
1492                 return -1;      /* we do not have to switch : buffer is empty */
1493
1494         if (unlikely(off == 0)) {
1495                 unsigned long sb_index, commit_count;
1496
1497                 /*
1498                  * We are performing a SWITCH_FLUSH. There may be concurrent
1499                  * writes into the buffer if e.g. invoked while performing a
1500                  * snapshot on an active trace.
1501                  *
1502                  * If the client does not save any header information (sub-buffer
1503                  * header size == 0), don't switch empty subbuffer on finalize,
1504                  * because it is invalid to deliver a completely empty
1505                  * subbuffer.
1506                  */
1507                 if (!config->cb.subbuffer_header_size())
1508                         return -1;
1509
1510                 /* Test new buffer integrity */
1511                 sb_index = subbuf_index(offsets->begin, chan);
1512                 commit_count = v_read(config,
1513                                 &buf->commit_cold[sb_index].cc_sb);
1514                 reserve_commit_diff =
1515                   (buf_trunc(offsets->begin, chan)
1516                    >> chan->backend.num_subbuf_order)
1517                   - (commit_count & chan->commit_count_mask);
1518                 if (likely(reserve_commit_diff == 0)) {
1519                         /* Next subbuffer not being written to. */
1520                         if (unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1521                                 subbuf_trunc(offsets->begin, chan)
1522                                  - subbuf_trunc((unsigned long)
1523                                      atomic_long_read(&buf->consumed), chan)
1524                                 >= chan->backend.buf_size)) {
1525                                 /*
1526                                  * We do not overwrite non consumed buffers
1527                                  * and we are full : don't switch.
1528                                  */
1529                                 return -1;
1530                         } else {
1531                                 /*
1532                                  * Next subbuffer not being written to, and we
1533                                  * are either in overwrite mode or the buffer is
1534                                  * not full. It's safe to write in this new
1535                                  * subbuffer.
1536                                  */
1537                         }
1538                 } else {
1539                         /*
1540                          * Next subbuffer reserve offset does not match the
1541                          * commit offset. Don't perform switch in
1542                          * producer-consumer and overwrite mode.  Caused by
1543                          * either a writer OOPS or too many nested writes over a
1544                          * reserve/commit pair.
1545                          */
1546                         return -1;
1547                 }
1548
1549                 /*
1550                  * Need to write the subbuffer start header on finalize.
1551                  */
1552                 offsets->switch_old_start = 1;
1553         }
1554         offsets->begin = subbuf_align(offsets->begin, chan);
1555         /* Note: old points to the next subbuf at offset 0 */
1556         offsets->end = offsets->begin;
1557         return 0;
1558 }
1559
1560 /*
1561  * Force a sub-buffer switch. This operation is completely reentrant : can be
1562  * called while tracing is active with absolutely no lock held.
1563  *
1564  * Note, however, that as a v_cmpxchg is used for some atomic
1565  * operations, this function must be called from the CPU which owns the buffer
1566  * for a ACTIVE flush.
1567  */
1568 void lib_ring_buffer_switch_slow(struct lib_ring_buffer *buf, enum switch_mode mode)
1569 {
1570         struct channel *chan = buf->backend.chan;
1571         const struct lib_ring_buffer_config *config = &chan->backend.config;
1572         struct switch_offsets offsets;
1573         unsigned long oldidx;
1574         u64 tsc;
1575
1576         offsets.size = 0;
1577
1578         /*
1579          * Perform retryable operations.
1580          */
1581         do {
1582                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
1583                                                     &tsc))
1584                         return; /* Switch not needed */
1585         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
1586                  != offsets.old);
1587
1588         /*
1589          * Atomically update last_tsc. This update races against concurrent
1590          * atomic updates, but the race will always cause supplementary full TSC
1591          * records, never the opposite (missing a full TSC record when it would
1592          * be needed).
1593          */
1594         save_last_tsc(config, buf, tsc);
1595
1596         /*
1597          * Push the reader if necessary
1598          */
1599         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
1600
1601         oldidx = subbuf_index(offsets.old, chan);
1602         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx);
1603
1604         /*
1605          * May need to populate header start on SWITCH_FLUSH.
1606          */
1607         if (offsets.switch_old_start) {
1608                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc);
1609                 offsets.old += config->cb.subbuffer_header_size();
1610         }
1611
1612         /*
1613          * Switch old subbuffer.
1614          */
1615         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc);
1616 }
1617 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_slow);
1618
1619 struct switch_param {
1620         struct lib_ring_buffer *buf;
1621         enum switch_mode mode;
1622 };
1623
1624 static void remote_switch(void *info)
1625 {
1626         struct switch_param *param = info;
1627         struct lib_ring_buffer *buf = param->buf;
1628
1629         lib_ring_buffer_switch_slow(buf, param->mode);
1630 }
1631
1632 static void _lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf,
1633                 enum switch_mode mode)
1634 {
1635         struct channel *chan = buf->backend.chan;
1636         const struct lib_ring_buffer_config *config = &chan->backend.config;
1637         int ret;
1638         struct switch_param param;
1639
1640         /*
1641          * With global synchronization we don't need to use the IPI scheme.
1642          */
1643         if (config->sync == RING_BUFFER_SYNC_GLOBAL) {
1644                 lib_ring_buffer_switch_slow(buf, mode);
1645                 return;
1646         }
1647
1648         /*
1649          * Taking lock on CPU hotplug to ensure two things: first, that the
1650          * target cpu is not taken concurrently offline while we are within
1651          * smp_call_function_single() (I don't trust that get_cpu() on the
1652          * _local_ CPU actually inhibit CPU hotplug for the _remote_ CPU (to be
1653          * confirmed)). Secondly, if it happens that the CPU is not online, our
1654          * own call to lib_ring_buffer_switch_slow() needs to be protected from
1655          * CPU hotplug handlers, which can also perform a remote subbuffer
1656          * switch.
1657          */
1658         get_online_cpus();
1659         param.buf = buf;
1660         param.mode = mode;
1661         ret = smp_call_function_single(buf->backend.cpu,
1662                                  remote_switch, &param, 1);
1663         if (ret) {
1664                 /* Remote CPU is offline, do it ourself. */
1665                 lib_ring_buffer_switch_slow(buf, mode);
1666         }
1667         put_online_cpus();
1668 }
1669
1670 void lib_ring_buffer_switch_remote(struct lib_ring_buffer *buf)
1671 {
1672         _lib_ring_buffer_switch_remote(buf, SWITCH_ACTIVE);
1673 }
1674 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_remote);
1675
1676 /* Switch sub-buffer even if current sub-buffer is empty. */
1677 void lib_ring_buffer_switch_remote_empty(struct lib_ring_buffer *buf)
1678 {
1679         _lib_ring_buffer_switch_remote(buf, SWITCH_FLUSH);
1680 }
1681 EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_remote_empty);
1682
1683 /*
1684  * Returns :
1685  * 0 if ok
1686  * -ENOSPC if event size is too large for packet.
1687  * -ENOBUFS if there is currently not enough space in buffer for the event.
1688  * -EIO if data cannot be written into the buffer for any other reason.
1689  */
1690 static
1691 int lib_ring_buffer_try_reserve_slow(struct lib_ring_buffer *buf,
1692                                      struct channel *chan,
1693                                      struct switch_offsets *offsets,
1694                                      struct lib_ring_buffer_ctx *ctx)
1695 {
1696         const struct lib_ring_buffer_config *config = &chan->backend.config;
1697         unsigned long reserve_commit_diff, offset_cmp;
1698
1699 retry:
1700         offsets->begin = offset_cmp = v_read(config, &buf->offset);
1701         offsets->old = offsets->begin;
1702         offsets->switch_new_start = 0;
1703         offsets->switch_new_end = 0;
1704         offsets->switch_old_end = 0;
1705         offsets->pre_header_padding = 0;
1706
1707         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
1708         if ((int64_t) ctx->tsc == -EIO)
1709                 return -EIO;
1710
1711         if (last_tsc_overflow(config, buf, ctx->tsc))
1712                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
1713
1714         if (unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
1715                 offsets->switch_new_start = 1;          /* For offsets->begin */
1716         } else {
1717                 offsets->size = config->cb.record_header_size(config, chan,
1718                                                 offsets->begin,
1719                                                 &offsets->pre_header_padding,
1720                                                 ctx);
1721                 offsets->size +=
1722                         lib_ring_buffer_align(offsets->begin + offsets->size,
1723                                               ctx->largest_align)
1724                         + ctx->data_size;
1725                 if (unlikely(subbuf_offset(offsets->begin, chan) +
1726                              offsets->size > chan->backend.subbuf_size)) {
1727                         offsets->switch_old_end = 1;    /* For offsets->old */
1728                         offsets->switch_new_start = 1;  /* For offsets->begin */
1729                 }
1730         }
1731         if (unlikely(offsets->switch_new_start)) {
1732                 unsigned long sb_index, commit_count;
1733
1734                 /*
1735                  * We are typically not filling the previous buffer completely.
1736                  */
1737                 if (likely(offsets->switch_old_end))
1738                         offsets->begin = subbuf_align(offsets->begin, chan);
1739                 offsets->begin = offsets->begin
1740                                  + config->cb.subbuffer_header_size();
1741                 /* Test new buffer integrity */
1742                 sb_index = subbuf_index(offsets->begin, chan);
1743                 /*
1744                  * Read buf->offset before buf->commit_cold[sb_index].cc_sb.
1745                  * lib_ring_buffer_check_deliver() has the matching
1746                  * memory barriers required around commit_cold cc_sb
1747                  * updates to ensure reserve and commit counter updates
1748                  * are not seen reordered when updated by another CPU.
1749                  */
1750                 smp_rmb();
1751                 commit_count = v_read(config,
1752                                 &buf->commit_cold[sb_index].cc_sb);
1753                 /* Read buf->commit_cold[sb_index].cc_sb before buf->offset. */
1754                 smp_rmb();
1755                 if (unlikely(offset_cmp != v_read(config, &buf->offset))) {
1756                         /*
1757                          * The reserve counter have been concurrently updated
1758                          * while we read the commit counter. This means the
1759                          * commit counter we read might not match buf->offset
1760                          * due to concurrent update. We therefore need to retry.
1761                          */
1762                         goto retry;
1763                 }
1764                 reserve_commit_diff =
1765                   (buf_trunc(offsets->begin, chan)
1766                    >> chan->backend.num_subbuf_order)
1767                   - (commit_count & chan->commit_count_mask);
1768                 if (likely(reserve_commit_diff == 0)) {
1769                         /* Next subbuffer not being written to. */
1770                         if (unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1771                                 subbuf_trunc(offsets->begin, chan)
1772                                  - subbuf_trunc((unsigned long)
1773                                      atomic_long_read(&buf->consumed), chan)
1774                                 >= chan->backend.buf_size)) {
1775                                 /*
1776                                  * We do not overwrite non consumed buffers
1777                                  * and we are full : record is lost.
1778                                  */
1779                                 v_inc(config, &buf->records_lost_full);
1780                                 return -ENOBUFS;
1781                         } else {
1782                                 /*
1783                                  * Next subbuffer not being written to, and we
1784                                  * are either in overwrite mode or the buffer is
1785                                  * not full. It's safe to write in this new
1786                                  * subbuffer.
1787                                  */
1788                         }
1789                 } else {
1790                         /*
1791                          * Next subbuffer reserve offset does not match the
1792                          * commit offset, and this did not involve update to the
1793                          * reserve counter. Drop record in producer-consumer and
1794                          * overwrite mode.  Caused by either a writer OOPS or
1795                          * too many nested writes over a reserve/commit pair.
1796                          */
1797                         v_inc(config, &buf->records_lost_wrap);
1798                         return -EIO;
1799                 }
1800                 offsets->size =
1801                         config->cb.record_header_size(config, chan,
1802                                                 offsets->begin,
1803                                                 &offsets->pre_header_padding,
1804                                                 ctx);
1805                 offsets->size +=
1806                         lib_ring_buffer_align(offsets->begin + offsets->size,
1807                                               ctx->largest_align)
1808                         + ctx->data_size;
1809                 if (unlikely(subbuf_offset(offsets->begin, chan)
1810                              + offsets->size > chan->backend.subbuf_size)) {
1811                         /*
1812                          * Record too big for subbuffers, report error, don't
1813                          * complete the sub-buffer switch.
1814                          */
1815                         v_inc(config, &buf->records_lost_big);
1816                         return -ENOSPC;
1817                 } else {
1818                         /*
1819                          * We just made a successful buffer switch and the
1820                          * record fits in the new subbuffer. Let's write.
1821                          */
1822                 }
1823         } else {
1824                 /*
1825                  * Record fits in the current buffer and we are not on a switch
1826                  * boundary. It's safe to write.
1827                  */
1828         }
1829         offsets->end = offsets->begin + offsets->size;
1830
1831         if (unlikely(subbuf_offset(offsets->end, chan) == 0)) {
1832                 /*
1833                  * The offset_end will fall at the very beginning of the next
1834                  * subbuffer.
1835                  */
1836                 offsets->switch_new_end = 1;    /* For offsets->begin */
1837         }
1838         return 0;
1839 }
1840
1841 static struct lib_ring_buffer *get_current_buf(struct channel *chan, int cpu)
1842 {
1843         const struct lib_ring_buffer_config *config = &chan->backend.config;
1844
1845         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
1846                 return per_cpu_ptr(chan->backend.buf, cpu);
1847         else
1848                 return chan->backend.buf;
1849 }
1850
1851 void lib_ring_buffer_lost_event_too_big(struct channel *chan)
1852 {
1853         const struct lib_ring_buffer_config *config = &chan->backend.config;
1854         struct lib_ring_buffer *buf = get_current_buf(chan, smp_processor_id());
1855
1856         v_inc(config, &buf->records_lost_big);
1857 }
1858 EXPORT_SYMBOL_GPL(lib_ring_buffer_lost_event_too_big);
1859
1860 /**
1861  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
1862  * @ctx: ring buffer context.
1863  *
1864  * Return : -NOBUFS if not enough space, -ENOSPC if event size too large,
1865  * -EIO for other errors, else returns 0.
1866  * It will take care of sub-buffer switching.
1867  */
1868 int lib_ring_buffer_reserve_slow(struct lib_ring_buffer_ctx *ctx)
1869 {
1870         struct channel *chan = ctx->chan;
1871         const struct lib_ring_buffer_config *config = &chan->backend.config;
1872         struct lib_ring_buffer *buf;
1873         struct switch_offsets offsets;
1874         int ret;
1875
1876         ctx->buf = buf = get_current_buf(chan, ctx->cpu);
1877         offsets.size = 0;
1878
1879         do {
1880                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
1881                                                        ctx);
1882                 if (unlikely(ret))
1883                         return ret;
1884         } while (unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
1885                                     offsets.end)
1886                           != offsets.old));
1887
1888         /*
1889          * Atomically update last_tsc. This update races against concurrent
1890          * atomic updates, but the race will always cause supplementary full TSC
1891          * records, never the opposite (missing a full TSC record when it would
1892          * be needed).
1893          */
1894         save_last_tsc(config, buf, ctx->tsc);
1895
1896         /*
1897          * Push the reader if necessary
1898          */
1899         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
1900
1901         /*
1902          * Clear noref flag for this subbuffer.
1903          */
1904         lib_ring_buffer_clear_noref(config, &buf->backend,
1905                                     subbuf_index(offsets.end - 1, chan));
1906
1907         /*
1908          * Switch old subbuffer if needed.
1909          */
1910         if (unlikely(offsets.switch_old_end)) {
1911                 lib_ring_buffer_clear_noref(config, &buf->backend,
1912                                             subbuf_index(offsets.old - 1, chan));
1913                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc);
1914         }
1915
1916         /*
1917          * Populate new subbuffer.
1918          */
1919         if (unlikely(offsets.switch_new_start))
1920                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc);
1921
1922         if (unlikely(offsets.switch_new_end))
1923                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc);
1924
1925         ctx->slot_size = offsets.size;
1926         ctx->pre_offset = offsets.begin;
1927         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
1928         return 0;
1929 }
1930 EXPORT_SYMBOL_GPL(lib_ring_buffer_reserve_slow);
1931
1932 int __init init_lib_ring_buffer_frontend(void)
1933 {
1934         int cpu;
1935
1936         for_each_possible_cpu(cpu)
1937                 spin_lock_init(&per_cpu(ring_buffer_nohz_lock, cpu));
1938         return 0;
1939 }
1940
1941 module_init(init_lib_ring_buffer_frontend);
1942
1943 void __exit exit_lib_ring_buffer_frontend(void)
1944 {
1945 }
1946
1947 module_exit(exit_lib_ring_buffer_frontend);