/*
 * ring_buffer_iterator.c
 *
 * (C) Copyright 2010 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * Ring buffer and channel iterators. Get each event of a channel in order.
 * Uses a prio heap for per-cpu buffers, giving an O(log(NR_CPUS))
 * algorithmic complexity for the "get next event" operation.
 *
 * Author:
 *	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * Dual LGPL v2.1/GPL v2 license.
 */
/*
 * Safety factor taking into account internal kernel interrupt latency.
 * Assuming a worst-case latency of 250ms.
 */
#define MAX_SYSTEM_LATENCY	250

/*
 * Maximum delta expected between trace clocks. At most 1 jiffy delta.
 */
#define MAX_CLOCK_DELTA		(jiffies_to_usecs(1) * 1000)
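/*
 * Example (for illustration): jiffies_to_usecs(1) is the length of one
 * jiffy in microseconds, so MAX_CLOCK_DELTA is one jiffy expressed in
 * nanoseconds. With HZ=250, this is 4000 us * 1000 = 4,000,000 ns (4 ms).
 */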
/**
 * lib_ring_buffer_get_next_record - Get the next record in a buffer.
 * @chan: channel
 * @buf: buffer
 *
 * Returns the size of the event read, -EAGAIN if buffer is empty, -ENODATA if
 * buffer is empty and finalized. The buffer must already be opened for
 * reading.
 */
ssize_t lib_ring_buffer_get_next_record(struct channel *chan,
					struct lib_ring_buffer *buf)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer_iter *iter = &buf->iter;
	int ret;
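	/*
	 * Overview of the state machine below: ITER_GET_SUBBUF grabs the
	 * next sub-buffer for reading, ITER_TEST_RECORD checks whether a
	 * record remains in the current sub-buffer, ITER_NEXT_RECORD skips
	 * past the record that was just delivered, and ITER_PUT_SUBBUF
	 * releases an exhausted sub-buffer before fetching the next one.
	 * States chain through "goto restart" until a record is delivered
	 * or an error is returned.
	 */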
restart:
	switch (iter->state) {
	case ITER_GET_SUBBUF:
		ret = lib_ring_buffer_get_next_subbuf(buf);
		if (ret && !ACCESS_ONCE(buf->finalized)
		    && config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
			/*
			 * Use "pull" scheme for global buffers. The reader
			 * itself flushes the buffer to "pull" data not visible
			 * to readers yet. Flush current subbuffer and re-try.
			 *
			 * Per-CPU buffers rather use a "push" scheme, because
			 * the IPI needed to flush all CPUs' buffers is too
			 * costly. In the "push" scheme, the reader waits for
			 * the writer's periodic deferrable timer to flush the
			 * buffers (keeping track of a quiescent state
			 * timestamp). Therefore, the writer "pushes" data out
			 * of the buffers rather than letting the reader "pull"
			 * data from the buffer.
			 */
			lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
			ret = lib_ring_buffer_get_next_subbuf(buf);
		}
		if (ret)
			return ret;
		iter->consumed = buf->cons_snapshot;
		iter->data_size = lib_ring_buffer_get_read_data_size(config, buf);
		iter->read_offset = iter->consumed;
		/* skip the sub-buffer header */
		iter->read_offset += config->cb.subbuffer_header_size();
		iter->state = ITER_TEST_RECORD;
		goto restart;
	case ITER_TEST_RECORD:
		if (iter->read_offset - iter->consumed >= iter->data_size) {
			iter->state = ITER_PUT_SUBBUF;
		} else {
			CHAN_WARN_ON(chan, !config->cb.record_get);
			config->cb.record_get(config, chan, buf,
					      iter->read_offset,
					      &iter->header_len,
					      &iter->payload_len,
					      &iter->timestamp);
			iter->read_offset += iter->header_len;
			subbuffer_consume_record(config, &buf->backend);
			iter->state = ITER_NEXT_RECORD;
			return iter->payload_len;
		}
		goto restart;
	case ITER_NEXT_RECORD:
		iter->read_offset += iter->payload_len;
		iter->state = ITER_TEST_RECORD;
		goto restart;
	case ITER_PUT_SUBBUF:
		lib_ring_buffer_put_next_subbuf(buf);
		iter->state = ITER_GET_SUBBUF;
		goto restart;
	default:
		CHAN_WARN_ON(chan, 1);	/* Should not happen */
		return -EPERM;
	}
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_get_next_record);
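/*
 * Usage sketch (illustrative, not from the original source): a reader of a
 * single buffer typically loops until the buffer is finalized:
 *
 *	for (;;) {
 *		ssize_t len = lib_ring_buffer_get_next_record(chan, buf);
 *
 *		if (len == -EAGAIN)
 *			continue;		// or sleep on buf->read_wait
 *		if (len == -ENODATA)
 *			break;			// buffer finalized, end of data
 *		// "len" payload bytes are now readable at offset
 *		// buf->iter.read_offset, as the file read() code below does
 *		// with __lib_ring_buffer_copy_to_user().
 *	}
 */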
static int buf_is_higher(void *a, void *b)
{
	struct lib_ring_buffer *bufa = a;
	struct lib_ring_buffer *bufb = b;

	/* Consider lowest timestamps to be at the top of the heap */
	return (bufa->iter.timestamp < bufb->iter.timestamp);
}
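/*
 * Because the comparison is inverted, lttng_heap_maximum() hands back the
 * buffer whose current record has the lowest (oldest) timestamp: the
 * max-heap API behaves as a min-heap on timestamps, which is exactly what
 * the timestamp-ordered merge in channel_get_next_record() needs.
 */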
static
void lib_ring_buffer_get_empty_buf_records(const struct lib_ring_buffer_config *config,
					   struct channel *chan)
{
	struct lttng_ptr_heap *heap = &chan->iter.heap;
	struct lib_ring_buffer *buf, *tmp;
	ssize_t len;

	list_for_each_entry_safe(buf, tmp, &chan->iter.empty_head,
				 iter.empty_node) {
		len = lib_ring_buffer_get_next_record(chan, buf);

		/*
		 * Deal with -EAGAIN and -ENODATA.
		 * len >= 0 means record contains data.
		 * -EBUSY should never happen, because we support only one
		 * reader.
		 */
		switch (len) {
		case -EAGAIN:
			/* Keep node in empty list */
			break;
		case -ENODATA:
			/*
			 * Buffer is finalized. Don't add to list of empty
			 * buffers, because it has no more data to provide,
			 * ever.
			 */
			list_del(&buf->iter.empty_node);
			break;
		case -EBUSY:
			CHAN_WARN_ON(chan, 1);
			break;
		default:
			/*
			 * Insert buffer into the heap, remove from empty
			 * buffer list.
			 */
			CHAN_WARN_ON(chan, len < 0);
			list_del(&buf->iter.empty_node);
			CHAN_WARN_ON(chan, lttng_heap_insert(heap, buf));
			break;
		}
	}
}
static
void lib_ring_buffer_wait_for_qs(const struct lib_ring_buffer_config *config,
				 struct channel *chan)
{
	u64 timestamp_qs;
	unsigned long wait_msecs;

	/*
	 * No need to wait if no empty buffers are present.
	 */
	if (list_empty(&chan->iter.empty_head))
		return;

	timestamp_qs = config->cb.ring_buffer_clock_read(chan);
	/*
	 * We need to consider previously empty buffers.
	 * Do a "get next record" on each of them. Add them to the heap
	 * if they have data. If at least one of them does not have data,
	 * we need to wait for switch_timer_interval + MAX_SYSTEM_LATENCY
	 * (so we are sure the buffers have been switched either by the
	 * timer or by idle entry) and check them again, adding them if
	 * they have data.
	 */
	lib_ring_buffer_get_empty_buf_records(config, chan);

	/*
	 * No need to wait if no empty buffers are present.
	 */
	if (list_empty(&chan->iter.empty_head))
		goto end;

	/*
	 * We need to wait for the buffer switch timer to run. If the
	 * CPU is idle, idle entry performed the switch.
	 * TODO: we could optimize further by skipping the sleep if all
	 * empty buffers belong to idle or offline cpus.
	 */
	wait_msecs = jiffies_to_msecs(chan->switch_timer_interval);
	wait_msecs += MAX_SYSTEM_LATENCY;
	msleep(wait_msecs);
	lib_ring_buffer_get_empty_buf_records(config, chan);
	/*
	 * Any buffer still in the empty list here cannot possibly
	 * contain an event with a timestamp prior to "timestamp_qs".
	 * The new quiescent state timestamp is the one we grabbed
	 * before waiting for buffer data. It is therefore safe to
	 * ignore empty buffers up to last_qs timestamp for fusion
	 * merge.
	 */
end:
	chan->iter.last_qs = timestamp_qs;
}
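/*
 * After this returns, any buffer still on the empty list is guaranteed not
 * to hold a record older than last_qs, so the merge in
 * channel_get_next_record() can deliver records up to that timestamp
 * without consulting the empty buffers.
 */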
/**
 * channel_get_next_record - Get the next record in a channel.
 * @chan: channel
 * @ret_buf: the buffer in which the event is located (output)
 *
 * Returns the size of new current event, -EAGAIN if all buffers are empty,
 * -ENODATA if all buffers are empty and finalized. The channel must already
 * be opened for reading.
 */
ssize_t channel_get_next_record(struct channel *chan,
				struct lib_ring_buffer **ret_buf)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	struct lttng_ptr_heap *heap;
	ssize_t len;

	if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
		*ret_buf = channel_get_ring_buffer(config, chan, 0);
		return lib_ring_buffer_get_next_record(chan, *ret_buf);
	}

	heap = &chan->iter.heap;

	/*
	 * Get the next record for the topmost buffer.
	 */
	buf = lttng_heap_maximum(heap);
	if (buf) {
		len = lib_ring_buffer_get_next_record(chan, buf);
		/*
		 * Deal with -EAGAIN and -ENODATA.
		 * len >= 0 means record contains data.
		 */
		switch (len) {
		case -EAGAIN:
			buf->iter.timestamp = 0;
			list_add(&buf->iter.empty_node, &chan->iter.empty_head);
			/* Remove topmost buffer from the heap */
			CHAN_WARN_ON(chan, lttng_heap_remove(heap) != buf);
			break;
		case -ENODATA:
			/*
			 * Buffer is finalized. Remove buffer from heap and
			 * don't add to list of empty buffers, because it has
			 * no more data to provide, ever.
			 */
			CHAN_WARN_ON(chan, lttng_heap_remove(heap) != buf);
			break;
		case -EBUSY:
			CHAN_WARN_ON(chan, 1);
			break;
		default:
			/*
			 * Reinsert buffer into the heap. Note that heap can be
			 * partially empty, so we need to use
			 * lttng_heap_replace_max().
			 */
			CHAN_WARN_ON(chan, len < 0);
			CHAN_WARN_ON(chan, lttng_heap_replace_max(heap, buf)
					   != buf);
			break;
		}
	}
	buf = lttng_heap_maximum(heap);
	if (!buf || buf->iter.timestamp > chan->iter.last_qs) {
		/*
		 * Deal with buffers previously showing no data.
		 * Add buffers containing data to the heap, update
		 * last_qs.
		 */
		lib_ring_buffer_wait_for_qs(config, chan);
	}

	*ret_buf = buf = lttng_heap_maximum(heap);
	if (buf) {
		/*
		 * If this warning triggers, you probably need to check your
		 * system interrupt latency. Typical causes: too much printk()
		 * output going to a serial console with interrupts off.
		 * Allow for MAX_CLOCK_DELTA ns of timestamp delta going
		 * backward. Observed on SMP KVM setups with trace_clock().
		 */
		if (chan->iter.last_timestamp
		    > (buf->iter.timestamp + MAX_CLOCK_DELTA)) {
			printk(KERN_WARNING "ring_buffer: timestamps going "
			       "backward. Last time %llu ns, cpu %d, "
			       "current time %llu ns, cpu %d, "
			       "delta %llu ns.\n",
			       chan->iter.last_timestamp, chan->iter.last_cpu,
			       buf->iter.timestamp, buf->backend.cpu,
			       chan->iter.last_timestamp - buf->iter.timestamp);
			CHAN_WARN_ON(chan, 1);
		}
		chan->iter.last_timestamp = buf->iter.timestamp;
		chan->iter.last_cpu = buf->backend.cpu;
		return buf->iter.payload_len;
	} else {
		/* Heap is empty */
		if (list_empty(&chan->iter.empty_head))
			return -ENODATA;	/* All buffers finalized */
		else
			return -EAGAIN;		/* Temporarily empty */
	}
}
EXPORT_SYMBOL_GPL(channel_get_next_record);
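/*
 * Usage sketch (illustrative, not from the original source): merge-reading
 * a per-cpu channel in timestamp order:
 *
 *	struct lib_ring_buffer *buf;
 *	ssize_t len;
 *
 *	len = channel_get_next_record(chan, &buf);
 *	if (len >= 0) {
 *		// The oldest available record across all per-cpu buffers is
 *		// now current in "buf"; its payload is "len" bytes long.
 *	}
 *	// -EAGAIN: all buffers temporarily empty; -ENODATA: all finalized.
 */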
static
void lib_ring_buffer_iterator_init(struct channel *chan, struct lib_ring_buffer *buf)
{
	if (buf->iter.allocated)
		return;

	buf->iter.allocated = 1;
	if (chan->iter.read_open && !buf->iter.read_open) {
		CHAN_WARN_ON(chan, lib_ring_buffer_open_read(buf) != 0);
		buf->iter.read_open = 1;
	}

	/* Add to list of buffers without any current record */
	if (chan->backend.config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);
}
#ifdef CONFIG_HOTPLUG_CPU
static
int __cpuinit channel_iterator_cpu_hotplug(struct notifier_block *nb,
					   unsigned long action,
					   void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct channel *chan = container_of(nb, struct channel,
					    hp_iter_notifier);
	struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (!chan->hp_iter_enable)
		return NOTIFY_DONE;

	CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);

	switch (action) {
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		lib_ring_buffer_iterator_init(chan, buf);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
#endif
int channel_iterator_init(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		int cpu, ret;

		INIT_LIST_HEAD(&chan->iter.empty_head);
		ret = lttng_heap_init(&chan->iter.heap,
				      num_possible_cpus(),
				      GFP_KERNEL, buf_is_higher);
		if (ret)
			return ret;
		/*
		 * When CPU hotplug support is absent and the ring buffer is
		 * allocated by an early initcall, it will not be notified of
		 * secondary cpus. In that case, we need to allocate for all
		 * possible cpus.
		 */
#ifdef CONFIG_HOTPLUG_CPU
		chan->hp_iter_notifier.notifier_call =
			channel_iterator_cpu_hotplug;
		chan->hp_iter_notifier.priority = 10;
		register_cpu_notifier(&chan->hp_iter_notifier);
		get_online_cpus();
		for_each_online_cpu(cpu) {
			buf = per_cpu_ptr(chan->backend.buf, cpu);
			lib_ring_buffer_iterator_init(chan, buf);
		}
		chan->hp_iter_enable = 1;
		put_online_cpus();
#else
		for_each_possible_cpu(cpu) {
			buf = per_cpu_ptr(chan->backend.buf, cpu);
			lib_ring_buffer_iterator_init(chan, buf);
		}
#endif
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		lib_ring_buffer_iterator_init(chan, buf);
	}
	return 0;
}
void channel_iterator_unregister_notifiers(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		chan->hp_iter_enable = 0;
		unregister_cpu_notifier(&chan->hp_iter_notifier);
	}
}
void channel_iterator_free(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		lttng_heap_free(&chan->iter.heap);
}
int lib_ring_buffer_iterator_open(struct lib_ring_buffer *buf)
{
	struct channel *chan = buf->backend.chan;
	const struct lib_ring_buffer_config *config = chan->backend.config;

	CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR);
	return lib_ring_buffer_open_read(buf);
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_open);
/*
 * Note: Iterators must not be mixed with other types of outputs, because an
 * iterator can leave the buffer in "GET" state, which is not consistent with
 * other types of output (mmap, splice, raw data read).
 */
void lib_ring_buffer_iterator_release(struct lib_ring_buffer *buf)
{
	lib_ring_buffer_release_read(buf);
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_release);
int channel_iterator_open(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	int ret = 0, cpu;

	CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR);

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		get_online_cpus();
		/* Allow CPU hotplug to keep track of opened reader */
		chan->iter.read_open = 1;
		for_each_channel_cpu(cpu, chan) {
			buf = channel_get_ring_buffer(config, chan, cpu);
			ret = lib_ring_buffer_iterator_open(buf);
			if (ret)
				goto error;
			buf->iter.read_open = 1;
		}
		put_online_cpus();
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		ret = lib_ring_buffer_iterator_open(buf);
	}
	return ret;
error:
	/* Error should always happen on CPU 0, hence no close is required. */
	CHAN_WARN_ON(chan, cpu != 0);
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL_GPL(channel_iterator_open);
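/*
 * Note: the library supports a single reader per buffer, which is why
 * -EBUSY "should never happen" in the record-getting paths above; opening
 * an iterator on a buffer that already has an active reader is expected to
 * fail in lib_ring_buffer_open_read().
 */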
void channel_iterator_release(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	int cpu;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		get_online_cpus();
		for_each_channel_cpu(cpu, chan) {
			buf = channel_get_ring_buffer(config, chan, cpu);
			if (buf->iter.read_open) {
				lib_ring_buffer_iterator_release(buf);
				buf->iter.read_open = 0;
			}
		}
		chan->iter.read_open = 0;
		put_online_cpus();
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		lib_ring_buffer_iterator_release(buf);
	}
}
EXPORT_SYMBOL_GPL(channel_iterator_release);
void lib_ring_buffer_iterator_reset(struct lib_ring_buffer *buf)
{
	struct channel *chan = buf->backend.chan;

	if (buf->iter.state != ITER_GET_SUBBUF)
		lib_ring_buffer_put_next_subbuf(buf);
	buf->iter.state = ITER_GET_SUBBUF;
	/* Remove from heap (if present). */
	if (lttng_heap_cherrypick(&chan->iter.heap, buf))
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);
	buf->iter.timestamp = 0;
	buf->iter.header_len = 0;
	buf->iter.payload_len = 0;
	buf->iter.consumed = 0;
	buf->iter.read_offset = 0;
	buf->iter.data_size = 0;
	/* Don't reset allocated and read_open */
}
void channel_iterator_reset(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	int cpu;

	/* Empty heap, put into empty_head */
	while ((buf = lttng_heap_remove(&chan->iter.heap)) != NULL)
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);

	for_each_channel_cpu(cpu, chan) {
		buf = channel_get_ring_buffer(config, chan, cpu);
		lib_ring_buffer_iterator_reset(buf);
	}
	/* Don't reset read_open */
	chan->iter.last_qs = 0;
	chan->iter.last_timestamp = 0;
	chan->iter.last_cpu = 0;
	chan->iter.len_left = 0;
}
/*
 * Ring buffer payload extraction read() implementation.
 */
static
ssize_t channel_ring_buffer_file_read(struct file *filp,
				      char __user *user_buf,
				      size_t count,
				      loff_t *ppos,
				      struct channel *chan,
				      struct lib_ring_buffer *buf,
				      int fusionmerge)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	size_t read_count = 0, read_offset;
	ssize_t len;

	might_sleep();
	if (!access_ok(VERIFY_WRITE, user_buf, count))
		return -EFAULT;

	/* Finish copy of previous record */
	if (*ppos != 0) {
		if (read_count < count) {
			len = chan->iter.len_left;
			read_offset = *ppos;
			if (config->alloc == RING_BUFFER_ALLOC_PER_CPU
			    && fusionmerge)
				buf = lttng_heap_maximum(&chan->iter.heap);
			CHAN_WARN_ON(chan, !buf);
			goto skip_get_next;
		}
	}

	while (read_count < count) {
		size_t copy_len, space_left;

		if (fusionmerge)
			len = channel_get_next_record(chan, &buf);
		else
			len = lib_ring_buffer_get_next_record(chan, buf);
check_len:
		if (len < 0) {
			/*
			 * Check if buffer is finalized (end of file).
			 */
			if (len == -ENODATA) {
				/* A 0 read_count will tell about end of file */
				goto nodata;
			}
			if (filp->f_flags & O_NONBLOCK) {
				if (!read_count)
					read_count = -EAGAIN;
				goto nodata;
			} else {
				int error;

				/*
				 * No data available at the moment, return what
				 * we got.
				 */
				if (read_count)
					goto nodata;

				/*
				 * Wait for returned len to be >= 0 or
				 * -ENODATA.
				 */
				if (fusionmerge)
					error = wait_event_interruptible(
						chan->read_wait,
						((len = channel_get_next_record(chan,
							&buf)), len != -EAGAIN));
				else
					error = wait_event_interruptible(
						buf->read_wait,
						((len = lib_ring_buffer_get_next_record(
							chan, buf)), len != -EAGAIN));
				CHAN_WARN_ON(chan, len == -EBUSY);
				if (error) {
					read_count = error;
					goto nodata;
				}
				CHAN_WARN_ON(chan, len < 0 && len != -ENODATA);
				/* Re-check len after the blocking wait. */
				goto check_len;
			}
		}
		read_offset = buf->iter.read_offset;
skip_get_next:
		space_left = count - read_count;
		if (len <= space_left) {
			copy_len = len;
			chan->iter.len_left = 0;
			*ppos = 0;
		} else {
			copy_len = space_left;
			chan->iter.len_left = len - copy_len;
			*ppos = read_offset + copy_len;
		}
		if (__lib_ring_buffer_copy_to_user(&buf->backend, read_offset,
						   &user_buf[read_count],
						   copy_len)) {
			/*
			 * Leave the len_left and ppos values at their current
			 * state, as we currently have a valid event to read.
			 */
			return -EFAULT;
		}
		read_count += copy_len;
	}
	return read_count;

nodata:
	*ppos = 0;
	chan->iter.len_left = 0;
	return read_count;
}
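/*
 * Partial-record example (illustrative): for a 100-byte record and a
 * read(2) with count=64, the first call copies 64 bytes, saves the resume
 * position in *ppos and sets len_left=36; the next call takes the "finish
 * copy of previous record" path above and returns the remaining 36 bytes.
 */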
/**
 * lib_ring_buffer_file_read - Read buffer record payload.
 * @filp: file structure pointer.
 * @user_buf: user buffer to read data into.
 * @count: number of bytes to read.
 * @ppos: file read position.
 *
 * Returns a negative value on error, or the number of bytes read on success.
 * ppos is used to save the position _within the current record_ between calls
 * to read().
 */
static
ssize_t lib_ring_buffer_file_read(struct file *filp,
				  char __user *user_buf,
				  size_t count,
				  loff_t *ppos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct lib_ring_buffer *buf = inode->i_private;
	struct channel *chan = buf->backend.chan;

	return channel_ring_buffer_file_read(filp, user_buf, count, ppos,
					     chan, buf, 0);
}
/**
 * channel_file_read - Read channel record payload.
 * @filp: file structure pointer.
 * @user_buf: user buffer to read data into.
 * @count: number of bytes to read.
 * @ppos: file read position.
 *
 * Returns a negative value on error, or the number of bytes read on success.
 * ppos is used to save the position _within the current record_ between calls
 * to read().
 */
static
ssize_t channel_file_read(struct file *filp,
			  char __user *user_buf,
			  size_t count,
			  loff_t *ppos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct channel *chan = inode->i_private;
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		return channel_ring_buffer_file_read(filp, user_buf, count,
						     ppos, chan, NULL, 1);
	else {
		struct lib_ring_buffer *buf =
			channel_get_ring_buffer(config, chan, 0);

		return channel_ring_buffer_file_read(filp, user_buf, count,
						     ppos, chan, buf, 0);
	}
}
static
int lib_ring_buffer_file_open(struct inode *inode, struct file *file)
{
	struct lib_ring_buffer *buf = inode->i_private;
	int ret;

	ret = lib_ring_buffer_iterator_open(buf);
	if (ret)
		return ret;

	file->private_data = buf;
	ret = nonseekable_open(inode, file);
	if (ret)
		goto release_iter;
	return 0;

release_iter:
	lib_ring_buffer_iterator_release(buf);
	return ret;
}
static
int lib_ring_buffer_file_release(struct inode *inode, struct file *file)
{
	struct lib_ring_buffer *buf = inode->i_private;

	lib_ring_buffer_iterator_release(buf);
	return 0;
}
static
int channel_file_open(struct inode *inode, struct file *file)
{
	struct channel *chan = inode->i_private;
	int ret;

	ret = channel_iterator_open(chan);
	if (ret)
		return ret;

	file->private_data = chan;
	ret = nonseekable_open(inode, file);
	if (ret)
		goto release_iter;
	return 0;

release_iter:
	channel_iterator_release(chan);
	return ret;
}
static
int channel_file_release(struct inode *inode, struct file *file)
{
	struct channel *chan = inode->i_private;

	channel_iterator_release(chan);
	return 0;
}
const struct file_operations channel_payload_file_operations = {
	.open = channel_file_open,
	.release = channel_file_release,
	.read = channel_file_read,
	.llseek = lib_ring_buffer_no_llseek,
};
EXPORT_SYMBOL_GPL(channel_payload_file_operations);
const struct file_operations lib_ring_buffer_payload_file_operations = {
	.open = lib_ring_buffer_file_open,
	.release = lib_ring_buffer_file_release,
	.read = lib_ring_buffer_file_read,
	.llseek = lib_ring_buffer_no_llseek,
};
EXPORT_SYMBOL_GPL(lib_ring_buffer_payload_file_operations);
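/*
 * Wiring sketch (illustrative, not from the original source): a channel's
 * merged payload can be exposed through debugfs. The file's i_private must
 * point to the struct channel, since channel_file_open() reads it from
 * inode->i_private; "chan" and "parent" below are placeholders:
 *
 *	debugfs_create_file("payload", S_IRUSR, parent, chan,
 *			    &channel_payload_file_operations);
 */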