The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark")
7e9fbbb1b776 ("ring-buffer: Add ring_buffer_wake_waiters()")
13292494379f ("tracing: Make struct ring_buffer less ambiguous")
1c5eb4481e01 ("tracing: Rename trace_buffer to array_buffer")
2d6425af6116 ("tracing: Declare newly exported APIs in include/linux/trace.h")
a47b53e95acc ("tracing: Rename tracing_reset() to tracing_reset_cpu()")
46cc0b44428d ("tracing/snapshot: Resize spare buffer if size changed")
d2d8b146043a ("Merge tag 'trace-v5.2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 42fb0a1e84ff525ebe560e2baf9451ab69127e2b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Date: Thu, 20 Oct 2022 23:14:27 -0400
Subject: [PATCH] tracing/ring-buffer: Have polling block on watermark
Currently the way polling works on the ring buffer is broken. It will
return immediately if there's any data in the ring buffer whereas a read
will block until the watermark (defined by the tracefs buffer_percent file)
is hit.
That is, a select() or poll() will return as if there's data available,
but then the following read will block. This is broken for the way
select()s and poll()s are supposed to work.
Have the polling on the ring buffer also block the same way reads and
splice does on the ring buffer.
Link: https://lkml.kernel.org/r/20221020231427.41be3f26@gandalf.local.home
Cc: Linux Trace Kernel <linux-trace-kernel(a)vger.kernel.org>
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Primiano Tucci <primiano(a)google.com>
Cc: stable(a)vger.kernel.org
Fixes: 1e0d6714aceb7 ("ring-buffer: Do not wake up a splice waiter when page is not full")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 2504df9a0453..3c7d295746f6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -100,7 +100,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table);
+ struct file *filp, poll_table *poll_table, int full);
void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
#define RING_BUFFER_ALL_CPUS -1
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9712083832f4..089b1ec9cb3b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -907,6 +907,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
return cnt - read;
}
+static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ size_t nr_pages;
+ size_t dirty;
+
+ nr_pages = cpu_buffer->nr_pages;
+ if (!nr_pages || !full)
+ return true;
+
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+
+ return (dirty * 100) > (full * nr_pages);
+}
+
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
@@ -1046,22 +1061,20 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
!ring_buffer_empty_cpu(buffer, cpu)) {
unsigned long flags;
bool pagebusy;
- size_t nr_pages;
- size_t dirty;
+ bool done;
if (!full)
break;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ done = !pagebusy && full_hit(buffer, cpu, full);
+
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (!pagebusy &&
- (!nr_pages || (dirty * 100) > full * nr_pages))
+ if (done)
break;
}
@@ -1087,6 +1100,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -1096,14 +1110,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* zero otherwise.
*/
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table)
+ struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
- if (cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
- else {
+ full = 0;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
@@ -1111,8 +1126,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ if (full) {
+ poll_wait(filp, &work->full_waiters, poll_table);
+ work->full_waiters_pending = true;
+ } else {
+ poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ }
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -1128,6 +1149,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
+ if (full)
+ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
+
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -3155,10 +3179,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
- size_t nr_pages;
- size_t dirty;
- size_t full;
-
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -3182,10 +3202,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
- full = cpu_buffer->shortest_full;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
- if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
return;
cpu_buffer->irq_work.wakeup_full = true;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 47a44b055a1d..c6c7a0af3ed2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6681,7 +6681,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
return EPOLLIN | EPOLLRDNORM;
else
return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
- filp, poll_table);
+ filp, poll_table, iter->tr->buffer_percent);
}
static __poll_t
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark")
7e9fbbb1b776 ("ring-buffer: Add ring_buffer_wake_waiters()")
13292494379f ("tracing: Make struct ring_buffer less ambiguous")
1c5eb4481e01 ("tracing: Rename trace_buffer to array_buffer")
2d6425af6116 ("tracing: Declare newly exported APIs in include/linux/trace.h")
a47b53e95acc ("tracing: Rename tracing_reset() to tracing_reset_cpu()")
46cc0b44428d ("tracing/snapshot: Resize spare buffer if size changed")
d2d8b146043a ("Merge tag 'trace-v5.2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 42fb0a1e84ff525ebe560e2baf9451ab69127e2b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Date: Thu, 20 Oct 2022 23:14:27 -0400
Subject: [PATCH] tracing/ring-buffer: Have polling block on watermark
Currently the way polling works on the ring buffer is broken. It will
return immediately if there's any data in the ring buffer whereas a read
will block until the watermark (defined by the tracefs buffer_percent file)
is hit.
That is, a select() or poll() will return as if there's data available,
but then the following read will block. This is broken for the way
select()s and poll()s are supposed to work.
Have the polling on the ring buffer also block the same way reads and
splice does on the ring buffer.
Link: https://lkml.kernel.org/r/20221020231427.41be3f26@gandalf.local.home
Cc: Linux Trace Kernel <linux-trace-kernel(a)vger.kernel.org>
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Primiano Tucci <primiano(a)google.com>
Cc: stable(a)vger.kernel.org
Fixes: 1e0d6714aceb7 ("ring-buffer: Do not wake up a splice waiter when page is not full")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 2504df9a0453..3c7d295746f6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -100,7 +100,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table);
+ struct file *filp, poll_table *poll_table, int full);
void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
#define RING_BUFFER_ALL_CPUS -1
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9712083832f4..089b1ec9cb3b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -907,6 +907,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
return cnt - read;
}
+static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ size_t nr_pages;
+ size_t dirty;
+
+ nr_pages = cpu_buffer->nr_pages;
+ if (!nr_pages || !full)
+ return true;
+
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+
+ return (dirty * 100) > (full * nr_pages);
+}
+
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
@@ -1046,22 +1061,20 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
!ring_buffer_empty_cpu(buffer, cpu)) {
unsigned long flags;
bool pagebusy;
- size_t nr_pages;
- size_t dirty;
+ bool done;
if (!full)
break;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ done = !pagebusy && full_hit(buffer, cpu, full);
+
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (!pagebusy &&
- (!nr_pages || (dirty * 100) > full * nr_pages))
+ if (done)
break;
}
@@ -1087,6 +1100,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -1096,14 +1110,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* zero otherwise.
*/
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table)
+ struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
- if (cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
- else {
+ full = 0;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
@@ -1111,8 +1126,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ if (full) {
+ poll_wait(filp, &work->full_waiters, poll_table);
+ work->full_waiters_pending = true;
+ } else {
+ poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ }
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -1128,6 +1149,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
+ if (full)
+ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
+
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -3155,10 +3179,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
- size_t nr_pages;
- size_t dirty;
- size_t full;
-
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -3182,10 +3202,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
- full = cpu_buffer->shortest_full;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
- if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
return;
cpu_buffer->irq_work.wakeup_full = true;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 47a44b055a1d..c6c7a0af3ed2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6681,7 +6681,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
return EPOLLIN | EPOLLRDNORM;
else
return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
- filp, poll_table);
+ filp, poll_table, iter->tr->buffer_percent);
}
static __poll_t
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark")
7e9fbbb1b776 ("ring-buffer: Add ring_buffer_wake_waiters()")
13292494379f ("tracing: Make struct ring_buffer less ambiguous")
1c5eb4481e01 ("tracing: Rename trace_buffer to array_buffer")
2d6425af6116 ("tracing: Declare newly exported APIs in include/linux/trace.h")
a47b53e95acc ("tracing: Rename tracing_reset() to tracing_reset_cpu()")
46cc0b44428d ("tracing/snapshot: Resize spare buffer if size changed")
d2d8b146043a ("Merge tag 'trace-v5.2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 42fb0a1e84ff525ebe560e2baf9451ab69127e2b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Date: Thu, 20 Oct 2022 23:14:27 -0400
Subject: [PATCH] tracing/ring-buffer: Have polling block on watermark
Currently the way polling works on the ring buffer is broken. It will
return immediately if there's any data in the ring buffer whereas a read
will block until the watermark (defined by the tracefs buffer_percent file)
is hit.
That is, a select() or poll() will return as if there's data available,
but then the following read will block. This is broken for the way
select()s and poll()s are supposed to work.
Have the polling on the ring buffer also block the same way reads and
splice does on the ring buffer.
Link: https://lkml.kernel.org/r/20221020231427.41be3f26@gandalf.local.home
Cc: Linux Trace Kernel <linux-trace-kernel(a)vger.kernel.org>
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Primiano Tucci <primiano(a)google.com>
Cc: stable(a)vger.kernel.org
Fixes: 1e0d6714aceb7 ("ring-buffer: Do not wake up a splice waiter when page is not full")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 2504df9a0453..3c7d295746f6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -100,7 +100,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table);
+ struct file *filp, poll_table *poll_table, int full);
void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
#define RING_BUFFER_ALL_CPUS -1
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9712083832f4..089b1ec9cb3b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -907,6 +907,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
return cnt - read;
}
+static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ size_t nr_pages;
+ size_t dirty;
+
+ nr_pages = cpu_buffer->nr_pages;
+ if (!nr_pages || !full)
+ return true;
+
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+
+ return (dirty * 100) > (full * nr_pages);
+}
+
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
@@ -1046,22 +1061,20 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
!ring_buffer_empty_cpu(buffer, cpu)) {
unsigned long flags;
bool pagebusy;
- size_t nr_pages;
- size_t dirty;
+ bool done;
if (!full)
break;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ done = !pagebusy && full_hit(buffer, cpu, full);
+
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (!pagebusy &&
- (!nr_pages || (dirty * 100) > full * nr_pages))
+ if (done)
break;
}
@@ -1087,6 +1100,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
@@ -1096,14 +1110,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* zero otherwise.
*/
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
- struct file *filp, poll_table *poll_table)
+ struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
- if (cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
- else {
+ full = 0;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
@@ -1111,8 +1126,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ if (full) {
+ poll_wait(filp, &work->full_waiters, poll_table);
+ work->full_waiters_pending = true;
+ } else {
+ poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ }
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -1128,6 +1149,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
+ if (full)
+ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
+
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -3155,10 +3179,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
- size_t nr_pages;
- size_t dirty;
- size_t full;
-
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -3182,10 +3202,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
- full = cpu_buffer->shortest_full;
- nr_pages = cpu_buffer->nr_pages;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
- if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
return;
cpu_buffer->irq_work.wakeup_full = true;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 47a44b055a1d..c6c7a0af3ed2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6681,7 +6681,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
return EPOLLIN | EPOLLRDNORM;
else
return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
- filp, poll_table);
+ filp, poll_table, iter->tr->buffer_percent);
}
static __poll_t
This bug is marked as fixed by commit:
net: core: netlink: add helper refcount dec and lock function
net: sched: add helper function to take reference to Qdisc
net: sched: extend Qdisc with rcu
net: sched: rename qdisc_destroy() to qdisc_put()
net: sched: use Qdisc rcu API instead of relying on rtnl lock
But I can't find it in any tested tree for more than 90 days.
Is it a correct commit? Please update it by replying:
#syz fix: exact-commit-title
Until then the bug is still considered open and
new crashes with the same signature are ignored.
When extending segments, nilfs_sufile_alloc() is called to get an
unassigned segment, then mark it as dirty to avoid accidentally
allocating the same segment in the future.
But for some special cases such as a corrupted image it can be
unreliable.
If such corruption of the dirty state of the segment occurs, nilfs2 may
reallocate a segment that is in use and pick the same segment for
writing twice at the same time.
This will cause the problem reported by syzkaller:
https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd…
This case started with segbuf1.segnum = 3, nextnum = 4 when constructed.
It supposed segment 4 has already been allocated and marked as dirty.
However the dirty state was corrupted and segment 4 usage was not dirty.
For the first time nilfs_segctor_extend_segments() segment 4 was
allocated again, which made segbuf2 and next segbuf3 had same segment 4.
sb_getblk() will get same bh for segbuf2 and segbuf3, and this bh is
added to both buffer lists of two segbuf. It makes the lists broken
which causes NULL pointer dereference.
Fix the problem by setting usage as dirty every time in
nilfs_sufile_mark_dirty(), which is called during constructing current
segment to be written out and before allocating next segment.
Fixes: 9ff05123e3bf ("nilfs2: segment constructor")
Cc: stable(a)vger.kernel.org
Reported-by: syzbot+77e4f005cb899d4268d1(a)syzkaller.appspotmail.com
Reported-by: Liu Shixin <liushixin2(a)huawei.com>
Signed-off-by: Chen Zhongjin <chenzhongjin(a)huawei.com>
Acked-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
---
v1 -> v2:
1) Add lock protection as Ryusuke suggested and slightly fix commit
message.
2) Fix and add tags.
v2 -> v3:
Fix commit message to make it clear.
---
fs/nilfs2/sufile.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 77ff8e95421f..dc359b56fdfa 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
{
struct buffer_head *bh;
+ void *kaddr;
+ struct nilfs_segment_usage *su;
int ret;
+ down_write(&NILFS_MDT(sufile)->mi_sem);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
if (!ret) {
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
+ kaddr = kmap_atomic(bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ nilfs_segment_usage_set_dirty(su);
+ kunmap_atomic(kaddr);
brelse(bh);
}
+ up_write(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
--
2.17.1