When sysfs_create_files() fails in wacom_initialize_remotes() the error
is returned and the cleanup action will not have been registered yet.
As a result the kobject���s refcount is never dropped, so the
kobject can never be freed leading to a reference leak.
Fix this by calling kobject_put() before returning.
Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated")
Acked-by: Ping Cheng <ping.cheng(a)wacom.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Qasim Ijaz <qasdev00(a)gmail.com>
---
drivers/hid/wacom_sys.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c
index 58cbd43a37e9..1257131b1e34 100644
--- a/drivers/hid/wacom_sys.c
+++ b/drivers/hid/wacom_sys.c
@@ -2059,6 +2059,7 @@ static int wacom_initialize_remotes(struct wacom *wacom)
hid_err(wacom->hdev,
"cannot create sysfs group err: %d\n", error);
kfifo_free(&remote->remote_fifo);
+ kobject_put(remote->remote_dir);
return error;
}
--
2.39.5
During wacom_initialize_remotes() a fifo buffer is allocated
with kfifo_alloc() and later a cleanup action is registered
during devm_add_action_or_reset() to clean it up.
However if the code fails to create a kobject and register it
with sysfs the code simply returns -ENOMEM before the cleanup
action is registered leading to a memory leak.
Fix this by ensuring the fifo is freed when the kobject creation
and registration process fails.
Fixes: 83e6b40e2de6 ("HID: wacom: EKR: have the wacom resources dynamically allocated")
Reviewed-by: Ping Cheng <ping.cheng(a)wacom.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Qasim Ijaz <qasdev00(a)gmail.com>
---
drivers/hid/wacom_sys.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c
index eaf099b2efdb..ec5282bc69d6 100644
--- a/drivers/hid/wacom_sys.c
+++ b/drivers/hid/wacom_sys.c
@@ -2048,8 +2048,10 @@ static int wacom_initialize_remotes(struct wacom *wacom)
remote->remote_dir = kobject_create_and_add("wacom_remote",
&wacom->hdev->dev.kobj);
- if (!remote->remote_dir)
+ if (!remote->remote_dir) {
+ kfifo_free(&remote->remote_fifo);
return -ENOMEM;
+ }
error = sysfs_create_files(remote->remote_dir, remote_unpair_attrs);
--
2.39.5
From: Nícolas F. R. A. Prado <nfraprado(a)collabora.com>
commit 1c9977b263475373b31bbf86af94a5c9ae2be42c upstream.
Commit 3ef9f710efcb ("pinctrl: mediatek: Add EINT support for multiple
addresses") introduced an access to the 'soc' field of struct
mtk_pinctrl in mtk_eint_do_init() and for that an include of
pinctrl-mtk-common-v2.h.
However, pinctrl drivers relying on the v1 common driver include
pinctrl-mtk-common.h instead, which provides another definition of
struct mtk_pinctrl that does not contain an 'soc' field.
Since mtk_eint_do_init() can be called both by v1 and v2 drivers, it
will now try to dereference an invalid pointer when called on v1
platforms. This has been observed on Genio 350 EVK (MT8365), which
crashes very early in boot (the kernel trace can only be seen with
earlycon).
In order to fix this, since 'struct mtk_pinctrl' was only needed to get
a 'struct mtk_eint_pin', make 'struct mtk_eint_pin' a parameter
of mtk_eint_do_init() so that callers need to supply it, removing
mtk_eint_do_init()'s dependency on any particular 'struct mtk_pinctrl'.
Fixes: 3ef9f710efcb ("pinctrl: mediatek: Add EINT support for multiple addresses")
Suggested-by: AngeloGioacchino Del Regno <angelogioacchino.delregno(a)collabora.com>
Signed-off-by: Nícolas F. R. A. Prado <nfraprado(a)collabora.com>
Link: https://lore.kernel.org/20250520-genio-350-eint-null-ptr-deref-fix-v2-1-6a3…
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
[ukleinek: backport to 6.15.y]
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
---
Hello,
would be great to have this in 6.15. Further backporting isn't needed as
3ef9f710efcb == v6.15-rc1~106^2 isn't in 6.14.
This patch fixes booting on mt8365-evk (and probably a few more machines
based on mediatek SoCs.
There was an easy conflict with
86dee87f4b2e6ac119b03810e58723d0b27787a4 in
drivers/pinctrl/mediatek/mtk-eint.c.
Thanks
Uwe
drivers/pinctrl/mediatek/mtk-eint.c | 26 ++++++++-----------
drivers/pinctrl/mediatek/mtk-eint.h | 5 ++--
.../pinctrl/mediatek/pinctrl-mtk-common-v2.c | 2 +-
drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 2 +-
4 files changed, 16 insertions(+), 19 deletions(-)
diff --git a/drivers/pinctrl/mediatek/mtk-eint.c b/drivers/pinctrl/mediatek/mtk-eint.c
index b4eb2beab691..c516c34aaaf6 100644
--- a/drivers/pinctrl/mediatek/mtk-eint.c
+++ b/drivers/pinctrl/mediatek/mtk-eint.c
@@ -22,7 +22,6 @@
#include <linux/platform_device.h>
#include "mtk-eint.h"
-#include "pinctrl-mtk-common-v2.h"
#define MTK_EINT_EDGE_SENSITIVE 0
#define MTK_EINT_LEVEL_SENSITIVE 1
@@ -505,10 +504,9 @@ int mtk_eint_find_irq(struct mtk_eint *eint, unsigned long eint_n)
}
EXPORT_SYMBOL_GPL(mtk_eint_find_irq);
-int mtk_eint_do_init(struct mtk_eint *eint)
+int mtk_eint_do_init(struct mtk_eint *eint, struct mtk_eint_pin *eint_pin)
{
unsigned int size, i, port, inst = 0;
- struct mtk_pinctrl *hw = (struct mtk_pinctrl *)eint->pctl;
/* If clients don't assign a specific regs, let's use generic one */
if (!eint->regs)
@@ -519,7 +517,15 @@ int mtk_eint_do_init(struct mtk_eint *eint)
if (!eint->base_pin_num)
return -ENOMEM;
- if (eint->nbase == 1) {
+ if (eint_pin) {
+ eint->pins = eint_pin;
+ for (i = 0; i < eint->hw->ap_num; i++) {
+ inst = eint->pins[i].instance;
+ if (inst >= eint->nbase)
+ continue;
+ eint->base_pin_num[inst]++;
+ }
+ } else {
size = eint->hw->ap_num * sizeof(struct mtk_eint_pin);
eint->pins = devm_kmalloc(eint->dev, size, GFP_KERNEL);
if (!eint->pins)
@@ -533,16 +539,6 @@ int mtk_eint_do_init(struct mtk_eint *eint)
}
}
- if (hw && hw->soc && hw->soc->eint_pin) {
- eint->pins = hw->soc->eint_pin;
- for (i = 0; i < eint->hw->ap_num; i++) {
- inst = eint->pins[i].instance;
- if (inst >= eint->nbase)
- continue;
- eint->base_pin_num[inst]++;
- }
- }
-
eint->pin_list = devm_kmalloc(eint->dev, eint->nbase * sizeof(u16 *), GFP_KERNEL);
if (!eint->pin_list)
goto err_pin_list;
@@ -610,7 +606,7 @@ int mtk_eint_do_init(struct mtk_eint *eint)
err_wake_mask:
devm_kfree(eint->dev, eint->pin_list);
err_pin_list:
- if (eint->nbase == 1)
+ if (!eint_pin)
devm_kfree(eint->dev, eint->pins);
err_pins:
devm_kfree(eint->dev, eint->base_pin_num);
diff --git a/drivers/pinctrl/mediatek/mtk-eint.h b/drivers/pinctrl/mediatek/mtk-eint.h
index f7f58cca0d5e..23801d4b636f 100644
--- a/drivers/pinctrl/mediatek/mtk-eint.h
+++ b/drivers/pinctrl/mediatek/mtk-eint.h
@@ -88,7 +88,7 @@ struct mtk_eint {
};
#if IS_ENABLED(CONFIG_EINT_MTK)
-int mtk_eint_do_init(struct mtk_eint *eint);
+int mtk_eint_do_init(struct mtk_eint *eint, struct mtk_eint_pin *eint_pin);
int mtk_eint_do_suspend(struct mtk_eint *eint);
int mtk_eint_do_resume(struct mtk_eint *eint);
int mtk_eint_set_debounce(struct mtk_eint *eint, unsigned long eint_n,
@@ -96,7 +96,8 @@ int mtk_eint_set_debounce(struct mtk_eint *eint, unsigned long eint_n,
int mtk_eint_find_irq(struct mtk_eint *eint, unsigned long eint_n);
#else
-static inline int mtk_eint_do_init(struct mtk_eint *eint)
+static inline int mtk_eint_do_init(struct mtk_eint *eint,
+ struct mtk_eint_pin *eint_pin)
{
return -EOPNOTSUPP;
}
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
index d1556b75d9ef..ba13558bfcd7 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
@@ -416,7 +416,7 @@ int mtk_build_eint(struct mtk_pinctrl *hw, struct platform_device *pdev)
hw->eint->pctl = hw;
hw->eint->gpio_xlate = &mtk_eint_xt;
- ret = mtk_eint_do_init(hw->eint);
+ ret = mtk_eint_do_init(hw->eint, hw->soc->eint_pin);
if (ret)
goto err_free_eint;
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
index 8596f3541265..7289648eaa02 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
@@ -1039,7 +1039,7 @@ static int mtk_eint_init(struct mtk_pinctrl *pctl, struct platform_device *pdev)
pctl->eint->pctl = pctl;
pctl->eint->gpio_xlate = &mtk_eint_xt;
- return mtk_eint_do_init(pctl->eint);
+ return mtk_eint_do_init(pctl->eint, NULL);
}
/* This is used as a common probe function */
--
2.47.2
From: Steven Rostedt <rostedt(a)goodmis.org>
When faultable trace events were added, a trace event may no longer use
normal RCU to synchronize but instead used synchronize_rcu_tasks_trace().
This synchronization takes a much longer time to synchronize.
The filter logic would free the filters by calling
tracepoint_synchronize_unregister() after it unhooked the filter strings
and before freeing them. With this function now calling
synchronize_rcu_tasks_trace() this increased the time to free a filter
tremendously. On a PREEMPT_RT system, it was even more noticeable.
# time trace-cmd record -p function sleep 1
[..]
real 2m29.052s
user 0m0.244s
sys 0m20.136s
As trace-cmd would clear out all the filters before recording, it could
take up to 2 minutes to do a recording of "sleep 1".
To find out where the issues was:
~# trace-cmd sqlhist -e -n sched_stack select start.prev_state as state, end.next_comm as comm, TIMESTAMP_DELTA_USECS as delta, start.STACKTRACE as stack from sched_switch as start join sched_switch as end on start.prev_pid = end.next_pid
Which will produce the following commands (and -e will also execute them):
echo 's:sched_stack s64 state; char comm[16]; u64 delta; unsigned long stack[];' >> /sys/kernel/tracing/dynamic_events
echo 'hist:keys=prev_pid:__arg_18057_2=prev_state,__arg_18057_4=common_timestamp.usecs,__arg_18057_7=common_stacktrace' >> /sys/kernel/tracing/events/sched/sched_switch/trigger
echo 'hist:keys=next_pid:__state_18057_1=$__arg_18057_2,__comm_18057_3=next_comm,__delta_18057_5=common_timestamp.usecs-$__arg_18057_4,__stack_18057_6=$__arg_18057_7:onmatch(sched.sched_switch).trace(sched_stack,$__state_18057_1,$__comm_18057_3,$__delta_18057_5,$__stack_18057_6)' >> /sys/kernel/tracing/events/sched/sched_switch/trigger
The above creates a synthetic event that creates a stack trace when a task
schedules out and records it with the time it scheduled back in. Basically
the time a task is off the CPU. It also records the state of the task when
it left the CPU (running, blocked, sleeping, etc). It also saves the comm
of the task as "comm" (needed for the next command).
~# echo 'hist:keys=state,stack.stacktrace:vals=delta:sort=state,delta if comm == "trace-cmd" && state & 3' > /sys/kernel/tracing/events/synthetic/sched_stack/trigger
The above creates a histogram with buckets per state, per stack, and the
value of the total time it was off the CPU for that stack trace. It filters
on tasks with "comm == trace-cmd" and only the sleeping and blocked states
(1 - sleeping, 2 - blocked).
~# trace-cmd record -p function sleep 1
~# cat /sys/kernel/tracing/events/synthetic/sched_stack/hist | tail -18
{ state: 2, stack.stacktrace __schedule+0x1545/0x3700
schedule+0xe2/0x390
schedule_timeout+0x175/0x200
wait_for_completion_state+0x294/0x440
__wait_rcu_gp+0x247/0x4f0
synchronize_rcu_tasks_generic+0x151/0x230
apply_subsystem_event_filter+0xa2b/0x1300
subsystem_filter_write+0x67/0xc0
vfs_write+0x1e2/0xeb0
ksys_write+0xff/0x1d0
do_syscall_64+0x7b/0x420
entry_SYSCALL_64_after_hwframe+0x76/0x7e
} hitcount: 237 delta: 99756288 <<--------------- Delta is 99 seconds!
Totals:
Hits: 525
Entries: 21
Dropped: 0
This shows that this particular trace waited for 99 seconds on
synchronize_rcu_tasks() in apply_subsystem_event_filter().
In fact, there's a lot of places in the filter code that spends a lot of
time waiting for synchronize_rcu_tasks_trace() in order to free the
filters.
Add helper functions that will use call_rcu*() variants to asynchronously
free the filters. This brings the timings back to normal:
# time trace-cmd record -p function sleep 1
[..]
real 0m14.681s
user 0m0.335s
sys 0m28.616s
And the histogram also shows this:
~# cat /sys/kernel/tracing/events/synthetic/sched_stack/hist | tail -21
{ state: 2, stack.stacktrace __schedule+0x1545/0x3700
schedule+0xe2/0x390
schedule_timeout+0x175/0x200
wait_for_completion_state+0x294/0x440
__wait_rcu_gp+0x247/0x4f0
synchronize_rcu_normal+0x3db/0x5c0
tracing_reset_online_cpus+0x8f/0x1e0
tracing_open+0x335/0x440
do_dentry_open+0x4c6/0x17a0
vfs_open+0x82/0x360
path_openat+0x1a36/0x2990
do_filp_open+0x1c5/0x420
do_sys_openat2+0xed/0x180
__x64_sys_openat+0x108/0x1d0
do_syscall_64+0x7b/0x420
} hitcount: 2 delta: 77044
Totals:
Hits: 55
Entries: 28
Dropped: 0
Where the total waiting time of synchronize_rcu_tasks_trace() is 77
milliseconds.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: "Kiszka, Jan" <jan.kiszka(a)siemens.com>
Cc: "Ziegler, Andreas" <ziegler.andreas(a)siemens.com>
Cc: "MOESSBAUER, Felix" <felix.moessbauer(a)siemens.com>
Link: https://lore.kernel.org/20250605161701.35f7989a@gandalf.local.home
Reported-by: "Flot, Julien" <julien.flot(a)siemens.com>
Tested-by: Julien Flot <julien.flot(a)siemens.com>
Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults")
Closes: https://lore.kernel.org/all/240017f656631c7dd4017aa93d91f41f653788ea.camel@…
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace_events_filter.c | 164 ++++++++++++++++++++++-------
1 file changed, 127 insertions(+), 37 deletions(-)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2048560264bb..3ff782d6b522 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1335,6 +1335,74 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
}
}
+struct filter_list {
+ struct list_head list;
+ struct event_filter *filter;
+};
+
+struct filter_head {
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+
+static void free_filter_list(struct rcu_head *rhp)
+{
+ struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
+ struct filter_list *filter_item, *tmp;
+
+ list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ kfree(filter_list);
+}
+
+static void free_filter_list_tasks(struct rcu_head *rhp)
+{
+ call_rcu(rhp, free_filter_list);
+}
+
+/*
+ * The tracepoint_synchronize_unregister() is a double rcu call.
+ * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu().
+ * Instead of waiting for it, simply call these via the call_rcu*()
+ * variants.
+ */
+static void delay_free_filter(struct filter_head *head)
+{
+ call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks);
+}
+
+static void try_delay_free_filter(struct event_filter *filter)
+{
+ struct filter_head *head;
+ struct filter_list *item;
+
+ head = kmalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ goto free_now;
+
+ INIT_LIST_HEAD(&head->list);
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item) {
+ kfree(head);
+ goto free_now;
+ }
+
+ item->filter = filter;
+ list_add_tail(&item->list, &head->list);
+ delay_free_filter(head);
+ return;
+
+ free_now:
+ /* Make sure the filter is not being used */
+ tracepoint_synchronize_unregister();
+ __free_filter(filter);
+}
+
static inline void __free_subsystem_filter(struct trace_event_file *file)
{
__free_filter(file->filter);
@@ -1342,15 +1410,53 @@ static inline void __free_subsystem_filter(struct trace_event_file *file)
}
static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
- struct trace_array *tr)
+ struct trace_array *tr,
+ struct event_filter *filter)
{
struct trace_event_file *file;
+ struct filter_head *head;
+ struct filter_list *item;
+
+ head = kmalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ goto free_now;
+
+ INIT_LIST_HEAD(&head->list);
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item) {
+ kfree(head);
+ goto free_now;
+ }
+
+ item->filter = filter;
+ list_add_tail(&item->list, &head->list);
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
continue;
- __free_subsystem_filter(file);
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item)
+ goto free_now;
+ item->filter = file->filter;
+ list_add_tail(&item->list, &head->list);
+ file->filter = NULL;
+ }
+
+ delay_free_filter(head);
+ return;
+ free_now:
+ tracepoint_synchronize_unregister();
+
+ if (head)
+ free_filter_list(&head->rcu);
+
+ list_for_each_entry(file, &tr->events, list) {
+ if (file->system != dir || !file->filter)
+ continue;
+ __free_filter(file->filter);
}
+ __free_filter(filter);
}
int filter_assign_type(const char *type)
@@ -2131,11 +2237,6 @@ static inline void event_clear_filter(struct trace_event_file *file)
RCU_INIT_POINTER(file->filter, NULL);
}
-struct filter_list {
- struct list_head list;
- struct event_filter *filter;
-};
-
static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_error *pe,
@@ -2144,11 +2245,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_event_file *file;
struct filter_list *filter_item;
struct event_filter *filter = NULL;
- struct filter_list *tmp;
- LIST_HEAD(filter_list);
+ struct filter_head *filter_list;
bool fail = true;
int err;
+ filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL);
+ if (!filter_list)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&filter_list->list);
+
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
@@ -2175,7 +2281,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
if (!filter_item)
goto fail_mem;
- list_add_tail(&filter_item->list, &filter_list);
+ list_add_tail(&filter_item->list, &filter_list->list);
/*
* Regardless of if this returned an error, we still
* replace the filter for the call.
@@ -2195,31 +2301,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
* Do a synchronize_rcu() and to ensure all calls are
* done with them before we free them.
*/
- tracepoint_synchronize_unregister();
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- __free_filter(filter_item->filter);
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ delay_free_filter(filter_list);
return 0;
fail:
/* No call succeeded */
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ free_filter_list(&filter_list->rcu);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
__free_filter(filter);
+
/* If any call succeeded, we still need to sync */
if (!fail)
- tracepoint_synchronize_unregister();
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- __free_filter(filter_item->filter);
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ delay_free_filter(filter_list);
+ else
+ free_filter_list(&filter_list->rcu);
+
return -ENOMEM;
}
@@ -2361,9 +2458,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
event_clear_filter(file);
- /* Make sure the filter is not being used */
- tracepoint_synchronize_unregister();
- __free_filter(filter);
+ try_delay_free_filter(filter);
return 0;
}
@@ -2387,11 +2482,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
event_set_filter(file, filter);
- if (tmp) {
- /* Make sure the call is done with the filter */
- tracepoint_synchronize_unregister();
- __free_filter(tmp);
- }
+ if (tmp)
+ try_delay_free_filter(tmp);
}
return err;
@@ -2417,9 +2509,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
- tracepoint_synchronize_unregister();
- filter_free_subsystem_filters(dir, tr);
- __free_filter(filter);
+ filter_free_subsystem_filters(dir, tr, filter);
return 0;
}
--
2.47.2
The first patch is a fix with an explanation of the issue, you should
read that first.
The second patch adds a comment to document the rules because figuring
this out from scratch causes brain pain.
Accidentally hitting this issue and getting negative consequences from
it would require several stars to line up just right; but if someone out
there is using a malloc() implementation that uses lockless data
structures across threads or such, this could actually be a problem.
In case someone wants a testcase, here's a very artificial one:
```
#include <pthread.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <linux/io_uring.h>
#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})
#define NUM_SQ_PAGES 4
static int uring_init(struct io_uring_sqe **sqesp, void **cqesp) {
struct io_uring_sqe *sqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0));
void *cqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0));
*(volatile unsigned int *)(cqes+4) = 64 * NUM_SQ_PAGES;
struct io_uring_params params = {
.flags = IORING_SETUP_NO_MMAP|IORING_SETUP_NO_SQARRAY,
.sq_off = { .user_addr = (unsigned long)sqes },
.cq_off = { .user_addr = (unsigned long)cqes }
};
int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/10, ¶ms));
if (sqesp)
*sqesp = sqes;
if (cqesp)
*cqesp = cqes;
return uring_fd;
}
static char *bufmem[0x3000] __attribute__((aligned(0x1000)));
static void *thread_fn(void *dummy) {
unsigned long i = 0;
while (1) {
*(volatile unsigned long *)(bufmem + 0x0000) = i;
*(volatile unsigned long *)(bufmem + 0x0f00) = i;
*(volatile unsigned long *)(bufmem + 0x1000) = i;
*(volatile unsigned long *)(bufmem + 0x1f00) = i;
*(volatile unsigned long *)(bufmem + 0x2000) = i;
*(volatile unsigned long *)(bufmem + 0x2f00) = i;
i++;
}
}
int main(void) {
#if 1
int uring_fd = uring_init(NULL, NULL);
struct iovec reg_iov = { .iov_base = bufmem, .iov_len = 0x2000 };
SYSCHK(syscall(__NR_io_uring_register, uring_fd, IORING_REGISTER_BUFFERS, ®_iov, 1));
#endif
pthread_t thread;
if (pthread_create(&thread, NULL, thread_fn, NULL))
errx(1, "pthread_create");
sleep(1);
int child = SYSCHK(fork());
if (child == 0) {
printf("bufmem values:\n");
printf(" 0x0000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x0000));
printf(" 0x0f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x0f00));
printf(" 0x1000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x1000));
printf(" 0x1f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x1f00));
printf(" 0x2000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x2000));
printf(" 0x2f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x2f00));
return 0;
}
int wstatus;
SYSCHK(wait(&wstatus));
return 0;
}
```
Without this series, the child will usually print results that are
apart by more than 1, which is not a state that ever occurred in
the parent; in my opinion, that counts as a bug.
If you change the "#if 1" to "#if 0", the bug won't manifest.
Signed-off-by: Jann Horn <jannh(a)google.com>
---
Jann Horn (2):
mm/memory: ensure fork child sees coherent memory snapshot
mm/memory: Document how we make a coherent memory snapshot
kernel/fork.c | 34 ++++++++++++++++++++++++++++++++++
mm/memory.c | 18 ++++++++++++++++++
2 files changed, 52 insertions(+)
---
base-commit: 8477ab143069c6b05d6da4a8184ded8b969240f5
change-id: 20250530-fork-tearing-71da211a50cf
--
Jann Horn <jannh(a)google.com>