The patch below does not apply to the 6.8-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to stable@vger.kernel.org.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.8.y git checkout FETCH_HEAD git cherry-pick -x 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce # <resolve conflicts, build, test, etc.> git commit -s git send-email --to 'stable@vger.kernel.org' --in-reply-to '2024040527-propeller-immovably-a6d8@gregkh' --subject-prefix 'PATCH 6.8.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko andrii@kernel.org Date: Wed, 27 Mar 2024 22:24:26 -0700 Subject: [PATCH] bpf: support deferring bpf_link dealloc to after RCU grace period
BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data.
This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP.
BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable.
We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well.
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko andrii@kernel.org Acked-by: Jiri Olsa jolsa@kernel.org Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov ast@kernel.org
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63..890e152d553e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1574,12 +1574,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; - struct work_struct work; + /* rcu is used before freeing, work can be used to schedule that + * RCU-based freeing before that, so they never overlap + */ + union { + struct rcu_head rcu; + struct work_struct work; + }; };
struct bpf_link_ops { void (*release)(struct bpf_link *link); + /* deallocate link resources callback, called without RCU grace period + * waiting + */ void (*dealloc)(struct bpf_link *link); + /* deallocate link resources callback, called after RCU grace period; + * if underlying BPF program is sleepable we go through tasks trace + * RCU GP and then "classic" RCU GP + */ + void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c287925471f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); }
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{ + struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); + + /* free bpf_link and its containing memory */ + link->ops->dealloc_deferred(link); +} + +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_link_defer_dealloc_rcu_gp(rcu); + else + call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); +} + /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bool sleepable = false; + bpf_link_free_id(link->id); if (link->prog) { + sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); } - /* free bpf_link and its containing memory */ - link->ops->dealloc(link); + if (link->ops->dealloc_deferred) { + /* schedule BPF link deallocation; if underlying BPF program + * is sleepable, we need to first wait for RCU tasks trace + * sync, then go through "classic" RCU grace period + */ + if (sleepable) + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + else + call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } + if (link->ops->dealloc) + link->ops->dealloc(link); }
static void bpf_link_put_deferred(struct work_struct *work) @@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, - .dealloc = bpf_raw_tp_link_dealloc, + .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b73fe5f7206..9dc605f08a23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, };
@@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, };
On Fri, Apr 5, 2024 at 2:50 AM gregkh@linuxfoundation.org wrote:
The patch below does not apply to the 6.8-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to stable@vger.kernel.org.
This is strange, I get a clean auto-merged cherry-pick:
$ git co linux-6.8.y Updating files: 100% (10694/10694), done. branch 'linux-6.8.y' set up to track 'stable/linux-6.8.y'. Switched to a new branch 'linux-6.8.y' $ git cp -x 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Auto-merging include/linux/bpf.h Auto-merging kernel/bpf/syscall.c Auto-merging kernel/trace/bpf_trace.c [linux-6.8.y fd74c60792f5] bpf: support deferring bpf_link dealloc to after RCU grace period Date: Wed Mar 27 22:24:26 2024 -0700 3 files changed, 49 insertions(+), 6 deletions(-)
Note that e9c856cabefb ("bpf: put uprobe link's path and task in release callback") has to be backported at the same time. I'll cherry-pick both and will send it just in case.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.8.y git checkout FETCH_HEAD git cherry-pick -x 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce # <resolve conflicts, build, test, etc.> git commit -s git send-email --to 'stable@vger.kernel.org' --in-reply-to '2024040527-propeller-immovably-a6d8@gregkh' --subject-prefix 'PATCH 6.8.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko andrii@kernel.org Date: Wed, 27 Mar 2024 22:24:26 -0700 Subject: [PATCH] bpf: support deferring bpf_link dealloc to after RCU grace period
BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data.
This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP.
BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable.
We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well.
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko andrii@kernel.org Acked-by: Jiri Olsa jolsa@kernel.org Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov ast@kernel.org
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63..890e152d553e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1574,12 +1574,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog;
struct work_struct work;
/* rcu is used before freeing, work can be used to schedule that
* RCU-based freeing before that, so they never overlap
*/
union {
struct rcu_head rcu;
struct work_struct work;
};
};
struct bpf_link_ops { void (*release)(struct bpf_link *link);
/* deallocate link resources callback, called without RCU grace period
* waiting
*/ void (*dealloc)(struct bpf_link *link);
/* deallocate link resources callback, called after RCU grace period;
* if underlying BPF program is sleepable we go through tasks trace
* RCU GP and then "classic" RCU GP
*/
void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c287925471f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); }
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{
struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
/* free bpf_link and its containing memory */
link->ops->dealloc_deferred(link);
+}
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{
if (rcu_trace_implies_rcu_gp())
bpf_link_defer_dealloc_rcu_gp(rcu);
else
call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
/* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) {
bool sleepable = false;
bpf_link_free_id(link->id); if (link->prog) {
sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); }
/* free bpf_link and its containing memory */
link->ops->dealloc(link);
if (link->ops->dealloc_deferred) {
/* schedule BPF link deallocation; if underlying BPF program
* is sleepable, we need to first wait for RCU tasks trace
* sync, then go through "classic" RCU grace period
*/
if (sleepable)
call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
else
call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
}
if (link->ops->dealloc)
link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work) @@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
.dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info,
}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b73fe5f7206..9dc605f08a23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release,
.dealloc = bpf_kprobe_multi_link_dealloc,
.dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};
@@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release,
.dealloc = bpf_uprobe_multi_link_dealloc,
.dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
On Fri, Apr 5, 2024 at 9:15 AM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
On Fri, Apr 5, 2024 at 2:50 AM gregkh@linuxfoundation.org wrote:
The patch below does not apply to the 6.8-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to stable@vger.kernel.org.
This is strange, I get a clean auto-merged cherry-pick:
$ git co linux-6.8.y Updating files: 100% (10694/10694), done. branch 'linux-6.8.y' set up to track 'stable/linux-6.8.y'. Switched to a new branch 'linux-6.8.y' $ git cp -x 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Auto-merging include/linux/bpf.h Auto-merging kernel/bpf/syscall.c Auto-merging kernel/trace/bpf_trace.c [linux-6.8.y fd74c60792f5] bpf: support deferring bpf_link dealloc to after RCU grace period Date: Wed Mar 27 22:24:26 2024 -0700 3 files changed, 49 insertions(+), 6 deletions(-)
Note that e9c856cabefb ("bpf: put uprobe link's path and task in release callback") has to be backported at the same time. I'll cherry-pick both and will send it just in case.
Ah, so it doesn't build (trivial link->prog->sleepable -> link->prog->aux->sleepable change, will fix up). Not sure if possible, but it would be nice to distinguish between patch not applying vs it causing build (or test) failures, but no big deal.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.8.y git checkout FETCH_HEAD git cherry-pick -x 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce # <resolve conflicts, build, test, etc.> git commit -s git send-email --to 'stable@vger.kernel.org' --in-reply-to '2024040527-propeller-immovably-a6d8@gregkh' --subject-prefix 'PATCH 6.8.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko andrii@kernel.org Date: Wed, 27 Mar 2024 22:24:26 -0700 Subject: [PATCH] bpf: support deferring bpf_link dealloc to after RCU grace period
BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data.
This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP.
BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable.
We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well.
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko andrii@kernel.org Acked-by: Jiri Olsa jolsa@kernel.org Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov ast@kernel.org
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63..890e152d553e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1574,12 +1574,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog;
struct work_struct work;
/* rcu is used before freeing, work can be used to schedule that
* RCU-based freeing before that, so they never overlap
*/
union {
struct rcu_head rcu;
struct work_struct work;
};
};
struct bpf_link_ops { void (*release)(struct bpf_link *link);
/* deallocate link resources callback, called without RCU grace period
* waiting
*/ void (*dealloc)(struct bpf_link *link);
/* deallocate link resources callback, called after RCU grace period;
* if underlying BPF program is sleepable we go through tasks trace
* RCU GP and then "classic" RCU GP
*/
void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c287925471f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); }
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{
struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
/* free bpf_link and its containing memory */
link->ops->dealloc_deferred(link);
+}
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{
if (rcu_trace_implies_rcu_gp())
bpf_link_defer_dealloc_rcu_gp(rcu);
else
call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
/* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) {
bool sleepable = false;
bpf_link_free_id(link->id); if (link->prog) {
sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); }
/* free bpf_link and its containing memory */
link->ops->dealloc(link);
if (link->ops->dealloc_deferred) {
/* schedule BPF link deallocation; if underlying BPF program
* is sleepable, we need to first wait for RCU tasks trace
* sync, then go through "classic" RCU grace period
*/
if (sleepable)
call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
else
call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
}
if (link->ops->dealloc)
link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work) @@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
.dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info,
}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b73fe5f7206..9dc605f08a23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release,
.dealloc = bpf_kprobe_multi_link_dealloc,
.dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};
@@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release,
.dealloc = bpf_uprobe_multi_link_dealloc,
.dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
On Fri, Apr 05, 2024 at 09:21:15AM -0700, Andrii Nakryiko wrote:
On Fri, Apr 5, 2024 at 9:15 AM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
On Fri, Apr 5, 2024 at 2:50 AM gregkh@linuxfoundation.org wrote:
The patch below does not apply to the 6.8-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to stable@vger.kernel.org.
This is strange, I get a clean auto-merged cherry-pick:
$ git co linux-6.8.y Updating files: 100% (10694/10694), done. branch 'linux-6.8.y' set up to track 'stable/linux-6.8.y'. Switched to a new branch 'linux-6.8.y' $ git cp -x 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Auto-merging include/linux/bpf.h Auto-merging kernel/bpf/syscall.c Auto-merging kernel/trace/bpf_trace.c [linux-6.8.y fd74c60792f5] bpf: support deferring bpf_link dealloc to after RCU grace period Date: Wed Mar 27 22:24:26 2024 -0700 3 files changed, 49 insertions(+), 6 deletions(-)
Note that e9c856cabefb ("bpf: put uprobe link's path and task in release callback") has to be backported at the same time. I'll cherry-pick both and will send it just in case.
Ah, so it doesn't build (trivial link->prog->sleepable -> link->prog->aux->sleepable change, will fix up). Not sure if possible, but it would be nice to distinguish between patch not applying vs it causing build (or test) failures, but no big deal.
I only have 1 script, sorry :)
thanks,
greg k-h
There is no need to delay putting either path or task to deallocation step. It can be done right after bpf_uprobe_unregister. Between release and dealloc, there could be still some running BPF programs, but they don't access either task or path, only data in link->uprobes, so it is safe to do.
On the other hand, doing path_put() in dealloc callback makes this dealloc sleepable because path_put() itself might sleep. Which is problematic due to the need to call uprobe's dealloc through call_rcu(), which is what is done in the next bug fix patch. So solve the problem by releasing these resources early.
Signed-off-by: Andrii Nakryiko andrii@kernel.org Link: https://lore.kernel.org/r/20240328052426.3042617-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov ast@kernel.org (cherry picked from commit e9c856cabefb71d47b2eeb197f72c9c88e9b45b0) --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7ac6c52b25eb..45de8a4923e2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3142,6 +3142,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); }
static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) @@ -3149,9 +3152,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - if (umulti_link->task) - put_task_struct(umulti_link->task); - path_put(&umulti_link->path); kvfree(umulti_link->uprobes); kfree(umulti_link); }
BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data.
This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP.
BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable.
We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well.
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko andrii@kernel.org Acked-by: Jiri Olsa jolsa@kernel.org Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov ast@kernel.org (cherry picked from commit 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce) --- include/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/syscall.c | 35 ++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e30100597d0a..7d719976b564 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1568,12 +1568,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; - struct work_struct work; + /* rcu is used before freeing, work can be used to schedule that + * RCU-based freeing before that, so they never overlap + */ + union { + struct rcu_head rcu; + struct work_struct work; + }; };
struct bpf_link_ops { void (*release)(struct bpf_link *link); + /* deallocate link resources callback, called without RCU grace period + * waiting + */ void (*dealloc)(struct bpf_link *link); + /* deallocate link resources callback, called after RCU grace period; + * if underlying BPF program is sleepable we go through tasks trace + * RCU GP and then "classic" RCU GP + */ + void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1f18681721c..0f90b6b27430 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2895,17 +2895,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); }
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{ + struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); + + /* free bpf_link and its containing memory */ + link->ops->dealloc_deferred(link); +} + +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_link_defer_dealloc_rcu_gp(rcu); + else + call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); +} + /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bool sleepable = false; + bpf_link_free_id(link->id); if (link->prog) { + sleepable = link->prog->aux->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); } - /* free bpf_link and its containing memory */ - link->ops->dealloc(link); + if (link->ops->dealloc_deferred) { + /* schedule BPF link deallocation; if underlying BPF program + * is sleepable, we need to first wait for RCU tasks trace + * sync, then go through "classic" RCU grace period + */ + if (sleepable) + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + else + call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } + if (link->ops->dealloc) + link->ops->dealloc(link); }
static void bpf_link_put_deferred(struct work_struct *work) @@ -3415,7 +3444,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, - .dealloc = bpf_raw_tp_link_dealloc, + .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 45de8a4923e2..c8d1ebc43838 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2713,7 +2713,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, };
@@ -3227,7 +3227,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, };
linux-stable-mirror@lists.linaro.org