Recently we committed a fix to allow processes to receive notifications for non-zero exits via the process connector module. Commit is a4c9a56e6a2c.
However, for threads, when it does a pthread_exit(&exit_status) call, the kernel is not aware of the exit status with which pthread_exit is called. It is sent by child thread to the parent process, if it is waiting in pthread_join(). Hence, for a thread exiting abnormally, kernel cannot send notifications to any listening processes.
The exception to this is if the thread is sent a signal which it has not handled, and dies along with it's process as a result; for eg. SIGSEGV or SIGKILL. In this case, kernel is aware of the non-zero exit and sends a notification for it.
For our use case, we cannot have parent wait in pthread_join, one of the main reasons for this being that we do not want to track normal pthread_exit(), which could be a very large number. We only want to be notified of any abnormal exits. Hence, threads are created with pthread_attr_t set to PTHREAD_CREATE_DETACHED.
To fix this problem, we add a new type PROC_CN_MCAST_NOTIFY to proc connector API, which allows a thread to send it's exit status to kernel either when it needs to call pthread_exit() with non-zero value to indicate some error or from signal handler before pthread_exit().
Anjali Kulkarni (2): connector/cn_proc: Handle threads for proc connector connector/cn_proc: Selftest for threads case
drivers/connector/cn_proc.c | 11 ++- include/linux/cn_proc.h | 5 +- include/uapi/linux/cn_proc.h | 4 +- kernel/exit.c | 5 +- tools/testing/selftests/connector/Makefile | 23 ++++- .../testing/selftests/connector/proc_filter.c | 5 + tools/testing/selftests/connector/thread.c | 87 +++++++++++++++++ .../selftests/connector/thread_filter.c | 93 +++++++++++++++++++ 8 files changed, 226 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/connector/thread.c create mode 100644 tools/testing/selftests/connector/thread_filter.c
Add a new type PROC_CN_MCAST_NOTIFY to proc connector API, which allows a thread to notify the kernel that it has exited abnormally. Thread can also send the exit status code it wants returned in the notification with it. Exiting thread can call this either when it wants to call pthread_exit() with non-zero value or from signal handler.
Once kernel receives this, it saves this exit status in the thread's exit_code field of task_struct. This field is then checked when the thread actually exits, and if non-zero, it is copied to the notification to be sent.
Signed-off-by: Anjali Kulkarni anjali.k.kulkarni@oracle.com --- drivers/connector/cn_proc.c | 11 +++++++++-- include/linux/cn_proc.h | 5 +++-- include/uapi/linux/cn_proc.h | 4 +++- kernel/exit.c | 5 ++++- 4 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 44b19e696176..4c38b9bf4f2f 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -320,7 +320,7 @@ void proc_coredump_connector(struct task_struct *task) send_msg(msg); }
-void proc_exit_connector(struct task_struct *task) +void proc_exit_connector(struct task_struct *task, __u32 uexit_code) { struct cn_msg *msg; struct proc_event *ev; @@ -337,7 +337,10 @@ void proc_exit_connector(struct task_struct *task) ev->what = PROC_EVENT_EXIT; ev->event_data.exit.process_pid = task->pid; ev->event_data.exit.process_tgid = task->tgid; - ev->event_data.exit.exit_code = task->exit_code; + if (task->exit_code == 0) + ev->event_data.exit.exit_code = uexit_code; + else + ev->event_data.exit.exit_code = task->exit_code; ev->event_data.exit.exit_signal = task->exit_signal;
rcu_read_lock(); @@ -413,6 +416,10 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, if (msg->len == sizeof(*pinput)) { pinput = (struct proc_input *)msg->data; mc_op = pinput->mcast_op; + if (mc_op == PROC_CN_MCAST_NOTIFY) { + current->exit_code = pinput->uexit_code; + return; + } ev_type = pinput->event_type; } else if (msg->len == sizeof(mc_op)) { mc_op = *((enum proc_cn_mcast_op *)msg->data); diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index 1d5b02a96c46..fc1d75897cc7 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -27,7 +27,7 @@ void proc_sid_connector(struct task_struct *task); void proc_ptrace_connector(struct task_struct *task, int which_id); void proc_comm_connector(struct task_struct *task); void proc_coredump_connector(struct task_struct *task); -void proc_exit_connector(struct task_struct *task); +void proc_exit_connector(struct task_struct *task, __u32 uexit_code); #else static inline void proc_fork_connector(struct task_struct *task) {} @@ -52,7 +52,8 @@ static inline void proc_ptrace_connector(struct task_struct *task, static inline void proc_coredump_connector(struct task_struct *task) {}
-static inline void proc_exit_connector(struct task_struct *task) +static inline void proc_exit_connector(struct task_struct *task, + __u32 uexit_code) {} #endif /* CONFIG_PROC_EVENTS */ #endif /* CN_PROC_H */ diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index 18e3745b86cd..2b12a24e4651 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -27,7 +27,8 @@ */ enum proc_cn_mcast_op { PROC_CN_MCAST_LISTEN = 1, - PROC_CN_MCAST_IGNORE = 2 + PROC_CN_MCAST_IGNORE = 2, + PROC_CN_MCAST_NOTIFY = 3 };
#define PROC_EVENT_ALL (PROC_EVENT_FORK | PROC_EVENT_EXEC | PROC_EVENT_UID | \ @@ -65,6 +66,7 @@ enum proc_cn_event { struct proc_input { enum proc_cn_mcast_op mcast_op; enum proc_cn_event event_type; + __u32 uexit_code; };
static inline enum proc_cn_event valid_event(enum proc_cn_event ev_type) diff --git a/kernel/exit.c b/kernel/exit.c index 7430852a8571..e2698ebe59ea 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -821,6 +821,7 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; + __u32 uexit_code;
WARN_ON(irqs_disabled());
@@ -863,6 +864,8 @@ void __noreturn do_exit(long code) tty_audit_exit(); audit_free(tsk);
+ uexit_code = tsk->exit_code; + tsk->exit_code = code; taskstats_exit(tsk, group_dead);
@@ -900,7 +903,7 @@ void __noreturn do_exit(long code)
exit_tasks_rcu_start(); exit_notify(tsk, group_dead); - proc_exit_connector(tsk); + proc_exit_connector(tsk, uexit_code); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache))
On 09/19, Anjali Kulkarni wrote:
@@ -413,6 +416,10 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, if (msg->len == sizeof(*pinput)) { pinput = (struct proc_input *)msg->data; mc_op = pinput->mcast_op;
if (mc_op == PROC_CN_MCAST_NOTIFY) {
current->exit_code = pinput->uexit_code;
return;
...
--- a/kernel/exit.c +++ b/kernel/exit.c @@ -821,6 +821,7 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead;
- __u32 uexit_code;
WARN_ON(irqs_disabled()); @@ -863,6 +864,8 @@ void __noreturn do_exit(long code) tty_audit_exit(); audit_free(tsk);
- uexit_code = tsk->exit_code;
I don't think you can use task_struct->exit_code. If this task is ptraced, it can be changed/cleared in, say, ptrace_stop() after PROC_CN_MCAST_NOTIFY.
Oleg.
On Sep 20, 2024, at 4:00 AM, Oleg Nesterov oleg@redhat.com wrote:
On 09/19, Anjali Kulkarni wrote:
@@ -413,6 +416,10 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, if (msg->len == sizeof(*pinput)) { pinput = (struct proc_input *)msg->data; mc_op = pinput->mcast_op;
if (mc_op == PROC_CN_MCAST_NOTIFY) {
current->exit_code = pinput->uexit_code;
return;
...
--- a/kernel/exit.c +++ b/kernel/exit.c @@ -821,6 +821,7 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead;
__u32 uexit_code;
WARN_ON(irqs_disabled());
@@ -863,6 +864,8 @@ void __noreturn do_exit(long code) tty_audit_exit(); audit_free(tsk);
- uexit_code = tsk->exit_code;
I don't think you can use task_struct->exit_code. If this task is ptraced, it can be changed/cleared in, say, ptrace_stop() after PROC_CN_MCAST_NOTIFY.
Thank you, that’s a good point! However, the use case of ptrace, which I assume is for mostly debug and tracing, is exclusive of the use case I am using it for - for production and mostly scaling scenarios. That is, I assume ptrace calls can be done only to your own processes (except superuser), so the tracing process should understand and only do one(ptrace) or the other (request for a exit notification by using a system call) and not both? I could add a comment or something which describes this somewhere. Another point is - if an exit_code is modified, it will anyways be overwritten in the do_exit() call - so it’s not clear to me what the purpose of writing that field would be for ptrace_stop() or any other function…? Is there any other reason for ptrace_stop() to modify task_struct->exit_code?
Anjali
Oleg.
On 09/20, Anjali Kulkarni wrote:
On Sep 20, 2024, at 4:00 AM, Oleg Nesterov oleg@redhat.com wrote:
I don't think you can use task_struct->exit_code. If this task is ptraced, it can be changed/cleared in, say, ptrace_stop() after PROC_CN_MCAST_NOTIFY.
Thank you, that’s a good point! However, the use case of ptrace, which I assume is for mostly debug and tracing, is exclusive of the use case I am using it for
Well. I don't understand your use-case. Or any other use-case for drivers/connector/ that I know nothing about. But this is irrelevant.
The new PROC_CN_MCAST_NOTIFY functionality you propose should work regardless of whether this task is ptraced or not. But it doesn't because the usage of ->exit_code in your patch conflicts with the current usage of this field.
So, NACK, sorry.
Oleg.
On Sep 20, 2024, at 11:44 AM, Oleg Nesterov oleg@redhat.com wrote:
On 09/20, Anjali Kulkarni wrote:
On Sep 20, 2024, at 4:00 AM, Oleg Nesterov oleg@redhat.com wrote:
I don't think you can use task_struct->exit_code. If this task is ptraced, it can be changed/cleared in, say, ptrace_stop() after PROC_CN_MCAST_NOTIFY.
Thank you, that’s a good point! However, the use case of ptrace, which I assume is for mostly debug and tracing, is exclusive of the use case I am using it for
Well. I don't understand your use-case. Or any other use-case for drivers/connector/ that I know nothing about. But this is irrelevant.
The new PROC_CN_MCAST_NOTIFY functionality you propose should work regardless of
Yes, agreed.
whether this task is ptraced or not. But it doesn't because the usage of ->exit_code in your patch conflicts with the current usage of this field.
Ok, I see that ptrace_stop() seems to be using it in much the same way I want to use it - as a temporary place to store some values. Since in do_exit(), exit_code is overwritten, I didn’t think anyone was using it. Could I add a new field in the task_struct to store my value? (I don’t think there is any other unused/field I can use temporarily)
Anjali
So, NACK, sorry.
Oleg.
Test to check if setting PROC_CN_MCAST_NOTIFY in proc connector API, allows a thread's non-zero exit status to be returned to proc_filter.
The threads.c program creates 2 child threads. 1st thread handles signal SIGSEGV, and 2nd thread needs to indicate some error condition (value 1) to the kernel, instead of using pthread_exit() with 1.
In both cases, child sends notify_netlink_thread_exit(exit_code) to kernel, to let kernel know it has exited abnormally with exit_code.
Compile: make thread make proc_filter To see non-zero exit notifications, run: ./proc_filter -f Run threads code in another window: ./threads Note the 2 child thread IDs reported above Send SIGSEGV signal to the child handling SIGSEGV: kill -11 <child1-tid> Watch the child 1 tid being notified with exit code 11 to proc_filter Watch child 2 tid being notified with exit code 1 (value defined in code) to proc_filter
Signed-off-by: Anjali Kulkarni anjali.k.kulkarni@oracle.com --- tools/testing/selftests/connector/Makefile | 23 ++++- .../testing/selftests/connector/proc_filter.c | 5 + tools/testing/selftests/connector/thread.c | 87 +++++++++++++++++ .../selftests/connector/thread_filter.c | 93 +++++++++++++++++++ 4 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/connector/thread.c create mode 100644 tools/testing/selftests/connector/thread_filter.c
diff --git a/tools/testing/selftests/connector/Makefile b/tools/testing/selftests/connector/Makefile index 92188b9bac5c..453f5fbcdfb1 100644 --- a/tools/testing/selftests/connector/Makefile +++ b/tools/testing/selftests/connector/Makefile @@ -1,5 +1,26 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -Wall $(KHDR_INCLUDES) +KERNEL="../../../.." + +CFLAGS += -Wall $(KHDR_INCLUDES) -I $(KERNEL)/include/uapi -I $(KERNEL)/include + +proc_filter: proc_filter.o + cc proc_filter.o -o proc_filter + +proc_filter.o: proc_filter.c + cc -c proc_filter.c -o proc_filter.o $(CFLAGS) + +thread: thread.o thread_filter.o + cc thread.o thread_filter.o -o thread + +thread.o: thread.c $(DEPS) + cc -c thread.c -o thread.o $(CFLAGS) + +thread_filter.o: thread_filter.c + cc -c thread_filter.c -o thread_filter.o $(CFLAGS) + +define EXTRA_CLEAN + rm *.o +endef
TEST_GEN_PROGS = proc_filter
diff --git a/tools/testing/selftests/connector/proc_filter.c b/tools/testing/selftests/connector/proc_filter.c index 4a825b997666..6fb4842894f8 100644 --- a/tools/testing/selftests/connector/proc_filter.c +++ b/tools/testing/selftests/connector/proc_filter.c @@ -1,4 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +/* + * Author: Anjali Kulkarni anjali.k.kulkarni@oracle.com + * + * Copyright (c) 2024 Oracle and/or its affiliates. + */
#include <sys/types.h> #include <sys/epoll.h> diff --git a/tools/testing/selftests/connector/thread.c b/tools/testing/selftests/connector/thread.c new file mode 100644 index 000000000000..e20f209c980c --- /dev/null +++ b/tools/testing/selftests/connector/thread.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Author: Anjali Kulkarni anjali.k.kulkarni@oracle.com + * + * Copyright (c) 2024 Oracle and/or its affiliates. + */ + +#include <pthread.h> +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> + +/* + * This code tests a thread exit notification when thread exits abnormally. + * Normally, when a thread exits abnormally, the kernel is not aware of the + * exit code. This is usually only conveyed from child to parent via the + * pthread_exit() and pthread_join() calls. Sometimes, however, a parent + * process cannot monitor all child processes via pthread_join(), particularly + * when there is a huge amount of child processes. In this case, the parent + * has created the child with PTHREAD_CREATE_DETACHED attribute. + * To fix this problem, either when child wants to convey non-zero exit via + * pthread_exit() or in a signal handler, the child can notify the kernel's + * connector module it's exit status via a netlink call with new type + * PROC_CN_MCAST_NOTIFY. (Implemented in the thread_filter.c file). + * This will send the exit code from the child to the kernel, which the kernel + * can later return to proc_filter program when the child actually exits. + * To test this usecase: + * Compile: + * make thread + * make proc_filter + * To see non-zero exit notifications, run: + * ./proc_filter -f + * Start the threads code, creating 2 threads, in another window: + * ./threads + * Note the 2 child thread IDs reported above + * Send SIGSEGV signal to the child handling SIGSEGV: + * kill -11 <child1-tid> + * Watch the event being notified with exit code 11 to proc_filter + * Watch child 2 tid being notified with exit code 1 (value defined in code) + * to proc_filter + */ + +extern int notify_netlink_thread_exit(unsigned int exit_code); + +static void sigsegvh(int sig) +{ + unsigned int exit_code = (unsigned int) sig; + /* + * Send any non-zero value to get a notification. Here we are + * sending the signal number for SIGSEGV which is 11 + */ + notify_netlink_thread_exit(exit_code); +} + +void *threadc1(void *ptr) +{ + signal(SIGSEGV, sigsegvh); + printf("Child 1 thread id %d, handling SIGSEGV\n", gettid()); + sleep(50); + pthread_exit(NULL); +} + +void *threadc2(void *ptr) +{ + printf("Child 2 thread id %d\n", gettid()); + sleep(2); + notify_netlink_thread_exit(1); + pthread_exit(NULL); +} + +int main(int argc, char **argv) +{ + pthread_t thread1, thread2; + pthread_attr_t attr1, attr2; + + pthread_attr_init(&attr1); + pthread_attr_setdetachstate(&attr1, PTHREAD_CREATE_DETACHED); + pthread_create(&thread1, &attr1, *threadc1, NULL); + + pthread_attr_init(&attr2); + pthread_attr_setdetachstate(&attr2, PTHREAD_CREATE_DETACHED); + pthread_create(&thread2, &attr2, *threadc2, NULL); + + sleep(50); + exit(0); +} diff --git a/tools/testing/selftests/connector/thread_filter.c b/tools/testing/selftests/connector/thread_filter.c new file mode 100644 index 000000000000..4b666004313b --- /dev/null +++ b/tools/testing/selftests/connector/thread_filter.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Author: Anjali Kulkarni anjali.k.kulkarni@oracle.com + * + * Copyright (c) 2024 Oracle and/or its affiliates. + */ + +#include <sys/types.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <linux/netlink.h> +#include <linux/connector.h> +#include <linux/cn_proc.h> + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <strings.h> +#include <errno.h> +#include <signal.h> +#include <string.h> + +#define NL_MESSAGE_SIZE (sizeof(struct nlmsghdr) + sizeof(struct cn_msg) + \ + sizeof(struct proc_input)) + +/* + * Send PROC_CN_MCAST_NOTIFY type notification to the connector code in kernel. + * This will send the exit_code specified by user to the connector layer, so + * it can send a notification for that event to any listening process + */ +int send_message(int nl_sock, unsigned int exit_code) +{ + char buff[NL_MESSAGE_SIZE]; + struct nlmsghdr *hdr; + struct cn_msg *msg; + + hdr = (struct nlmsghdr *)buff; + hdr->nlmsg_len = NL_MESSAGE_SIZE; + hdr->nlmsg_type = NLMSG_DONE; + hdr->nlmsg_flags = 0; + hdr->nlmsg_seq = 0; + hdr->nlmsg_pid = getpid(); + + msg = (struct cn_msg *)NLMSG_DATA(hdr); + msg->id.idx = CN_IDX_PROC; + msg->id.val = CN_VAL_PROC; + msg->seq = 0; + msg->ack = 0; + msg->flags = 0; + + msg->len = sizeof(struct proc_input); + ((struct proc_input *)msg->data)->mcast_op = + PROC_CN_MCAST_NOTIFY; + ((struct proc_input *)msg->data)->uexit_code = exit_code; + + if (send(nl_sock, hdr, hdr->nlmsg_len, 0) == -1) { + perror("send failed"); + return -errno; + } + return 0; +} + +int notify_netlink_thread_exit(unsigned int exit_code) +{ + struct sockaddr_nl sa_nl; + int err = 0; + int nl_sock; + + nl_sock = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); + + if (nl_sock == -1) { + perror("socket failed"); + return -errno; + } + + bzero(&sa_nl, sizeof(sa_nl)); + sa_nl.nl_family = AF_NETLINK; + sa_nl.nl_groups = CN_IDX_PROC; + sa_nl.nl_pid = gettid(); + + if (bind(nl_sock, (struct sockaddr *)&sa_nl, sizeof(sa_nl)) == -1) { + perror("bind failed"); + return -errno; + } + + err = send_message(nl_sock, exit_code); + + if (err < 0) + return err; + + return 0; +}
Hi Anjali,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Anjali-Kulkarni/connector-cn_... base: net-next/main patch link: https://lore.kernel.org/r/20240920000933.185090-3-anjali.k.kulkarni%40oracle... patch subject: [PATCH net-next 2/2] connector/cn_proc: Selftest for threads case :::::: branch date: 2 days ago :::::: commit date: 2 days ago compiler: gcc-12 (Debian 12.2.0-14) 12.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240921/202409212201.l94GHFkW-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot lkp@intel.com | Closes: https://lore.kernel.org/r/202409212201.l94GHFkW-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from ../../../../include/uapi/linux/netlink.h:7, from proc_filter.c:11:
../../../../include/uapi/linux/types.h:10:2: warning: #warning "Attempt to use kernel headers from user space, see https://kernelnewbies.org/KernelHeaders" [-Wcpp]
10 | #warning "Attempt to use kernel headers from user space, see https://kernelnewbies.org/KernelHeaders" | ^~~~~~~
linux-kselftest-mirror@lists.linaro.org