Skip to content

Commit 07c3ef5

Browse files
committed
Merge tag 'vfs-7.1-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull clone and pidfs updates from Christian Brauner: "Add three new clone3() flags for pidfd-based process lifecycle management. CLONE_AUTOREAP: CLONE_AUTOREAP makes a child process auto-reap on exit without ever becoming a zombie. This is a per-process property in contrast to the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a given parent. Currently the only way to automatically reap children is to set SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property affecting all children which makes it unsuitable for libraries or applications that need selective auto-reaping of specific children while still being able to wait() on others. CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct. When the child exits do_notify_parent() checks this flag and causes exit_notify() to transition the task directly to EXIT_DEAD. Since the flag lives on the child it survives reparenting: if the original parent exits and the child is reparented to a subreaper or init the child still auto-reaps when it eventually exits. This is cleaner than forcing the subreaper to get SIGCHLD and then reaping it. If the parent doesn't care the subreaper won't care. If there's a subreaper that would care it would be easy enough to add a prctl() that either just turns back on SIGCHLD and turns off auto-reaping or a prctl() that just notifies the subreaper whenever a child is reparented to it. CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to monitor the child's exit via poll() and retrieve exit status via PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget pattern. No exit signal is delivered so exit_signal must be zero. CLONE_THREAD and CLONE_PARENT are rejected: CLONE_THREAD because autoreap is a process-level property, and CLONE_PARENT because an autoreap child reparented via CLONE_PARENT could become an invisible zombie under a parent that never calls wait(). 
The flag is not inherited by the autoreap process's own children. Each child that should be autoreaped must be explicitly created with CLONE_AUTOREAP. CLONE_NNP: CLONE_NNP sets no_new_privs on the child at clone time. Unlike prctl(PR_SET_NO_NEW_PRIVS) which a process sets on itself, CLONE_NNP allows the parent to impose no_new_privs on the child at creation without affecting the parent's own privileges. CLONE_THREAD is rejected because threads share credentials. CLONE_NNP is useful on its own for any spawn-and-sandbox pattern but was specifically introduced to enable unprivileged usage of CLONE_PIDFD_AUTOKILL. CLONE_PIDFD_AUTOKILL: This flag ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property. This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd, or where the parent just wants a throwaway helper process. CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD and CLONE_AUTOREAP. It requires CLONE_PIDFD because the whole point is tying the child's lifetime to the pidfd. It requires CLONE_AUTOREAP because a killed child with no one to reap it would become a zombie - the primary use case is the parent crashing or abandoning the pidfd so no one is around to call waitpid(). CLONE_THREAD is rejected because autokill targets a process not a thread. If CLONE_NNP is specified together with CLONE_PIDFD_AUTOKILL an unprivileged user may spawn a process that is autokilled. The child cannot escalate privileges via setuid/setgid exec after being spawned. 
If CLONE_PIDFD_AUTOKILL is specified without CLONE_NNP the caller must have CAP_SYS_ADMIN in its user namespace" * tag 'vfs-7.1-rc1.pidfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: selftests: check pidfd_info->coredump_code correctness pidfds: add coredump_code field to pidfd_info kselftest/coredump: reintroduce null pointer dereference selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests selftests/pidfd: add CLONE_NNP tests selftests/pidfd: add CLONE_AUTOREAP tests pidfd: add CLONE_PIDFD_AUTOKILL clone: add CLONE_NNP clone: add CLONE_AUTOREAP
2 parents dc0dfa7 + d29eb5f commit 07c3ef5

15 files changed

Lines changed: 1075 additions & 20 deletions

File tree

fs/pidfs.c

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include <linux/mount.h>
99
#include <linux/pid.h>
1010
#include <linux/pidfs.h>
11+
#include <linux/sched/signal.h>
12+
#include <linux/signal.h>
1113
#include <linux/pid_namespace.h>
1214
#include <linux/poll.h>
1315
#include <linux/proc_fs.h>
@@ -54,6 +56,7 @@ struct pidfs_anon_attr {
5456
};
5557
__u32 coredump_mask;
5658
__u32 coredump_signal;
59+
__u32 coredump_code;
5760
};
5861

5962
static struct rhashtable pidfs_ino_ht;
@@ -358,7 +361,8 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
358361
PIDFD_INFO_EXIT | \
359362
PIDFD_INFO_COREDUMP | \
360363
PIDFD_INFO_SUPPORTED_MASK | \
361-
PIDFD_INFO_COREDUMP_SIGNAL)
364+
PIDFD_INFO_COREDUMP_SIGNAL | \
365+
PIDFD_INFO_COREDUMP_CODE)
362366

363367
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
364368
{
@@ -372,7 +376,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
372376
const struct cred *c;
373377
__u64 mask;
374378

375-
BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);
379+
BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER3);
376380

377381
if (!uinfo)
378382
return -EINVAL;
@@ -405,9 +409,10 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
405409
if (mask & PIDFD_INFO_COREDUMP) {
406410
if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
407411
smp_rmb();
408-
kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
412+
kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL | PIDFD_INFO_COREDUMP_CODE;
409413
kinfo.coredump_mask = attr->coredump_mask;
410414
kinfo.coredump_signal = attr->coredump_signal;
415+
kinfo.coredump_code = attr->coredump_code;
411416
}
412417
}
413418

@@ -662,7 +667,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
662667
return open_namespace(ns_common);
663668
}
664669

670+
static int pidfs_file_release(struct inode *inode, struct file *file)
671+
{
672+
struct pid *pid = inode->i_private;
673+
struct task_struct *task;
674+
675+
if (!(file->f_flags & PIDFD_AUTOKILL))
676+
return 0;
677+
678+
guard(rcu)();
679+
task = pid_task(pid, PIDTYPE_TGID);
680+
if (!task)
681+
return 0;
682+
683+
/* Not available for kthreads or user workers for now. */
684+
if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER)))
685+
return 0;
686+
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
687+
return 0;
688+
}
689+
665690
static const struct file_operations pidfs_file_operations = {
691+
.release = pidfs_file_release,
666692
.poll = pidfd_poll,
667693
#ifdef CONFIG_PROC_FS
668694
.show_fdinfo = pidfd_show_fdinfo,
@@ -757,8 +783,9 @@ void pidfs_coredump(const struct coredump_params *cprm)
757783
PIDFD_COREDUMPED;
758784
/* If coredumping is set to skip we should never end up here. */
759785
VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
760-
/* Expose the signal number that caused the coredump. */
786+
/* Expose the signal number and code that caused the coredump. */
761787
attr->coredump_signal = cprm->siginfo->si_signo;
788+
attr->coredump_code = cprm->siginfo->si_code;
762789
smp_wmb();
763790
set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
764791
}
@@ -1112,11 +1139,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
11121139
int ret;
11131140

11141141
/*
1115-
* Ensure that PIDFD_STALE can be passed as a flag without
1116-
* overloading other uapi pidfd flags.
1142+
* Ensure that internal pidfd flags don't overlap with each
1143+
* other or with uapi pidfd flags.
11171144
*/
1118-
BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
1119-
BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
1145+
BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK |
1146+
PIDFD_STALE | PIDFD_AUTOKILL) != 4);
11201147

11211148
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
11221149
if (ret < 0)
@@ -1127,9 +1154,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
11271154
flags &= ~PIDFD_STALE;
11281155
flags |= O_RDWR;
11291156
pidfd_file = dentry_open(&path, flags, current_cred());
1130-
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
1157+
/*
1158+
* Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as
1159+
* do_dentry_open() strips O_EXCL and O_TRUNC.
1160+
*/
11311161
if (!IS_ERR(pidfd_file))
1132-
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
1162+
pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL));
11331163

11341164
return pidfd_file;
11351165
}

include/linux/sched/signal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ struct signal_struct {
132132
*/
133133
unsigned int is_child_subreaper:1;
134134
unsigned int has_child_subreaper:1;
135+
unsigned int autoreap:1;
135136

136137
#ifdef CONFIG_POSIX_TIMERS
137138

include/uapi/linux/pidfd.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#ifdef __KERNEL__
1414
#include <linux/sched.h>
1515
#define PIDFD_STALE CLONE_PIDFD
16+
#define PIDFD_AUTOKILL O_TRUNC
1617
#endif
1718

1819
/* Flags for pidfd_send_signal(). */
@@ -28,10 +29,12 @@
2829
#define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */
2930
#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */
3031
#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */
32+
#define PIDFD_INFO_COREDUMP_CODE (1UL << 7) /* Always returned if PIDFD_INFO_COREDUMP is requested. */
3133

3234
#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
3335
#define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */
3436
#define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */
37+
#define PIDFD_INFO_SIZE_VER3 88 /* sizeof fourth published struct */
3538

3639
/*
3740
* Values for @coredump_mask in pidfd_info.
@@ -98,6 +101,8 @@ struct pidfd_info {
98101
struct /* coredump info */ {
99102
__u32 coredump_mask;
100103
__u32 coredump_signal;
104+
__u32 coredump_code;
105+
__u32 coredump_pad; /* align supported_mask to 8 bytes */
101106
};
102107
__u64 supported_mask; /* Mask flags that this kernel supports */
103108
};

include/uapi/linux/sched.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,11 @@
3434
#define CLONE_IO 0x80000000 /* Clone io context */
3535

3636
/* Flags for the clone3() syscall. */
37-
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
38-
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
37+
#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */
38+
#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */
39+
#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */
40+
#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */
41+
#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */
3942

4043
/*
4144
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3

kernel/fork.c

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2029,6 +2029,41 @@ __latent_entropy struct task_struct *copy_process(
20292029
return ERR_PTR(-EINVAL);
20302030
}
20312031

2032+
if (clone_flags & CLONE_AUTOREAP) {
2033+
if (clone_flags & CLONE_THREAD)
2034+
return ERR_PTR(-EINVAL);
2035+
if (clone_flags & CLONE_PARENT)
2036+
return ERR_PTR(-EINVAL);
2037+
if (args->exit_signal)
2038+
return ERR_PTR(-EINVAL);
2039+
}
2040+
2041+
if ((clone_flags & CLONE_PARENT) && current->signal->autoreap)
2042+
return ERR_PTR(-EINVAL);
2043+
2044+
if (clone_flags & CLONE_NNP) {
2045+
if (clone_flags & CLONE_THREAD)
2046+
return ERR_PTR(-EINVAL);
2047+
}
2048+
2049+
if (clone_flags & CLONE_PIDFD_AUTOKILL) {
2050+
if (!(clone_flags & CLONE_PIDFD))
2051+
return ERR_PTR(-EINVAL);
2052+
if (!(clone_flags & CLONE_AUTOREAP))
2053+
return ERR_PTR(-EINVAL);
2054+
if (clone_flags & CLONE_THREAD)
2055+
return ERR_PTR(-EINVAL);
2056+
/*
2057+
* Without CLONE_NNP the child could escalate privileges
2058+
* after being spawned, so require CAP_SYS_ADMIN.
2059+
* With CLONE_NNP the child can't gain new privileges,
2060+
* so allow unprivileged usage.
2061+
*/
2062+
if (!(clone_flags & CLONE_NNP) &&
2063+
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
2064+
return ERR_PTR(-EPERM);
2065+
}
2066+
20322067
/*
20332068
* Force any signals received before this point to be delivered
20342069
* before the fork happens. Collect up signals sent to multiple
@@ -2251,13 +2286,18 @@ __latent_entropy struct task_struct *copy_process(
22512286
* if the fd table isn't shared).
22522287
*/
22532288
if (clone_flags & CLONE_PIDFD) {
2254-
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
2289+
unsigned flags = PIDFD_STALE;
2290+
2291+
if (clone_flags & CLONE_THREAD)
2292+
flags |= PIDFD_THREAD;
2293+
if (clone_flags & CLONE_PIDFD_AUTOKILL)
2294+
flags |= PIDFD_AUTOKILL;
22552295

22562296
/*
22572297
* Note that no task has been attached to @pid yet indicate
22582298
* that via CLONE_PIDFD.
22592299
*/
2260-
retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
2300+
retval = pidfd_prepare(pid, flags, &pidfile);
22612301
if (retval < 0)
22622302
goto bad_fork_free_pid;
22632303
pidfd = retval;
@@ -2413,6 +2453,9 @@ __latent_entropy struct task_struct *copy_process(
24132453
*/
24142454
copy_seccomp(p);
24152455

2456+
if (clone_flags & CLONE_NNP)
2457+
task_set_no_new_privs(p);
2458+
24162459
init_task_pid_links(p);
24172460
if (likely(p->pid)) {
24182461
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2436,6 +2479,8 @@ __latent_entropy struct task_struct *copy_process(
24362479
*/
24372480
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
24382481
p->real_parent->signal->is_child_subreaper;
2482+
if (clone_flags & CLONE_AUTOREAP)
2483+
p->signal->autoreap = 1;
24392484
list_add_tail(&p->sibling, &p->real_parent->children);
24402485
list_add_tail_rcu(&p->tasks, &init_task.tasks);
24412486
attach_pid(p, PIDTYPE_TGID);
@@ -2897,7 +2942,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
28972942
{
28982943
/* Verify that no unknown flags are passed along. */
28992944
if (kargs->flags &
2900-
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2945+
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
2946+
CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL))
29012947
return false;
29022948

29032949
/*

kernel/ptrace.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
549549
if (!dead && thread_group_empty(p)) {
550550
if (!same_thread_group(p->real_parent, tracer))
551551
dead = do_notify_parent(p, p->exit_signal);
552-
else if (ignoring_children(tracer->sighand)) {
552+
else if (ignoring_children(tracer->sighand) ||
553+
p->signal->autoreap) {
553554
__wake_up_parent(p, tracer);
554555
dead = true;
555556
}

kernel/signal.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2251,6 +2251,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
22512251
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
22522252
sig = 0;
22532253
}
2254+
if (!tsk->ptrace && tsk->signal->autoreap) {
2255+
autoreap = true;
2256+
sig = 0;
2257+
}
22542258
/*
22552259
* Send with __send_signal as si_pid and si_uid are in the
22562260
* parent's namespaces.

tools/testing/selftests/coredump/coredump_socket_protocol_test.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,6 +1004,8 @@ TEST_F(coredump, socket_request_invalid_size_large)
10041004
*
10051005
* Verify that when using socket-based coredump protocol,
10061006
* the coredump_signal field is correctly exposed as SIGSEGV.
1007+
* Also check that the coredump_code field is correctly exposed
1008+
* as SEGV_MAPERR.
10071009
*/
10081010
TEST_F(coredump, socket_coredump_signal_sigsegv)
10091011
{
@@ -1079,6 +1081,18 @@ TEST_F(coredump, socket_coredump_signal_sigsegv)
10791081
goto out;
10801082
}
10811083

1084+
/* Verify coredump_code is available and correct */
1085+
if (!(info.mask & PIDFD_INFO_COREDUMP_CODE)) {
1086+
fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_CODE not set in mask\n");
1087+
goto out;
1088+
}
1089+
1090+
if (info.coredump_code != SEGV_MAPERR) {
1091+
fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_code=%d, expected SEGV_MAPERR=%d\n",
1092+
info.coredump_code, SEGV_MAPERR);
1093+
goto out;
1094+
}
1095+
10821096
if (!read_coredump_req(fd_coredump, &req)) {
10831097
fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n");
10841098
goto out;
@@ -1128,6 +1142,8 @@ TEST_F(coredump, socket_coredump_signal_sigsegv)
11281142
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
11291143
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
11301144
ASSERT_EQ(info.coredump_signal, SIGSEGV);
1145+
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_CODE));
1146+
ASSERT_EQ(info.coredump_code, SEGV_MAPERR);
11311147

11321148
wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
11331149
}
@@ -1137,6 +1153,8 @@ TEST_F(coredump, socket_coredump_signal_sigsegv)
11371153
*
11381154
* Verify that when using socket-based coredump protocol,
11391155
* the coredump_signal field is correctly exposed as SIGABRT.
1156+
* Also check that the coredump_code field is correctly exposed
1157+
* as SI_TKILL.
11401158
*/
11411159
TEST_F(coredump, socket_coredump_signal_sigabrt)
11421160
{
@@ -1212,6 +1230,12 @@ TEST_F(coredump, socket_coredump_signal_sigabrt)
12121230
goto out;
12131231
}
12141232

1233+
if (info.coredump_code != SI_TKILL) {
1234+
fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_code=%d, expected SI_TKILL=%d\n",
1235+
info.coredump_code, SI_TKILL);
1236+
goto out;
1237+
}
1238+
12151239
if (!read_coredump_req(fd_coredump, &req)) {
12161240
fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n");
12171241
goto out;
@@ -1261,6 +1285,8 @@ TEST_F(coredump, socket_coredump_signal_sigabrt)
12611285
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
12621286
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
12631287
ASSERT_EQ(info.coredump_signal, SIGABRT);
1288+
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_CODE));
1289+
ASSERT_EQ(info.coredump_code, SI_TKILL);
12641290

12651291
wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
12661292
}

0 commit comments

Comments
 (0)