Skip to content

Commit c8134b5

Browse files
committed
pidfd: add CLONE_PIDFD_AUTOKILL
Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed, the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property. This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd. CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no one to reap it would become a zombie). CLONE_THREAD is rejected because autokill targets a process, not a thread. The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on the struct file at clone3() time. The pidfs .release handler checks this flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only when it is set. Files from pidfd_open() or open_by_handle_at() are distinct struct files that do not carry this flag. dup()/fork() share the same struct file, so they extend the child's lifetime until the last reference drops. CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without CLONE_NNP the child could escalate privileges via setuid/setgid exec after being spawned, so the caller must have CAP_SYS_ADMIN in its user namespace. With CLONE_NNP the child can never gain new privileges, so unprivileged usage is allowed. This is a deliberate departure from the pdeath_signal model, which is reset during secureexec and commit_creds(), rendering it useless for container runtimes that need to deprivilege themselves. Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-3-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent 24baca5 commit c8134b5

4 files changed

Lines changed: 60 additions & 9 deletions

File tree

fs/pidfs.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include <linux/mount.h>
99
#include <linux/pid.h>
1010
#include <linux/pidfs.h>
11+
#include <linux/sched/signal.h>
12+
#include <linux/signal.h>
1113
#include <linux/pid_namespace.h>
1214
#include <linux/poll.h>
1315
#include <linux/proc_fs.h>
@@ -637,7 +639,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
637639
return open_namespace(ns_common);
638640
}
639641

642+
/*
 * pidfs_file_release - .release handler for pidfs files.
 *
 * If the file being released carries PIDFD_AUTOKILL in f_flags, look up the
 * thread-group leader task of the struct pid stored in inode->i_private and
 * send it SIGKILL. Files without the flag are released with no side effect.
 *
 * Always returns 0, including when no task is found or the signal is not
 * sent; the return value of do_send_sig_info() is deliberately not
 * propagated.
 */
static int pidfs_file_release(struct inode *inode, struct file *file)
643+
{
644+
struct pid *pid = inode->i_private;
645+
struct task_struct *task;
646+
647+
/* Only files explicitly marked PIDFD_AUTOKILL trigger the kill. */
if (!(file->f_flags & PIDFD_AUTOKILL))
648+
return 0;
649+
650+
/*
 * Scoped RCU guard: stays in effect until the function returns, so it
 * covers both the pid_task() lookup and the signal delivery below.
 */
guard(rcu)();
651+
task = pid_task(pid, PIDTYPE_TGID);
652+
/* No thread-group leader attached to this pid: nothing to kill. */
if (!task)
653+
return 0;
654+
655+
/* Not available for kthreads or user workers for now. */
656+
if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER)))
657+
return 0;
658+
/* Kernel-originated (SEND_SIG_PRIV) SIGKILL aimed at the whole thread group. */
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
659+
return 0;
660+
}
661+
640662
static const struct file_operations pidfs_file_operations = {
663+
.release = pidfs_file_release,
641664
.poll = pidfd_poll,
642665
#ifdef CONFIG_PROC_FS
643666
.show_fdinfo = pidfd_show_fdinfo,
@@ -1093,11 +1116,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
10931116
int ret;
10941117

10951118
/*
1096-
* Ensure that PIDFD_STALE can be passed as a flag without
1097-
* overloading other uapi pidfd flags.
1119+
* Ensure that internal pidfd flags don't overlap with each
1120+
* other or with uapi pidfd flags.
10981121
*/
1099-
BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
1100-
BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
1122+
BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK |
1123+
PIDFD_STALE | PIDFD_AUTOKILL) != 4);
11011124

11021125
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
11031126
if (ret < 0)
@@ -1108,9 +1131,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
11081131
flags &= ~PIDFD_STALE;
11091132
flags |= O_RDWR;
11101133
pidfd_file = dentry_open(&path, flags, current_cred());
1111-
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
1134+
/*
1135+
* Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as
1136+
* do_dentry_open() strips O_EXCL and O_TRUNC.
1137+
*/
11121138
if (!IS_ERR(pidfd_file))
1113-
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
1139+
pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL));
11141140

11151141
return pidfd_file;
11161142
}

include/uapi/linux/pidfd.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#ifdef __KERNEL__
1414
#include <linux/sched.h>
1515
#define PIDFD_STALE CLONE_PIDFD
16+
#define PIDFD_AUTOKILL O_TRUNC
1617
#endif
1718

1819
/* Flags for pidfd_send_signal(). */

include/uapi/linux/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */
3939
#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */
4040
#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */
41+
#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */
4142

4243
/*
4344
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3

kernel/fork.c

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2045,6 +2045,24 @@ __latent_entropy struct task_struct *copy_process(
20452045
return ERR_PTR(-EINVAL);
20462046
}
20472047

2048+
if (clone_flags & CLONE_PIDFD_AUTOKILL) {
2049+
if (!(clone_flags & CLONE_PIDFD))
2050+
return ERR_PTR(-EINVAL);
2051+
if (!(clone_flags & CLONE_AUTOREAP))
2052+
return ERR_PTR(-EINVAL);
2053+
if (clone_flags & CLONE_THREAD)
2054+
return ERR_PTR(-EINVAL);
2055+
/*
2056+
* Without CLONE_NNP the child could escalate privileges
2057+
* after being spawned, so require CAP_SYS_ADMIN.
2058+
* With CLONE_NNP the child can't gain new privileges,
2059+
* so allow unprivileged usage.
2060+
*/
2061+
if (!(clone_flags & CLONE_NNP) &&
2062+
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
2063+
return ERR_PTR(-EPERM);
2064+
}
2065+
20482066
/*
20492067
* Force any signals received before this point to be delivered
20502068
* before the fork happens. Collect up signals sent to multiple
@@ -2267,13 +2285,18 @@ __latent_entropy struct task_struct *copy_process(
22672285
* if the fd table isn't shared).
22682286
*/
22692287
if (clone_flags & CLONE_PIDFD) {
2270-
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
2288+
unsigned flags = PIDFD_STALE;
2289+
2290+
if (clone_flags & CLONE_THREAD)
2291+
flags |= PIDFD_THREAD;
2292+
if (clone_flags & CLONE_PIDFD_AUTOKILL)
2293+
flags |= PIDFD_AUTOKILL;
22712294

22722295
/*
22732296
* Note that no task has been attached to @pid yet indicate
22742297
* that via CLONE_PIDFD.
22752298
*/
2276-
retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
2299+
retval = pidfd_prepare(pid, flags, &pidfile);
22772300
if (retval < 0)
22782301
goto bad_fork_free_pid;
22792302
pidfd = retval;
@@ -2920,7 +2943,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
29202943
/* Verify that no unknown flags are passed along. */
29212944
if (kargs->flags &
29222945
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
2923-
CLONE_AUTOREAP | CLONE_NNP))
2946+
CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL))
29242947
return false;
29252948

29262949
/*

0 commit comments

Comments
 (0)