Skip to content

Commit 9877918

Browse files
committed
Merge patch series "xattr: rework simple xattrs and support user.* xattrs on sockets"
Christian Brauner <brauner@kernel.org> says: This reworks the simple_xattr infrastructure and adds support for user.* extended attributes on sockets. The simple_xattr subsystem currently uses an rbtree protected by a reader-writer spinlock. This series replaces the rbtree with an rhashtable giving O(1) average-case lookup with RCU-based lockless reads. This sped up concurrent access patterns on tmpfs quite a bit and it's an overall easy enough conversion to do and gets rid of rwlock_t. The conversion is done incrementally: a new rhashtable path is added alongside the existing rbtree, consumers are migrated one at a time (shmem, kernfs, pidfs), and then the rbtree code is removed. All three consumers switch from embedded structs to pointer-based lazy allocation so the rhashtable overhead is only paid for inodes that actually use xattrs. With this infrastructure in place the series adds support for user.* xattrs on sockets. Path-based AF_UNIX sockets inherit xattr support from the underlying filesystem (e.g. tmpfs) but sockets in sockfs - that is everything created via socket() including abstract namespace AF_UNIX sockets - had no xattr support at all. The xattr_permission() checks are reworked to allow user.* xattrs on S_IFSOCK inodes. Sockfs sockets get per-inode limits of 128 xattrs and 128KB total value size matching the limits already in use for kernfs. The practical motivation comes from several directions. systemd and GNOME are expanding their use of Varlink as an IPC mechanism. For D-Bus there are tools like dbus-monitor that can observe IPC traffic across the system but this only works because D-Bus has a central broker. For Varlink there is no broker and there is currently no way to identify which sockets speak Varlink. With user.* xattrs on sockets a service can label its socket with the IPC protocol it speaks (e.g., user.varlink=1) and an eBPF program can then selectively capture traffic on those sockets. 
Enumerating bound sockets via netlink combined with these xattr labels gives a way to discover all Varlink IPC entrypoints for debugging and introspection. Similarly, systemd-journald wants to use xattrs on the /dev/log socket for protocol negotiation to indicate whether RFC 5424 structured syslog is supported or whether only the legacy RFC 3164 format should be used. In containers these labels are particularly useful as high-privilege or more complicated solutions for socket identification aren't available. The series comes with comprehensive selftests covering path-based AF_UNIX sockets, sockfs socket operations, per-inode limit enforcement, and xattr operations across multiple address families (AF_INET, AF_INET6, AF_NETLINK, AF_PACKET). * patches from https://patch.msgid.link/20260216-work-xattr-socket-v1-0-c2efa4f74cb7@kernel.org: selftests/xattr: test xattrs on various socket families selftests/xattr: sockfs socket xattr tests selftests/xattr: path-based AF_UNIX socket xattr tests xattr: support extended attributes on sockets xattr,net: support limited amount of extended attributes on sockfs sockets xattr: move user limits for xattrs to generic infra xattr: switch xattr_permission() to switch statement xattr: add xattr_permission_error() xattr: remove rbtree-based simple_xattr infrastructure pidfs: adapt to rhashtable-based simple_xattrs kernfs: adapt to rhashtable-based simple_xattrs with lazy allocation shmem: adapt to rhashtable-based simple_xattrs with lazy allocation xattr: add rhashtable-based simple_xattr infrastructure xattr: add rcu_head and rhash_head to struct simple_xattr Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-0-c2efa4f74cb7@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 6de23f8 + 0f1f4e4 commit 9877918

15 files changed

Lines changed: 1546 additions & 295 deletions

File tree

fs/kernfs/dir.c

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -547,10 +547,8 @@ static void kernfs_free_rcu(struct rcu_head *rcu)
547547
/* If the whole node goes away, then name can't be used outside */
548548
kfree_const(rcu_access_pointer(kn->name));
549549

550-
if (kn->iattr) {
551-
simple_xattrs_free(&kn->iattr->xattrs, NULL);
550+
if (kn->iattr)
552551
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
553-
}
554552

555553
kmem_cache_free(kernfs_node_cache, kn);
556554
}
@@ -584,6 +582,12 @@ void kernfs_put(struct kernfs_node *kn)
584582
if (kernfs_type(kn) == KERNFS_LINK)
585583
kernfs_put(kn->symlink.target_kn);
586584

585+
if (kn->iattr && kn->iattr->xattrs) {
586+
simple_xattrs_free(kn->iattr->xattrs, NULL);
587+
kfree(kn->iattr->xattrs);
588+
kn->iattr->xattrs = NULL;
589+
}
590+
587591
spin_lock(&root->kernfs_idr_lock);
588592
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
589593
spin_unlock(&root->kernfs_idr_lock);
@@ -682,7 +686,10 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
682686

683687
err_out4:
684688
if (kn->iattr) {
685-
simple_xattrs_free(&kn->iattr->xattrs, NULL);
689+
if (kn->iattr->xattrs) {
690+
simple_xattrs_free(kn->iattr->xattrs, NULL);
691+
kfree(kn->iattr->xattrs);
692+
}
686693
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
687694
}
688695
err_out3:

fs/kernfs/inode.c

Lines changed: 23 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,7 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc)
4545
ret->ia_mtime = ret->ia_atime;
4646
ret->ia_ctime = ret->ia_atime;
4747

48-
simple_xattrs_init(&ret->xattrs);
49-
atomic_set(&ret->nr_user_xattrs, 0);
50-
atomic_set(&ret->user_xattr_size, 0);
48+
simple_xattr_limits_init(&ret->xattr_limits);
5149

5250
/* If someone raced us, recognize it. */
5351
if (!try_cmpxchg(&kn->iattr, &attr, ret))
@@ -146,7 +144,8 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
146144
if (!attrs)
147145
return -ENOMEM;
148146

149-
return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
147+
return simple_xattr_list(d_inode(dentry), READ_ONCE(attrs->xattrs),
148+
buf, size);
150149
}
151150

152151
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
@@ -298,27 +297,38 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
298297
void *value, size_t size)
299298
{
300299
struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn);
300+
struct simple_xattrs *xattrs;
301+
301302
if (!attrs)
302303
return -ENODATA;
303304

304-
return simple_xattr_get(&attrs->xattrs, name, value, size);
305+
xattrs = READ_ONCE(attrs->xattrs);
306+
if (!xattrs)
307+
return -ENODATA;
308+
309+
return simple_xattr_get(xattrs, name, value, size);
305310
}
306311

307312
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
308313
const void *value, size_t size, int flags)
309314
{
310315
struct simple_xattr *old_xattr;
316+
struct simple_xattrs *xattrs;
311317
struct kernfs_iattrs *attrs;
312318

313319
attrs = kernfs_iattrs(kn);
314320
if (!attrs)
315321
return -ENOMEM;
316322

317-
old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags);
323+
xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags);
324+
if (IS_ERR_OR_NULL(xattrs))
325+
return PTR_ERR(xattrs);
326+
327+
old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
318328
if (IS_ERR(old_xattr))
319329
return PTR_ERR(old_xattr);
320330

321-
simple_xattr_free(old_xattr);
331+
simple_xattr_free_rcu(old_xattr);
322332
return 0;
323333
}
324334

@@ -344,69 +354,6 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
344354
return kernfs_xattr_set(kn, name, value, size, flags);
345355
}
346356

347-
static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
348-
const char *full_name,
349-
struct simple_xattrs *xattrs,
350-
const void *value, size_t size, int flags)
351-
{
352-
struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
353-
atomic_t *sz = &attr->user_xattr_size;
354-
atomic_t *nr = &attr->nr_user_xattrs;
355-
struct simple_xattr *old_xattr;
356-
int ret;
357-
358-
if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
359-
ret = -ENOSPC;
360-
goto dec_count_out;
361-
}
362-
363-
if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
364-
ret = -ENOSPC;
365-
goto dec_size_out;
366-
}
367-
368-
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
369-
if (!old_xattr)
370-
return 0;
371-
372-
if (IS_ERR(old_xattr)) {
373-
ret = PTR_ERR(old_xattr);
374-
goto dec_size_out;
375-
}
376-
377-
ret = 0;
378-
size = old_xattr->size;
379-
simple_xattr_free(old_xattr);
380-
dec_size_out:
381-
atomic_sub(size, sz);
382-
dec_count_out:
383-
atomic_dec(nr);
384-
return ret;
385-
}
386-
387-
static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
388-
const char *full_name,
389-
struct simple_xattrs *xattrs,
390-
const void *value, size_t size, int flags)
391-
{
392-
struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
393-
atomic_t *sz = &attr->user_xattr_size;
394-
atomic_t *nr = &attr->nr_user_xattrs;
395-
struct simple_xattr *old_xattr;
396-
397-
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
398-
if (!old_xattr)
399-
return 0;
400-
401-
if (IS_ERR(old_xattr))
402-
return PTR_ERR(old_xattr);
403-
404-
atomic_sub(old_xattr->size, sz);
405-
atomic_dec(nr);
406-
simple_xattr_free(old_xattr);
407-
return 0;
408-
}
409-
410357
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
411358
struct mnt_idmap *idmap,
412359
struct dentry *unused, struct inode *inode,
@@ -415,6 +362,7 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
415362
{
416363
const char *full_name = xattr_full_name(handler, suffix);
417364
struct kernfs_node *kn = inode->i_private;
365+
struct simple_xattrs *xattrs;
418366
struct kernfs_iattrs *attrs;
419367

420368
if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
@@ -424,13 +372,12 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
424372
if (!attrs)
425373
return -ENOMEM;
426374

427-
if (value)
428-
return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
429-
value, size, flags);
430-
else
431-
return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
432-
value, size, flags);
375+
xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags);
376+
if (IS_ERR_OR_NULL(xattrs))
377+
return PTR_ERR(xattrs);
433378

379+
return simple_xattr_set_limited(xattrs, &attrs->xattr_limits,
380+
full_name, value, size, flags);
434381
}
435382

436383
static const struct xattr_handler kernfs_trusted_xattr_handler = {

fs/kernfs/kernfs-internal.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,8 @@ struct kernfs_iattrs {
2626
struct timespec64 ia_mtime;
2727
struct timespec64 ia_ctime;
2828

29-
struct simple_xattrs xattrs;
30-
atomic_t nr_user_xattrs;
31-
atomic_t user_xattr_size;
29+
struct simple_xattrs *xattrs;
30+
struct simple_xattr_limits xattr_limits;
3231
};
3332

3433
struct kernfs_root {

fs/pidfs.c

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <net/net_namespace.h>
2323
#include <linux/coredump.h>
2424
#include <linux/rhashtable.h>
25+
#include <linux/llist.h>
2526
#include <linux/xattr.h>
2627
#include <linux/cookie.h>
2728

@@ -31,7 +32,6 @@
3132
#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
3233

3334
static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
34-
static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
3535

3636
static struct path pidfs_root_path = {};
3737

@@ -46,9 +46,8 @@ enum pidfs_attr_mask_bits {
4646
PIDFS_ATTR_BIT_COREDUMP = 1,
4747
};
4848

49-
struct pidfs_attr {
49+
struct pidfs_anon_attr {
5050
unsigned long attr_mask;
51-
struct simple_xattrs *xattrs;
5251
struct /* exit info */ {
5352
__u64 cgroupid;
5453
__s32 exit_code;
@@ -93,6 +92,13 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
9392
* inode number and the inode generation number to compare or
9493
* use file handles.
9594
*/
95+
struct pidfs_attr {
96+
struct simple_xattrs *xattrs;
97+
union {
98+
struct pidfs_anon_attr;
99+
struct llist_node pidfs_llist;
100+
};
101+
};
96102

97103
#if BITS_PER_LONG == 32
98104

@@ -178,10 +184,30 @@ void pidfs_remove_pid(struct pid *pid)
178184
pidfs_ino_ht_params);
179185
}
180186

187+
static LLIST_HEAD(pidfs_free_list);
188+
189+
static void pidfs_free_attr_work(struct work_struct *work)
190+
{
191+
struct pidfs_attr *attr, *next;
192+
struct llist_node *head;
193+
194+
head = llist_del_all(&pidfs_free_list);
195+
llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
196+
struct simple_xattrs *xattrs = attr->xattrs;
197+
198+
if (xattrs) {
199+
simple_xattrs_free(xattrs, NULL);
200+
kfree(xattrs);
201+
}
202+
kfree(attr);
203+
}
204+
}
205+
206+
static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
207+
181208
void pidfs_free_pid(struct pid *pid)
182209
{
183-
struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
184-
struct simple_xattrs *xattrs __free(kfree) = NULL;
210+
struct pidfs_attr *attr = pid->attr;
185211

186212
/*
187213
* Any dentry must've been wiped from the pid by now.
@@ -200,9 +226,10 @@ void pidfs_free_pid(struct pid *pid)
200226
if (IS_ERR(attr))
201227
return;
202228

203-
xattrs = no_free_ptr(attr->xattrs);
204-
if (xattrs)
205-
simple_xattrs_free(xattrs, NULL);
229+
if (likely(!attr->xattrs))
230+
kfree(attr);
231+
else if (llist_add(&attr->pidfs_llist, &pidfs_free_list))
232+
schedule_work(&pidfs_free_work);
206233
}
207234

208235
#ifdef CONFIG_PROC_FS
@@ -1011,7 +1038,7 @@ static int pidfs_xattr_get(const struct xattr_handler *handler,
10111038

10121039
xattrs = READ_ONCE(attr->xattrs);
10131040
if (!xattrs)
1014-
return 0;
1041+
return -ENODATA;
10151042

10161043
name = xattr_full_name(handler, suffix);
10171044
return simple_xattr_get(xattrs, name, value, size);
@@ -1031,22 +1058,16 @@ static int pidfs_xattr_set(const struct xattr_handler *handler,
10311058
/* Ensure we're the only one to set @attr->xattrs. */
10321059
WARN_ON_ONCE(!inode_is_locked(inode));
10331060

1034-
xattrs = READ_ONCE(attr->xattrs);
1035-
if (!xattrs) {
1036-
xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
1037-
if (!xattrs)
1038-
return -ENOMEM;
1039-
1040-
simple_xattrs_init(xattrs);
1041-
smp_store_release(&pid->attr->xattrs, xattrs);
1042-
}
1061+
xattrs = simple_xattrs_lazy_alloc(&attr->xattrs, value, flags);
1062+
if (IS_ERR_OR_NULL(xattrs))
1063+
return PTR_ERR(xattrs);
10431064

10441065
name = xattr_full_name(handler, suffix);
10451066
old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
10461067
if (IS_ERR(old_xattr))
10471068
return PTR_ERR(old_xattr);
10481069

1049-
simple_xattr_free(old_xattr);
1070+
simple_xattr_free_rcu(old_xattr);
10501071
return 0;
10511072
}
10521073

@@ -1124,11 +1145,6 @@ void __init pidfs_init(void)
11241145
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
11251146
SLAB_ACCOUNT | SLAB_PANIC), NULL);
11261147

1127-
pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
1128-
sizeof(struct simple_xattrs), 0,
1129-
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
1130-
SLAB_ACCOUNT | SLAB_PANIC), NULL);
1131-
11321148
pidfs_mnt = kern_mount(&pidfs_type);
11331149
if (IS_ERR(pidfs_mnt))
11341150
panic("Failed to mount pidfs pseudo filesystem");

0 commit comments

Comments
 (0)