Skip to content

Commit c8db081

Browse files
committed
Merge tag 'vfs-7.1-rc1.xattr' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs xattr updates from Christian Brauner: "This reworks the simple_xattr infrastructure and adds support for user.* extended attributes on sockets. The simple_xattr subsystem currently uses an rbtree protected by a reader-writer spinlock. This series replaces the rbtree with an rhashtable giving O(1) average-case lookup with RCU-based lockless reads. This sped up concurrent access patterns on tmpfs quite a bit; overall it is an easy enough conversion to do, and it gets rid of the rwlock_t. The conversion is done incrementally: a new rhashtable path is added alongside the existing rbtree, consumers are migrated one at a time (shmem, kernfs, pidfs), and then the rbtree code is removed. All three consumers switch from embedded structs to pointer-based lazy allocation so the rhashtable overhead is only paid for inodes that actually use xattrs. With this infrastructure in place the series adds support for user.* xattrs on sockets. Path-based AF_UNIX sockets inherit xattr support from the underlying filesystem (e.g. tmpfs) but sockets in sockfs - that is everything created via socket() including abstract namespace AF_UNIX sockets - had no xattr support at all. The xattr_permission() checks are reworked to allow user.* xattrs on S_IFSOCK inodes. Sockfs sockets get per-inode limits of 128 xattrs and 128KB total value size matching the limits already in use for kernfs. The practical motivation comes from several directions. systemd and GNOME are expanding their use of Varlink as an IPC mechanism. For D-Bus there are tools like dbus-monitor that can observe IPC traffic across the system but this only works because D-Bus has a central broker. For Varlink there is no broker and there is currently no way to identify which sockets speak Varlink. With user.* xattrs on sockets a service can label its socket with the IPC protocol it speaks (e.g., user.varlink=1) and an eBPF program can then selectively capture traffic on those sockets.
Enumerating bound sockets via netlink combined with these xattr labels gives a way to discover all Varlink IPC entrypoints for debugging and introspection. Similarly, systemd-journald wants to use xattrs on the /dev/log socket for protocol negotiation to indicate whether RFC 5424 structured syslog is supported or whether only the legacy RFC 3164 format should be used. In containers these labels are particularly useful as high-privilege or more complicated solutions for socket identification aren't available. The series comes with comprehensive selftests covering path-based AF_UNIX sockets, sockfs socket operations, per-inode limit enforcement, and xattr operations across multiple address families (AF_INET, AF_INET6, AF_NETLINK, AF_PACKET)"
* tag 'vfs-7.1-rc1.xattr' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  selftests/xattr: test xattrs on various socket families
  selftests/xattr: sockfs socket xattr tests
  selftests/xattr: path-based AF_UNIX socket xattr tests
  xattr: support extended attributes on sockets
  xattr,net: support limited amount of extended attributes on sockfs sockets
  xattr: move user limits for xattrs to generic infra
  xattr: switch xattr_permission() to switch statement
  xattr: add xattr_permission_error()
  xattr: remove rbtree-based simple_xattr infrastructure
  pidfs: adapt to rhashtable-based simple_xattrs
  kernfs: adapt to rhashtable-based simple_xattrs with lazy allocation
  shmem: adapt to rhashtable-based simple_xattrs with lazy allocation
  xattr: add rhashtable-based simple_xattr infrastructure
  xattr: add rcu_head and rhash_head to struct simple_xattr
2 parents 0e58e3f + 9877918 commit c8db081

15 files changed

Lines changed: 1546 additions & 295 deletions

File tree

fs/kernfs/dir.c

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -564,10 +564,8 @@ static void kernfs_free_rcu(struct rcu_head *rcu)
564564
/* If the whole node goes away, then name can't be used outside */
565565
kfree_const(rcu_access_pointer(kn->name));
566566

567-
if (kn->iattr) {
568-
simple_xattrs_free(&kn->iattr->xattrs, NULL);
567+
if (kn->iattr)
569568
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
570-
}
571569

572570
kmem_cache_free(kernfs_node_cache, kn);
573571
}
@@ -601,6 +599,12 @@ void kernfs_put(struct kernfs_node *kn)
601599
if (kernfs_type(kn) == KERNFS_LINK)
602600
kernfs_put(kn->symlink.target_kn);
603601

602+
if (kn->iattr && kn->iattr->xattrs) {
603+
simple_xattrs_free(kn->iattr->xattrs, NULL);
604+
kfree(kn->iattr->xattrs);
605+
kn->iattr->xattrs = NULL;
606+
}
607+
604608
spin_lock(&root->kernfs_idr_lock);
605609
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
606610
spin_unlock(&root->kernfs_idr_lock);
@@ -699,7 +703,10 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
699703

700704
err_out4:
701705
if (kn->iattr) {
702-
simple_xattrs_free(&kn->iattr->xattrs, NULL);
706+
if (kn->iattr->xattrs) {
707+
simple_xattrs_free(kn->iattr->xattrs, NULL);
708+
kfree(kn->iattr->xattrs);
709+
}
703710
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
704711
}
705712
err_out3:

fs/kernfs/inode.c

Lines changed: 23 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,7 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc)
4545
ret->ia_mtime = ret->ia_atime;
4646
ret->ia_ctime = ret->ia_atime;
4747

48-
simple_xattrs_init(&ret->xattrs);
49-
atomic_set(&ret->nr_user_xattrs, 0);
50-
atomic_set(&ret->user_xattr_size, 0);
48+
simple_xattr_limits_init(&ret->xattr_limits);
5149

5250
/* If someone raced us, recognize it. */
5351
if (!try_cmpxchg(&kn->iattr, &attr, ret))
@@ -146,7 +144,8 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
146144
if (!attrs)
147145
return -ENOMEM;
148146

149-
return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
147+
return simple_xattr_list(d_inode(dentry), READ_ONCE(attrs->xattrs),
148+
buf, size);
150149
}
151150

152151
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
@@ -298,27 +297,38 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
298297
void *value, size_t size)
299298
{
300299
struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn);
300+
struct simple_xattrs *xattrs;
301+
301302
if (!attrs)
302303
return -ENODATA;
303304

304-
return simple_xattr_get(&attrs->xattrs, name, value, size);
305+
xattrs = READ_ONCE(attrs->xattrs);
306+
if (!xattrs)
307+
return -ENODATA;
308+
309+
return simple_xattr_get(xattrs, name, value, size);
305310
}
306311

307312
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
308313
const void *value, size_t size, int flags)
309314
{
310315
struct simple_xattr *old_xattr;
316+
struct simple_xattrs *xattrs;
311317
struct kernfs_iattrs *attrs;
312318

313319
attrs = kernfs_iattrs(kn);
314320
if (!attrs)
315321
return -ENOMEM;
316322

317-
old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags);
323+
xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags);
324+
if (IS_ERR_OR_NULL(xattrs))
325+
return PTR_ERR(xattrs);
326+
327+
old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
318328
if (IS_ERR(old_xattr))
319329
return PTR_ERR(old_xattr);
320330

321-
simple_xattr_free(old_xattr);
331+
simple_xattr_free_rcu(old_xattr);
322332
return 0;
323333
}
324334

@@ -344,69 +354,6 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
344354
return kernfs_xattr_set(kn, name, value, size, flags);
345355
}
346356

347-
static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
348-
const char *full_name,
349-
struct simple_xattrs *xattrs,
350-
const void *value, size_t size, int flags)
351-
{
352-
struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
353-
atomic_t *sz = &attr->user_xattr_size;
354-
atomic_t *nr = &attr->nr_user_xattrs;
355-
struct simple_xattr *old_xattr;
356-
int ret;
357-
358-
if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
359-
ret = -ENOSPC;
360-
goto dec_count_out;
361-
}
362-
363-
if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
364-
ret = -ENOSPC;
365-
goto dec_size_out;
366-
}
367-
368-
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
369-
if (!old_xattr)
370-
return 0;
371-
372-
if (IS_ERR(old_xattr)) {
373-
ret = PTR_ERR(old_xattr);
374-
goto dec_size_out;
375-
}
376-
377-
ret = 0;
378-
size = old_xattr->size;
379-
simple_xattr_free(old_xattr);
380-
dec_size_out:
381-
atomic_sub(size, sz);
382-
dec_count_out:
383-
atomic_dec(nr);
384-
return ret;
385-
}
386-
387-
static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
388-
const char *full_name,
389-
struct simple_xattrs *xattrs,
390-
const void *value, size_t size, int flags)
391-
{
392-
struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
393-
atomic_t *sz = &attr->user_xattr_size;
394-
atomic_t *nr = &attr->nr_user_xattrs;
395-
struct simple_xattr *old_xattr;
396-
397-
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
398-
if (!old_xattr)
399-
return 0;
400-
401-
if (IS_ERR(old_xattr))
402-
return PTR_ERR(old_xattr);
403-
404-
atomic_sub(old_xattr->size, sz);
405-
atomic_dec(nr);
406-
simple_xattr_free(old_xattr);
407-
return 0;
408-
}
409-
410357
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
411358
struct mnt_idmap *idmap,
412359
struct dentry *unused, struct inode *inode,
@@ -415,6 +362,7 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
415362
{
416363
const char *full_name = xattr_full_name(handler, suffix);
417364
struct kernfs_node *kn = inode->i_private;
365+
struct simple_xattrs *xattrs;
418366
struct kernfs_iattrs *attrs;
419367

420368
if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
@@ -424,13 +372,12 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
424372
if (!attrs)
425373
return -ENOMEM;
426374

427-
if (value)
428-
return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
429-
value, size, flags);
430-
else
431-
return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
432-
value, size, flags);
375+
xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags);
376+
if (IS_ERR_OR_NULL(xattrs))
377+
return PTR_ERR(xattrs);
433378

379+
return simple_xattr_set_limited(xattrs, &attrs->xattr_limits,
380+
full_name, value, size, flags);
434381
}
435382

436383
static const struct xattr_handler kernfs_trusted_xattr_handler = {

fs/kernfs/kernfs-internal.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,8 @@ struct kernfs_iattrs {
2626
struct timespec64 ia_mtime;
2727
struct timespec64 ia_ctime;
2828

29-
struct simple_xattrs xattrs;
30-
atomic_t nr_user_xattrs;
31-
atomic_t user_xattr_size;
29+
struct simple_xattrs *xattrs;
30+
struct simple_xattr_limits xattr_limits;
3231
};
3332

3433
struct kernfs_root {

fs/pidfs.c

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <net/net_namespace.h>
2323
#include <linux/coredump.h>
2424
#include <linux/rhashtable.h>
25+
#include <linux/llist.h>
2526
#include <linux/xattr.h>
2627
#include <linux/cookie.h>
2728

@@ -31,7 +32,6 @@
3132
#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
3233

3334
static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
34-
static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
3535

3636
static struct path pidfs_root_path = {};
3737

@@ -46,9 +46,8 @@ enum pidfs_attr_mask_bits {
4646
PIDFS_ATTR_BIT_COREDUMP = 1,
4747
};
4848

49-
struct pidfs_attr {
49+
struct pidfs_anon_attr {
5050
unsigned long attr_mask;
51-
struct simple_xattrs *xattrs;
5251
struct /* exit info */ {
5352
__u64 cgroupid;
5453
__s32 exit_code;
@@ -93,6 +92,13 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
9392
* inode number and the inode generation number to compare or
9493
* use file handles.
9594
*/
95+
struct pidfs_attr {
96+
struct simple_xattrs *xattrs;
97+
union {
98+
struct pidfs_anon_attr;
99+
struct llist_node pidfs_llist;
100+
};
101+
};
96102

97103
#if BITS_PER_LONG == 32
98104

@@ -178,10 +184,30 @@ void pidfs_remove_pid(struct pid *pid)
178184
pidfs_ino_ht_params);
179185
}
180186

187+
static LLIST_HEAD(pidfs_free_list);
188+
189+
static void pidfs_free_attr_work(struct work_struct *work)
190+
{
191+
struct pidfs_attr *attr, *next;
192+
struct llist_node *head;
193+
194+
head = llist_del_all(&pidfs_free_list);
195+
llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
196+
struct simple_xattrs *xattrs = attr->xattrs;
197+
198+
if (xattrs) {
199+
simple_xattrs_free(xattrs, NULL);
200+
kfree(xattrs);
201+
}
202+
kfree(attr);
203+
}
204+
}
205+
206+
static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
207+
181208
void pidfs_free_pid(struct pid *pid)
182209
{
183-
struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
184-
struct simple_xattrs *xattrs __free(kfree) = NULL;
210+
struct pidfs_attr *attr = pid->attr;
185211

186212
/*
187213
* Any dentry must've been wiped from the pid by now.
@@ -200,9 +226,10 @@ void pidfs_free_pid(struct pid *pid)
200226
if (IS_ERR(attr))
201227
return;
202228

203-
xattrs = no_free_ptr(attr->xattrs);
204-
if (xattrs)
205-
simple_xattrs_free(xattrs, NULL);
229+
if (likely(!attr->xattrs))
230+
kfree(attr);
231+
else if (llist_add(&attr->pidfs_llist, &pidfs_free_list))
232+
schedule_work(&pidfs_free_work);
206233
}
207234

208235
#ifdef CONFIG_PROC_FS
@@ -1009,7 +1036,7 @@ static int pidfs_xattr_get(const struct xattr_handler *handler,
10091036

10101037
xattrs = READ_ONCE(attr->xattrs);
10111038
if (!xattrs)
1012-
return 0;
1039+
return -ENODATA;
10131040

10141041
name = xattr_full_name(handler, suffix);
10151042
return simple_xattr_get(xattrs, name, value, size);
@@ -1029,22 +1056,16 @@ static int pidfs_xattr_set(const struct xattr_handler *handler,
10291056
/* Ensure we're the only one to set @attr->xattrs. */
10301057
WARN_ON_ONCE(!inode_is_locked(inode));
10311058

1032-
xattrs = READ_ONCE(attr->xattrs);
1033-
if (!xattrs) {
1034-
xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
1035-
if (!xattrs)
1036-
return -ENOMEM;
1037-
1038-
simple_xattrs_init(xattrs);
1039-
smp_store_release(&pid->attr->xattrs, xattrs);
1040-
}
1059+
xattrs = simple_xattrs_lazy_alloc(&attr->xattrs, value, flags);
1060+
if (IS_ERR_OR_NULL(xattrs))
1061+
return PTR_ERR(xattrs);
10411062

10421063
name = xattr_full_name(handler, suffix);
10431064
old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
10441065
if (IS_ERR(old_xattr))
10451066
return PTR_ERR(old_xattr);
10461067

1047-
simple_xattr_free(old_xattr);
1068+
simple_xattr_free_rcu(old_xattr);
10481069
return 0;
10491070
}
10501071

@@ -1122,11 +1143,6 @@ void __init pidfs_init(void)
11221143
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
11231144
SLAB_ACCOUNT | SLAB_PANIC), NULL);
11241145

1125-
pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
1126-
sizeof(struct simple_xattrs), 0,
1127-
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
1128-
SLAB_ACCOUNT | SLAB_PANIC), NULL);
1129-
11301146
pidfs_mnt = kern_mount(&pidfs_type);
11311147
if (IS_ERR(pidfs_mnt))
11321148
panic("Failed to mount pidfs pseudo filesystem");

0 commit comments

Comments
 (0)