Skip to content

Commit 192d852

Browse files
Thomas Gleixner authored and Peter Zijlstra committed
sched/mmcid: Avoid full tasklist walks
Chasing vfork()'ed tasks on a CID ownership mode switch requires a full task list walk, which is obviously expensive on large systems. Avoid that by keeping a list of tasks using a mm MMCID entity in mm::mm_cid and walk this list instead. This removes the proven to be flaky counting logic and avoids a full task list walk in the case of vfork()'ed tasks. Fixes: fbd0e71 ("sched/mmcid: Provide CID ownership mode fixup functions") Signed-off-by: Thomas Gleixner <tglx@kernel.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20260310202526.183824481@kernel.org
1 parent 7574ac6 commit 192d852

3 files changed

Lines changed: 18 additions & 43 deletions

File tree

include/linux/rseq_types.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,12 @@ struct rseq_data { };
133133
* @active: MM CID is active for the task
134134
* @cid: The CID associated to the task either permanently or
135135
* borrowed from the CPU
136+
* @node: Queued in the per MM MMCID list
136137
*/
137138
struct sched_mm_cid {
138139
unsigned int active;
139140
unsigned int cid;
141+
struct hlist_node node;
140142
};
141143

142144
/**
@@ -157,6 +159,7 @@ struct mm_cid_pcpu {
157159
* @work: Regular work to handle the affinity mode change case
158160
* @lock: Spinlock to protect against affinity setting which can't take @mutex
159161
* @mutex: Mutex to serialize forks and exits related to this mm
162+
* @user_list: List of the MM CID users of a MM
160163
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
161164
* is growth only.
162165
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
@@ -177,13 +180,14 @@ struct mm_mm_cid {
177180

178181
raw_spinlock_t lock;
179182
struct mutex mutex;
183+
struct hlist_head user_list;
180184

181185
/* Low frequency modified */
182186
unsigned int nr_cpus_allowed;
183187
unsigned int users;
184188
unsigned int pcpu_thrs;
185189
unsigned int update_deferred;
186-
}____cacheline_aligned_in_smp;
190+
} ____cacheline_aligned;
187191
#else /* CONFIG_SCHED_MM_CID */
188192
struct mm_mm_cid { };
189193
struct sched_mm_cid { };

kernel/fork.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
10001000
#ifdef CONFIG_SCHED_MM_CID
10011001
tsk->mm_cid.cid = MM_CID_UNSET;
10021002
tsk->mm_cid.active = 0;
1003+
INIT_HLIST_NODE(&tsk->mm_cid.node);
10031004
#endif
10041005
return tsk;
10051006

kernel/sched/core.c

Lines changed: 12 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -10620,65 +10620,32 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc
1062010620
}
1062110621
}
1062210622

10623-
static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
10623+
static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
1062410624
{
1062510625
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
1062610626
guard(task_rq_lock)(t);
10627-
/* If the task is not active it is not in the users count */
10628-
if (!t->mm_cid.active)
10629-
return false;
1063010627
if (cid_on_task(t->mm_cid.cid)) {
1063110628
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
1063210629
if (task_rq(t)->curr == t)
1063310630
mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
1063410631
else
1063510632
mm_unset_cid_on_task(t);
1063610633
}
10637-
return true;
1063810634
}
1063910635

10640-
static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
10636+
static void mm_cid_fixup_tasks_to_cpus(void)
1064110637
{
10642-
struct task_struct *p, *t;
10643-
unsigned int users;
10644-
10645-
/*
10646-
* This can obviously race with a concurrent affinity change, which
10647-
* increases the number of allowed CPUs for this mm, but that does
10648-
* not affect the mode and only changes the CID constraints. A
10649-
* possible switch back to per task mode happens either in the
10650-
* deferred handler function or in the next fork()/exit().
10651-
*
10652-
* The caller has already transferred so remove it from the users
10653-
* count. The incoming task is already visible and has mm_cid.active,
10654-
* but has task::mm_cid::cid == UNSET. Still it needs to be accounted
10655-
* for. Concurrent fork()s might add more threads, but all of them have
10656-
* task::mm_cid::active = 0, so they don't affect the accounting here.
10657-
*/
10658-
users = mm->mm_cid.users - 1;
10659-
10660-
guard(rcu)();
10661-
for_other_threads(current, t) {
10662-
if (mm_cid_fixup_task_to_cpu(t, mm))
10663-
users--;
10664-
}
10638+
struct mm_struct *mm = current->mm;
10639+
struct task_struct *t;
1066510640

10666-
if (!users)
10667-
return;
10641+
lockdep_assert_held(&mm->mm_cid.mutex);
1066810642

10669-
/* Happens only for VM_CLONE processes. */
10670-
for_each_process_thread(p, t) {
10671-
if (t == current || t->mm != mm)
10672-
continue;
10673-
mm_cid_fixup_task_to_cpu(t, mm);
10643+
hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
10644+
/* Current has already transferred before invoking the fixup. */
10645+
if (t != current)
10646+
mm_cid_fixup_task_to_cpu(t, mm);
1067410647
}
10675-
}
10676-
10677-
static void mm_cid_fixup_tasks_to_cpus(void)
10678-
{
10679-
struct mm_struct *mm = current->mm;
1068010648

10681-
mm_cid_do_fixup_tasks_to_cpus(mm);
1068210649
mm_cid_complete_transit(mm, MM_CID_ONCPU);
1068310650
}
1068410651

@@ -10687,6 +10654,7 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
1068710654
lockdep_assert_held(&mm->mm_cid.lock);
1068810655

1068910656
t->mm_cid.active = 1;
10657+
hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
1069010658
mm->mm_cid.users++;
1069110659
return mm_update_max_cids(mm);
1069210660
}
@@ -10744,6 +10712,7 @@ static bool sched_mm_cid_remove_user(struct task_struct *t)
1074410712
/* Clear the transition bit */
1074510713
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
1074610714
mm_unset_cid_on_task(t);
10715+
hlist_del_init(&t->mm_cid.node);
1074710716
t->mm->mm_cid.users--;
1074810717
return mm_update_max_cids(t->mm);
1074910718
}
@@ -10886,6 +10855,7 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
1088610855
mutex_init(&mm->mm_cid.mutex);
1088710856
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
1088810857
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
10858+
INIT_HLIST_HEAD(&mm->mm_cid.user_list);
1088910859
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
1089010860
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
1089110861
}

0 commit comments

Comments (0)