Skip to content

Commit 63724e9

Browse files
committed
Merge tag 'sched-urgent-2026-03-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "More MM-CID fixes, mostly fixing hangs/races:

  - Fix CID hangs due to a race between concurrent forks

  - Fix vfork()/CLONE_VM MMCID bug causing hangs

  - Remove pointless preemption guard

  - Fix CID task list walk performance regression on large systems by
    removing the known-flaky and slow counting logic using
    for_each_process_thread() in mm_cid_*fixup_tasks_to_cpus(), and
    implementing a simple sched_mm_cid::node list instead"

* tag 'sched-urgent-2026-03-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/mmcid: Avoid full tasklist walks
  sched/mmcid: Remove pointless preempt guard
  sched/mmcid: Handle vfork()/CLONE_VM correctly
  sched/mmcid: Prevent CID stalls due to concurrent forks
2 parents 9745031 + 192d852 commit 63724e9

4 files changed

Lines changed: 34 additions & 56 deletions

File tree

include/linux/rseq_types.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,12 @@ struct rseq_data { };
133133
* @active: MM CID is active for the task
134134
* @cid: The CID associated to the task either permanently or
135135
* borrowed from the CPU
136+
* @node: Queued in the per MM MMCID list
136137
*/
137138
struct sched_mm_cid {
138139
unsigned int active;
139140
unsigned int cid;
141+
struct hlist_node node;
140142
};
141143

142144
/**
@@ -157,6 +159,7 @@ struct mm_cid_pcpu {
157159
* @work: Regular work to handle the affinity mode change case
158160
* @lock: Spinlock to protect against affinity setting which can't take @mutex
159161
* @mutex: Mutex to serialize forks and exits related to this mm
162+
* @user_list: List of the MM CID users of a MM
160163
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
161164
* is growth only.
162165
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
@@ -177,13 +180,14 @@ struct mm_mm_cid {
177180

178181
raw_spinlock_t lock;
179182
struct mutex mutex;
183+
struct hlist_head user_list;
180184

181185
/* Low frequency modified */
182186
unsigned int nr_cpus_allowed;
183187
unsigned int users;
184188
unsigned int pcpu_thrs;
185189
unsigned int update_deferred;
186-
}____cacheline_aligned_in_smp;
190+
} ____cacheline_aligned;
187191
#else /* CONFIG_SCHED_MM_CID */
188192
struct mm_mm_cid { };
189193
struct sched_mm_cid { };

include/linux/sched.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2354,7 +2354,6 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
23542354
#ifdef CONFIG_SCHED_MM_CID
23552355
void sched_mm_cid_before_execve(struct task_struct *t);
23562356
void sched_mm_cid_after_execve(struct task_struct *t);
2357-
void sched_mm_cid_fork(struct task_struct *t);
23582357
void sched_mm_cid_exit(struct task_struct *t);
23592358
static __always_inline int task_mm_cid(struct task_struct *t)
23602359
{
@@ -2363,7 +2362,6 @@ static __always_inline int task_mm_cid(struct task_struct *t)
23632362
#else
23642363
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
23652364
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
2366-
static inline void sched_mm_cid_fork(struct task_struct *t) { }
23672365
static inline void sched_mm_cid_exit(struct task_struct *t) { }
23682366
static __always_inline int task_mm_cid(struct task_struct *t)
23692367
{

kernel/fork.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
10001000
#ifdef CONFIG_SCHED_MM_CID
10011001
tsk->mm_cid.cid = MM_CID_UNSET;
10021002
tsk->mm_cid.active = 0;
1003+
INIT_HLIST_NODE(&tsk->mm_cid.node);
10031004
#endif
10041005
return tsk;
10051006

@@ -1586,7 +1587,6 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
15861587

15871588
tsk->mm = mm;
15881589
tsk->active_mm = mm;
1589-
sched_mm_cid_fork(tsk);
15901590
return 0;
15911591
}
15921592

@@ -2498,7 +2498,6 @@ __latent_entropy struct task_struct *copy_process(
24982498
exit_nsproxy_namespaces(p);
24992499
bad_fork_cleanup_mm:
25002500
if (p->mm) {
2501-
sched_mm_cid_exit(p);
25022501
mm_clear_owner(p->mm, p);
25032502
mmput(p->mm);
25042503
}

kernel/sched/core.c

Lines changed: 28 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4729,8 +4729,11 @@ void sched_cancel_fork(struct task_struct *p)
47294729
scx_cancel_fork(p);
47304730
}
47314731

4732+
static void sched_mm_cid_fork(struct task_struct *t);
4733+
47324734
void sched_post_fork(struct task_struct *p)
47334735
{
4736+
sched_mm_cid_fork(p);
47344737
uclamp_post_fork(p);
47354738
scx_post_fork(p);
47364739
}
@@ -10617,83 +10620,54 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc
1061710620
}
1061810621
}
1061910622

10620-
static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
10623+
static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
1062110624
{
1062210625
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
1062310626
guard(task_rq_lock)(t);
10624-
/* If the task is not active it is not in the users count */
10625-
if (!t->mm_cid.active)
10626-
return false;
1062710627
if (cid_on_task(t->mm_cid.cid)) {
1062810628
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
1062910629
if (task_rq(t)->curr == t)
1063010630
mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
1063110631
else
1063210632
mm_unset_cid_on_task(t);
1063310633
}
10634-
return true;
1063510634
}
1063610635

10637-
static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
10636+
static void mm_cid_fixup_tasks_to_cpus(void)
1063810637
{
10639-
struct task_struct *p, *t;
10640-
unsigned int users;
10638+
struct mm_struct *mm = current->mm;
10639+
struct task_struct *t;
1064110640

10642-
/*
10643-
* This can obviously race with a concurrent affinity change, which
10644-
* increases the number of allowed CPUs for this mm, but that does
10645-
* not affect the mode and only changes the CID constraints. A
10646-
* possible switch back to per task mode happens either in the
10647-
* deferred handler function or in the next fork()/exit().
10648-
*
10649-
* The caller has already transferred. The newly incoming task is
10650-
* already accounted for, but not yet visible.
10651-
*/
10652-
users = mm->mm_cid.users - 2;
10653-
if (!users)
10654-
return;
10641+
lockdep_assert_held(&mm->mm_cid.mutex);
1065510642

10656-
guard(rcu)();
10657-
for_other_threads(current, t) {
10658-
if (mm_cid_fixup_task_to_cpu(t, mm))
10659-
users--;
10643+
hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
10644+
/* Current has already transferred before invoking the fixup. */
10645+
if (t != current)
10646+
mm_cid_fixup_task_to_cpu(t, mm);
1066010647
}
1066110648

10662-
if (!users)
10663-
return;
10664-
10665-
/* Happens only for VM_CLONE processes. */
10666-
for_each_process_thread(p, t) {
10667-
if (t == current || t->mm != mm)
10668-
continue;
10669-
if (mm_cid_fixup_task_to_cpu(t, mm)) {
10670-
if (--users == 0)
10671-
return;
10672-
}
10673-
}
10674-
}
10675-
10676-
static void mm_cid_fixup_tasks_to_cpus(void)
10677-
{
10678-
struct mm_struct *mm = current->mm;
10679-
10680-
mm_cid_do_fixup_tasks_to_cpus(mm);
1068110649
mm_cid_complete_transit(mm, MM_CID_ONCPU);
1068210650
}
1068310651

1068410652
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
1068510653
{
10654+
lockdep_assert_held(&mm->mm_cid.lock);
10655+
1068610656
t->mm_cid.active = 1;
10657+
hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
1068710658
mm->mm_cid.users++;
1068810659
return mm_update_max_cids(mm);
1068910660
}
1069010661

10691-
void sched_mm_cid_fork(struct task_struct *t)
10662+
static void sched_mm_cid_fork(struct task_struct *t)
1069210663
{
1069310664
struct mm_struct *mm = t->mm;
1069410665
bool percpu;
1069510666

10696-
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
10667+
if (!mm)
10668+
return;
10669+
10670+
WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
1069710671

1069810672
guard(mutex)(&mm->mm_cid.mutex);
1069910673
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
@@ -10732,12 +10706,13 @@ void sched_mm_cid_fork(struct task_struct *t)
1073210706

1073310707
static bool sched_mm_cid_remove_user(struct task_struct *t)
1073410708
{
10709+
lockdep_assert_held(&t->mm->mm_cid.lock);
10710+
1073510711
t->mm_cid.active = 0;
10736-
scoped_guard(preempt) {
10737-
/* Clear the transition bit */
10738-
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
10739-
mm_unset_cid_on_task(t);
10740-
}
10712+
/* Clear the transition bit */
10713+
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
10714+
mm_unset_cid_on_task(t);
10715+
hlist_del_init(&t->mm_cid.node);
1074110716
t->mm->mm_cid.users--;
1074210717
return mm_update_max_cids(t->mm);
1074310718
}
@@ -10880,11 +10855,13 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
1088010855
mutex_init(&mm->mm_cid.mutex);
1088110856
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
1088210857
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
10858+
INIT_HLIST_HEAD(&mm->mm_cid.user_list);
1088310859
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
1088410860
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
1088510861
}
1088610862
#else /* CONFIG_SCHED_MM_CID */
1088710863
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
10864+
static inline void sched_mm_cid_fork(struct task_struct *t) { }
1088810865
#endif /* !CONFIG_SCHED_MM_CID */
1088910866

1089010867
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);

0 commit comments

Comments
 (0)