
Commit cc4448d

etsal authored and htejun committed
tools/sched_ext: add scx_userland scheduler
Add the scx_userland scheduler, which does vruntime-based scheduling in
user-space code and communicates scheduling decisions to BPF by accessing
and modifying globals through the skeleton.

Cc: Tejun Heo <tj@kernel.org>
Cc: David Vernet <dvernet@meta.com>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 2f8d489 commit cc4448d

4 files changed

Lines changed: 799 additions & 1 deletion
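
The "globals through the skeleton" mechanism in the commit message amounts to the user-space program setting the BPF program's `const volatile` globals before load and reading/writing its mutable globals afterwards. Below is a minimal sketch of that flow; it assumes the libbpf-generated skeleton header scx_userland.bpf.skel.h and standard libbpf calls, and is not the scx_userland.c that this commit actually adds (that file is outside this excerpt).

/* Illustrative sketch only -- NOT the scx_userland.c added by this commit. */
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "scx_userland.bpf.skel.h"	/* libbpf-generated skeleton (assumed) */

int main(void)
{
	struct scx_userland *skel;
	struct bpf_link *ops_link;

	skel = scx_userland__open();
	if (!skel)
		return 1;

	/* const volatile globals live in .rodata and must be set before load. */
	skel->rodata->usersched_pid = getpid();
	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();

	if (scx_userland__load(skel))
		return 1;

	/* Registering the userland_ops struct_ops hands scheduling to this program. */
	ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops);
	if (!ops_link)
		return 1;

	/* Mutable globals are visible through .bss after load. */
	printf("queued=%llu scheduled=%llu\n",
	       (unsigned long long)skel->bss->nr_queued,
	       (unsigned long long)skel->bss->nr_scheduled);

	bpf_link__destroy(ops_link);
	scx_userland__destroy(skel);
	return 0;
}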

File tree

tools/sched_ext/Makefile

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
 	$(BINDIR)/%: \

tools/sched_ext/scx_userland.bpf.c

Lines changed: 344 additions & 0 deletions
@@ -0,0 +1,344 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A minimal userland scheduler.
 *
 * In terms of scheduling, this provides two different types of behaviors:
 * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.
 *    All such tasks are direct-dispatched from the kernel, and are never
 *    enqueued in user space.
 * 2. A primitive vruntime scheduler that is implemented in user space, for all
 *    other tasks.
 *
 * Some parts of this example user space scheduler could be implemented more
 * efficiently using more complex and sophisticated data structures. For
 * example, rather than using BPF_MAP_TYPE_QUEUE's,
 * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between
 * user space and kernel space. Similarly, we use a simple vruntime-sorted list
 * in user space, but an rbtree could be used instead.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>
#include "scx_userland.h"

/*
 * Maximum number of tasks enqueued/dispatched between kernel and user space.
 */
#define MAX_ENQUEUED_TASKS 4096

char _license[] SEC("license") = "GPL";

const volatile s32 usersched_pid;

/* !0 for veristat, set during init */
const volatile u32 num_possible_cpus = 64;

/* Stats that are printed by user space. */
u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;

/*
 * Number of tasks that are queued for scheduling.
 *
 * This number is incremented by the BPF component when a task is queued to the
 * user-space scheduler and it must be decremented by the user-space scheduler
 * when a task is consumed.
 */
volatile u64 nr_queued;

/*
 * Number of tasks that are waiting for scheduling.
 *
 * This number must be updated by the user-space scheduler to keep track of
 * whether there is still scheduling work to do.
 */
volatile u64 nr_scheduled;

UEI_DEFINE(uei);

/*
 * The map containing tasks that are enqueued in user space from the kernel.
 *
 * This map is drained by the user space scheduler.
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, MAX_ENQUEUED_TASKS);
	__type(value, struct scx_userland_enqueued_task);
} enqueued SEC(".maps");

/*
 * The map containing tasks that are dispatched to the kernel from user space.
 *
 * Drained by the kernel in userland_dispatch().
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, MAX_ENQUEUED_TASKS);
	__type(value, s32);
} dispatched SEC(".maps");

/* Per-task scheduling context */
struct task_ctx {
	bool force_local; /* Dispatch directly to local DSQ */
};

/* Map that contains task-local storage. */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/*
 * Flag used to wake up the user-space scheduler.
 */
static volatile u32 usersched_needed;

/*
 * Set the user-space scheduler wake-up flag (equivalent to an atomic release
 * operation).
 */
static void set_usersched_needed(void)
{
	__sync_fetch_and_or(&usersched_needed, 1);
}

/*
 * Check and clear the user-space scheduler wake-up flag (equivalent to an
 * atomic acquire operation).
 */
static bool test_and_clear_usersched_needed(void)
{
	return __sync_fetch_and_and(&usersched_needed, 0) == 1;
}

static bool is_usersched_task(const struct task_struct *p)
{
	return p->pid == usersched_pid;
}

static bool keep_in_kernel(const struct task_struct *p)
{
	return p->nr_cpus_allowed < num_possible_cpus;
}

static struct task_struct *usersched_task(void)
{
	struct task_struct *p;

	p = bpf_task_from_pid(usersched_pid);
	/*
	 * Should never happen -- the usersched task should always be managed
	 * by sched_ext.
	 */
	if (!p)
		scx_bpf_error("Failed to find usersched task %d", usersched_pid);

	return p;
}

s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	if (keep_in_kernel(p)) {
		s32 cpu;
		struct task_ctx *tctx;

		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
		if (!tctx) {
			scx_bpf_error("Failed to look up task-local storage for %s", p->comm);
			return -ESRCH;
		}

		if (p->nr_cpus_allowed == 1 ||
		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
			tctx->force_local = true;
			return prev_cpu;
		}

		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0) {
			tctx->force_local = true;
			return cpu;
		}
	}

	return prev_cpu;
}

static void dispatch_user_scheduler(void)
{
	struct task_struct *p;

	p = usersched_task();
	if (p) {
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		bpf_task_release(p);
	}
}

static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags)
{
	struct scx_userland_enqueued_task task = {};

	task.pid = p->pid;
	task.sum_exec_runtime = p->se.sum_exec_runtime;
	task.weight = p->scx.weight;

	if (bpf_map_push_elem(&enqueued, &task, 0)) {
		/*
		 * If we fail to enqueue the task in user space, put it
		 * directly on the global DSQ.
		 */
		__sync_fetch_and_add(&nr_failed_enqueues, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	} else {
		__sync_fetch_and_add(&nr_user_enqueues, 1);
		set_usersched_needed();
	}
}

void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags)
{
	if (keep_in_kernel(p)) {
		u64 dsq_id = SCX_DSQ_GLOBAL;
		struct task_ctx *tctx;

		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
		if (!tctx) {
			scx_bpf_error("Failed to lookup task ctx for %s", p->comm);
			return;
		}

		if (tctx->force_local)
			dsq_id = SCX_DSQ_LOCAL;
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags);
		__sync_fetch_and_add(&nr_kernel_enqueues, 1);
		return;
	} else if (!is_usersched_task(p)) {
		enqueue_task_in_user_space(p, enq_flags);
	}
}

void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev)
{
	if (test_and_clear_usersched_needed())
		dispatch_user_scheduler();

	bpf_repeat(MAX_ENQUEUED_TASKS) {
		s32 pid;
		struct task_struct *p;

		if (bpf_map_pop_elem(&dispatched, &pid))
			break;

		/*
		 * The task could have exited by the time we get around to
		 * dispatching it. Treat this as a normal occurrence, and simply
		 * move onto the next iteration.
		 */
		p = bpf_task_from_pid(pid);
		if (!p)
			continue;

		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		bpf_task_release(p);
	}
}

/*
 * A CPU is about to change its idle state. If the CPU is going idle, ensure
 * that the user-space scheduler has a chance to run if there is any remaining
 * work to do.
 */
void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle)
{
	/*
	 * Don't do anything if we exit from an idle state, a CPU owner will
	 * be assigned in .running().
	 */
	if (!idle)
		return;
	/*
	 * A CPU is now available, notify the user-space scheduler that tasks
	 * can be dispatched, if there is at least one task waiting to be
	 * scheduled, either queued (accounted in nr_queued) or scheduled
	 * (accounted in nr_scheduled).
	 *
	 * NOTE: nr_queued is incremented by the BPF component, more exactly in
	 * enqueue(), when a task is sent to the user-space scheduler, then
	 * the scheduler drains the queued tasks (updating nr_queued) and adds
	 * them to its internal data structures / state; at this point tasks
	 * become "scheduled" and the user-space scheduler will take care of
	 * updating nr_scheduled accordingly; lastly tasks will be dispatched
	 * and the user-space scheduler will update nr_scheduled again.
	 *
	 * Checking both counters allows us to determine whether there is still
	 * some pending work to do for the scheduler: new tasks have been
	 * queued since the last check, or there are still tasks "queued" or
	 * "scheduled" since the previous user-space scheduler run. If the
	 * counters are both zero it is pointless to wake up the scheduler
	 * (even if a CPU becomes idle), because there is nothing to do.
	 *
	 * Keep in mind that update_idle() doesn't run concurrently with the
	 * user-space scheduler (which is single-threaded): this function is
	 * naturally serialized with the user-space scheduler code, therefore
	 * this check here is also safe from a concurrency perspective.
	 */
	if (nr_queued || nr_scheduled) {
		/*
		 * Kick the CPU to make it immediately ready to accept
		 * dispatched tasks.
		 */
		set_usersched_needed();
		scx_bpf_kick_cpu(cpu, 0);
	}
}

s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

s32 BPF_STRUCT_OPS(userland_init)
{
	if (num_possible_cpus == 0) {
		scx_bpf_error("User scheduler # CPUs uninitialized (%d)",
			      num_possible_cpus);
		return -EINVAL;
	}

	if (usersched_pid <= 0) {
		scx_bpf_error("User scheduler pid uninitialized (%d)",
			      usersched_pid);
		return -EINVAL;
	}

	return 0;
}

void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(userland_ops,
	       .select_cpu = (void *)userland_select_cpu,
	       .enqueue = (void *)userland_enqueue,
	       .dispatch = (void *)userland_dispatch,
	       .update_idle = (void *)userland_update_idle,
	       .init_task = (void *)userland_init_task,
	       .init = (void *)userland_init,
	       .exit = (void *)userland_exit,
	       .flags = SCX_OPS_ENQ_LAST |
			SCX_OPS_KEEP_BUILTIN_IDLE,
	       .name = "userland");
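
On the user-space side, the enqueued/dispatched queue maps and the nr_queued/nr_scheduled counters above imply a loop along these lines: drain enqueued, account the tasks, order them by vruntime, then push pids into dispatched for userland_dispatch() to consume. The sketch below is illustrative only and is not the scx_userland.c added by this commit (that file is outside this excerpt); schedule_once() and its parameters are hypothetical names, and the vruntime-sorted structure is elided.

/*
 * Illustrative sketch only -- NOT the scx_userland.c added by this commit.
 * enqueued_fd and dispatched_fd are bpf_map__fd() handles to the two
 * BPF_MAP_TYPE_QUEUE maps above; nr_queued and nr_scheduled point at the
 * skeleton's .bss globals of the same names.
 */
#include <bpf/bpf.h>
#include "scx_userland.h"	/* struct scx_userland_enqueued_task */

static void schedule_once(int enqueued_fd, int dispatched_fd,
			  volatile __u64 *nr_queued,
			  volatile __u64 *nr_scheduled)
{
	struct scx_userland_enqueued_task task;

	/* Drain every task the BPF enqueue path pushed into "enqueued". */
	while (!bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) {
		/*
		 * The task is now owned by the user-space scheduler: it is no
		 * longer "queued" and becomes "scheduled". A real scheduler
		 * would insert it into a vruntime-ordered list here, derived
		 * from task.sum_exec_runtime and task.weight.
		 */
		__sync_fetch_and_sub(nr_queued, 1);
		__sync_fetch_and_add(nr_scheduled, 1);

		/* Simplified: dispatch immediately, in pop order. */
		if (!bpf_map_update_elem(dispatched_fd, NULL, &task.pid, 0))
			__sync_fetch_and_sub(nr_scheduled, 1);
	}
}

From user space, a BPF_MAP_TYPE_QUEUE is popped with bpf_map_lookup_and_delete_elem() and pushed with bpf_map_update_elem() using a NULL key, which is why neither map above declares a key type.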
