-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscx_me.bpf.c
1261 lines (1097 loc) · 41.5 KB
/
scx_me.bpf.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* SPDX-License-Identifier: GPL-2.0 */
/*
* A simple five-level FIFO queue scheduler.
*
* There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
* assigned to one depending on its compound weight. Each CPU round robins
* through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
* queue0, 2 from queue1, 4 from queue2 and so on.
*
* This scheduler demonstrates:
*
* - BPF-side queueing using PIDs.
* - Sleepable per-task storage allocation using ops.init_task().
* - Using ops.cpu_release() to handle a higher priority scheduling class taking
* the CPU away.
* - Core-sched support.
*
* This scheduler is primarily for demonstration and testing of sched_ext
* features and unlikely to be useful for actual workloads.
*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <scx/common.bpf.h>
#include <string.h>
/* File-wide constants. */
enum consts {
ONE_SEC_IN_NS = 1000000000,// number of nanoseconds in one second, for time arithmetic
SHARED_DSQ = 0,// ID of the shared dispatch queue (DSQ) used by this scheduler
};
/*
 * License string and scheduler tunables. The "const volatile" variables are
 * read-only from the BPF side; presumably they are filled in by the loader
 * before the program is attached (the loader is not part of this file).
 */
char _license[] SEC("license") = "GPL";
const volatile u64 slice_ns = SCX_SLICE_DFL;// time slice passed to scx_bpf_dispatch()
/* If non-zero, qmap_enqueue() silently drops every Nth user-task enqueue. */
const volatile u32 stall_user_nth;
/* If non-zero, qmap_enqueue() silently drops every Nth kernel-thread enqueue. */
const volatile u32 stall_kernel_nth;
/* If non-zero, qmap_dispatch() keeps dispatching PID 2 once nr_dispatched exceeds this. */
const volatile u32 dsp_inf_loop_after;
/* Number of tasks qmap_dispatch() moves per invocation; 0 is treated as 1. */
const volatile u32 dsp_batch;
const volatile bool print_shared_dsq;// not referenced in the visible part of the file; presumably enables printing of the shared DSQ
/* comm string compared against tasks on SHARED_DSQ in consume_shared_dsq(); empty disables the scan. */
const volatile char exp_prefix[17];
/* Tasks whose tgid equals this value get p->scx.disallow set in qmap_init_task(). */
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;// when set, the qmap_dump*() callbacks return without output
/* Selects the context-switch-based weight (update_task_weight()) instead of p->scx.weight. */
const volatile bool cpu_strat;
const u64 min_wait_time = 1000000; // 1,000,000 ns = 1 ms (an earlier "5 ms" note did not match the value); unit used to normalize wait times in update_task_weight()
u32 test_error_cnt;// fault injection: qmap_enqueue() raises scx_bpf_error() when this counts down to zero
UEI_DEFINE(uei);// macro from the scx headers; presumably defines the user exit info record (confirm against scx/common.bpf.h)
// Each qmap_me map is a BPF_MAP_TYPE_QUEUE, i.e. a first-in-first-out queue used to hold waiting tasks.
// A queue holds up to 4096 u32 entries (task PIDs). Five instances, queue0 to queue4, are defined, one per priority level.
struct qmap_me {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(max_entries, 4096);
__type(value, u32);
} queue0 SEC(".maps"),
queue1 SEC(".maps"),
queue2 SEC(".maps"),
queue3 SEC(".maps"),
queue4 SEC(".maps");
/*
 * Array-of-maps that lets the code pick one of the five FIFOs by index at
 * run time (see the bpf_map_lookup_elem(&queue_arr, ...) calls).
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(max_entries, 5);
__type(key, int);// the key is the queue index (0..4)
__array(values, struct qmap_me);
} queue_arr SEC(".maps") = {
.values = {
[0] = &queue0,
[1] = &queue1,
[2] = &queue2,
[3] = &queue3,
[4] = &queue4,
},
};
/*
* If enabled, CPU performance target is set according to the queue index
* according to the following table.
*/
// CPU performance target per queue index.
// Each target is a fraction of SCX_CPUPERF_ONE that grows with the queue index,
// so a higher queue index maps to a higher requested performance level.
static const u32 qidx_to_cpuperf_target[] = {
[0] = SCX_CPUPERF_ONE * 0 / 4,// lowest performance
[1] = SCX_CPUPERF_ONE * 1 / 4,
[2] = SCX_CPUPERF_ONE * 2 / 4,
[3] = SCX_CPUPERF_ONE * 3 / 4,
[4] = SCX_CPUPERF_ONE * 4 / 4,// highest performance
};
/*
* Per-queue sequence numbers to implement core-sched ordering.
*
* Tail seq is assigned to each queued task and incremented. Head seq tracks the
* sequence number of the latest dispatched task. The distance between a
* task's seq and the associated queue's head seq is called the queue distance
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
// core_sched_head_seqs and core_sched_tail_seqs implement task ordering for core-sched.
// Each of the five queues has one head sequence number and one tail sequence number.
// tail seq: handed to every task as it is enqueued, then incremented (see qmap_enqueue()).
// head seq: set to the sequence number of the task most recently dispatched from that queue
// (see update_core_sched_head_seq()).
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];
// Example: a task enqueued on queue0 takes the current core_sched_tail_seqs[0] as its
// sequence number, after which core_sched_tail_seqs[0] grows by 1.
// task_qdist() turns the difference between a task's sequence number and its queue's head
// seq into the "queue distance" used to order tasks.
/* Per-task scheduling context */
struct task_ctx {
bool force_local;// set by qmap_select_cpu(); makes qmap_enqueue() dispatch directly to the local DSQ, bypassing the FIFOs
u64 core_sched_seq;// sequence number assigned at enqueue time, used for core-sched ordering
};
// Task-local storage holding one struct task_ctx per task.
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);// declared key type; lookups in this file go through bpf_task_storage_get() with a task pointer
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
/* Per-CPU dispatch state, stored in cpu_ctx_stor. */
struct cpu_ctx {
u64 dsp_idx;// dispatch index: which FIFO this CPU is currently dispatching from
u64 dsp_cnt;// remaining count: how many more tasks may be taken from the current FIFO
u32 avg_weight;// running average of task weights, updated in qmap_tick()
u32 cpuperf_target;// performance target most recently passed to scx_bpf_cpuperf_set()
};
/*
 * Per-task context-switch bookkeeping used when cpu_strat is enabled.
 * Maintained by update_task_weight().
 */
struct task_csw{
u32 willing_csw;// last observed voluntary context-switch count (p->nvcsw)
u32 unwilling_csw;// last observed involuntary context-switch count (p->nivcsw)
u32 static_weight;// weight derived from the task's static priority via nice_to_weight_map
u32 weight;// current dynamic weight; never decays below static_weight
u64 last_time;// bpf_ktime_get_ns() timestamp of the previous update
};
/*
 * Strategy: remember the previous voluntary and involuntary context-switch
 * counts. If the last switch was voluntary the task drifts toward a
 * lower-priority queue; otherwise its priority is raised according to how long
 * ago it was last handled - the longer the gap, the higher the priority.
 */
// Per-CPU storage for struct cpu_ctx: a one-element per-CPU array, always looked up with key 0.
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");
/*
 * Task-local storage holding one struct task_csw per task. Entries are
 * created in qmap_init_task().
 */
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct task_csw);
} task_csw_map SEC(".maps");
// Memory statistics for a task, filled in by update_task_memory_info().
struct task_memory_info {
u64 rss; // resident set size in pages, computed as file_rss + anon_rss
u64 total_vm; // total virtual memory, in pages
u64 anon_rss; // anonymous pages
u64 file_rss; // file-mapped pages
u64 swap_usage; // swap entries, in pages
u64 pgfault; // minor page faults (task min_flt)
u64 pgmajfault; // major page faults (task maj_flt)
};
/*
 * Task-local storage holding one struct task_memory_info per task.
 * NOTE(review): the code that would create entries in qmap_init_task() is
 * commented out, so lookups without the create flag are expected to fail.
 */
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct task_memory_info);
} task_mem_map SEC(".maps");
/*
 * Weight for each of the 40 nice levels. update_task_weight() indexes it with
 * (static_prio - 100) clamped to 0..39, so entry 0 is the heaviest weight and
 * entry 20 is 1024. An earlier BPF_MAP_TYPE_ARRAY version of this table was
 * replaced by this read-only array.
 */
static const u32 nice_to_weight_map[40] SEC(".rodata") = {
88761, 71755, 56483, 46273, 36291,
29154, 23254, 18705, 14949, 11916,
9548, 7620, 6100, 4904, 3906,
3121, 2501, 1991, 1586, 1277,
1024, 820, 655, 526, 423,
335, 272, 215, 172, 137,
110, 87, 70, 56, 45,
36, 29, 23, 18, 15
};
/* Statistics */
/* Event counters bumped with __sync_fetch_and_add() in the enqueue, dispatch, cpu_release and dequeue paths. */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
/* Dequeues flagged SCX_DEQ_CORE_SCHED_EXEC, and tasks consumed early by the exp_prefix scan. */
u64 nr_core_sched_execed, nr_expedited;
/* cpuperf statistics; not written in the visible part of the file. */
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
/*
 * Pick the CPU a task should be enqueued on.
 *
 * @p:          task being placed
 * @prev_cpu:   CPU the task ran on previously
 * @wake_flags: wake-up flags (unused here)
 *
 * Keeps @prev_cpu when the task is pinned to a single CPU or when @prev_cpu
 * can be claimed as idle; in that case the task is also flagged so that
 * qmap_enqueue() dispatches it straight to the local DSQ. Otherwise any idle
 * CPU from the task's allowed mask is returned, falling back to @prev_cpu.
 *
 * Returns the chosen CPU, or -ESRCH when the task has no task_ctx (the lookup
 * is done without the create flag, so the context must already exist).
 */
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	s32 idle_cpu;

	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	/* Pinned task, or the previous CPU is idle: stay put and go local. */
	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		taskc->force_local = true;
		return prev_cpu;
	}

	/* Otherwise prefer any idle CPU the task may run on. */
	idle_cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	return idle_cpu >= 0 ? idle_cpu : prev_cpu;
}
/*
 * Coarsely map a sched_ext compound weight to one of the five FIFOs.
 *
 *   weight <= 25        -> 0
 *   26  .. 50           -> 1
 *   51  .. 199          -> 2
 *   200 .. 399          -> 3
 *   weight >= 400       -> 4
 */
static int weight_to_idx(u32 weight)
{
	if (weight >= 400)
		return 4;
	if (weight >= 200)
		return 3;
	if (weight > 50)
		return 2;
	if (weight > 25)
		return 1;
	return 0;
}
/*
 * Map a nice-derived task weight (see nice_to_weight_map) to one of the five
 * FIFOs. A nice-0 task has weight 1024 and therefore lands in queue 1.
 *
 *   weight <= 800       -> 0  (low-priority background tasks)
 *   801  .. 2000        -> 1  (normal-priority tasks)
 *   2001 .. 4000        -> 2
 *   4001 .. 8000        -> 3
 *   weight > 8000       -> 4
 */
static int weight_to_idx_task(u32 weight){
	if (weight > 8000)
		return 4;
	if (weight > 4000)
		return 3;
	if (weight > 2000)
		return 2;
	if (weight > 800)
		return 1;
	return 0;
}
static void* update_task_weight(struct task_struct *p, u64 enq_flags){
if (!p)
return NULL; // 参数无效
struct task_csw *t_csw = bpf_task_storage_get(&task_csw_map, p, 0, 0);
if (!t_csw){
scx_bpf_error("task_csw lookup failed");
return NULL;
}
//u32 weight = p->scx.weight;
u32 willing_csw = p->nvcsw;
u32 unwilling_csw = p->nivcsw;
u64 cur_time = bpf_ktime_get_ns();
t_csw->willing_csw = willing_csw;
if(unwilling_csw == 0 || t_csw->weight == 0 )
{
t_csw->unwilling_csw = unwilling_csw;
int prio = (int)p->static_prio - 100;
//bpf_printk("%llu task prio is %llu",p->pid,prio);
if(prio > 39)
prio = 39;
if(prio < 0)
prio = 0;
/*u32 *weight = bpf_map_lookup_elem(&nice_to_weight_map, &prio);
if (weight) {
t_csw->weight = *weight;
t_csw->static_weight = *weight;
} else {
// 处理查找失败的情况,可以给一个默认值
t_csw->weight = 1024; // 这是一个默认的权重
t_csw->static_weight = 1024;
}*/
//bpf_printk("%llu",nice_to_weight_map[20]);
t_csw->static_weight = nice_to_weight_map[prio];
t_csw->weight = nice_to_weight_map[prio];
t_csw->last_time = cur_time;
//bpf_printk("%llu task weight is %llu, and wait %llu",p->pid,t_csw->weight,wait_time);
//bpf_printk("%llu task weight is %llu,map value is %llu",p->pid,t_csw->weight,nice_to_weight_map[prio]);
return t_csw;
}
else{
u64 wait_time = cur_time - t_csw->last_time;
//bpf_printk("%llu task weight is %llu, and wait %llu",p->pid,t_csw->weight,wait_time);
t_csw->last_time = cur_time;
if(t_csw->unwilling_csw != unwilling_csw){ // 上一次是非自愿上下文切换
t_csw->unwilling_csw = unwilling_csw;
u32 normalized_wait_time = wait_time / min_wait_time;
if(p->pid >= 3800 && p->pid <= 4000)
bpf_printk("%llu task normalized_wait is %llu",p->pid,normalized_wait_time);
bpf_printk("%llu task normalized_wait is %llu",p->pid,normalized_wait_time);
//t_csw->weight = t_csw->weight * 11/10;
/*if(enq_flags & SCX_ENQ_WAKEUP){
if(t_csw->weight * 11 / 10 >= 8000)
{
t_csw->weight = 7900; // 保证任务不会抢占一些最高优先级的任务
}
else{
t_csw->weight = t_csw->weight * 11 / 10;
}
}
else{*/
/*if(t_csw->weight * 103 / 100 >= 8000)
{
t_csw->weight = 7900; // 保证任务不会抢占一些最高优先级的任务
}
else{
t_csw->weight = t_csw->weight * 103 / 10;
}*/
if(normalized_wait_time >= 1 && normalized_wait_time <=5){
if((t_csw->weight * 103) / 100 >= 8000 && t_csw->static_weight < 8000)
{
t_csw->weight = 7900; // 保证任务不会抢占一些最高优先级的任务
}
else{
t_csw->weight = t_csw->weight * 103 / 100;
}
}
else if(normalized_wait_time < 10){
if((t_csw->weight * 105) / 100 >= 8000 && t_csw->static_weight < 8000)
{
t_csw->weight = 7900; // 保证任务不会抢占一些最高优先级的任务
}
else{
t_csw->weight = t_csw->weight * 105 / 100;
}
}
else{
if((t_csw->weight * 108) / 100 >= 8000 && t_csw->static_weight < 8000)
{
t_csw->weight = 7900; // 保证任务不会抢占一些最高优先级的任务
}
else{
t_csw->weight = t_csw->weight * 108 / 100;
}
}
// }
}
else
{
t_csw->unwilling_csw = unwilling_csw;
/*if(enq_flags & SCX_ENQ_WAKEUP){
if((t_csw->weight * 108) / 100 >= 8000 && t_csw->static_weight < 8000)
{
t_csw->weight = 7900; // 保证任务不会抢占一些最高优先级的任务
}
else{
t_csw->weight = t_csw->weight * 108 / 10;
}
}
else{*/
if((t_csw->weight * 97) / 100 > t_csw->static_weight){
t_csw->weight = t_csw->weight * 97 / 100;
bpf_printk("%llu task weight down, now is %llu",p->pid,t_csw->weight);
}else{
t_csw->weight = t_csw->static_weight;
}
//}
}
return t_csw;
}
}
static s32 update_task_memory_info(struct task_struct *task)
{
if (!task)
return -EINVAL; // 参数无效
// 从 task_struct 中读取 PID
/* u32 pid = BPF_CORE_READ(task, pid);
if (!pid)
return -ESRCH; // 未找到进程*/
// 获取任务的 mm_struct
struct mm_struct *mm = BPF_CORE_READ(task, mm);
if (!mm) {
bpf_printk("Task %u has no associated mm_struct", task->pid);
return -ESRCH; // 任务没有关联的内存描述符
}
// 在 Map 中查找对应的 task_memory_info
struct task_memory_info *mem_info = bpf_task_storage_get(&task_mem_map, task, 0, 0);
if (!mem_info){
bpf_printk("mem_info is NULL for pid: %u", task->pid);
return -ENOENT; // 在 Map 中未找到对应的内存信息
}
// 从 mm_struct 中读取内存统计信息
/* u64 file_rss = BPF_CORE_READ(mm, rss_stat.count[MM_FILEPAGES]);
u64 anon_rss = BPF_CORE_READ(mm, rss_stat.count[MM_ANONPAGES]);
u64 swap_usage = BPF_CORE_READ(mm, rss_stat.count[MM_SWAPENTS]);*/
// 获取内存页数(通过 rss_stat 访问匿名页、文件页、交换分区)
u64 total_vm = BPF_CORE_READ(mm, total_vm);
u64 file_rss = 0, anon_rss = 0, swap_usage = 0;
struct percpu_counter *rss_stat = BPF_CORE_READ(mm, rss_stat);
if (!rss_stat) {
bpf_printk("rss_stat is NULL for pid: %u", task->pid);
return -ESRCH;
}
if (rss_stat) {
// 通过 rss_stat 读取匿名页、文件页、交换页等内存统计信息
bpf_probe_read_kernel(&file_rss, sizeof(u64), &rss_stat[MM_FILEPAGES].count);
bpf_probe_read_kernel(&anon_rss, sizeof(u64), &rss_stat[MM_ANONPAGES].count);
bpf_probe_read_kernel(&swap_usage, sizeof(u64), &rss_stat[MM_SWAPENTS].count);
}
// 更新 mem_info 结构体
mem_info->file_rss = file_rss;
mem_info->anon_rss = anon_rss;
mem_info->rss = file_rss + anon_rss;
mem_info->swap_usage = swap_usage;
mem_info->total_vm = total_vm;
// 从 task_struct 中读取页面故障计数
mem_info->pgfault = BPF_CORE_READ(task, min_flt);
mem_info->pgmajfault = BPF_CORE_READ(task, maj_flt);
// 输出存储的信息
bpf_printk("Updated memory info for pid %u:", task->pid);
//bpf_printk(" total_vm=%llu", mem_info->total_vm);
bpf_printk(" RSS=%llu total_vm=%llu", mem_info->rss, mem_info->total_vm);
bpf_printk(" anon_rss=%llu file_rss=%llu", mem_info->anon_rss, mem_info->file_rss);
bpf_printk(" swap_usage=%llu pgfault=%llu pgmajfault=%llu",
mem_info->swap_usage, mem_info->pgfault, mem_info->pgmajfault);
return 0; // 成功
}
/*
static void print_queue_head_mm(void)
{
int i;
bpf_printk("Begin printing queue head memory info__________________\n");
// Unroll the loop to satisfy the eBPF verifier
//#pragma unroll
for (i = 0; i < 5; i++) {
int key = i;
void *queue_map;
u32 pid;
int ret;
// Get the queue map from the array of maps
queue_map = bpf_map_lookup_elem(&queue_arr, &key);
if (!queue_map) {
bpf_printk("Queue %d not found\n", i);
continue;
}
// Peek at the first element in the queue
ret = bpf_map_peek_elem(queue_map, &pid);
if (ret != 0) {
bpf_printk("Queue %d is empty\n", i);
continue;
}
// Look up the task_struct using pid
struct task_struct *task = bpf_task_from_pid(pid);
if (!task) {
bpf_printk("Failed to get task_struct for pid %u\n", pid);
continue;
}
// Get task memory info using task_struct
struct task_memory_info *mem_info = bpf_task_storage_get(&task_mem_map, task, 0, 0);
if (!mem_info) {
bpf_printk("No memory info for pid %u\n", pid);
bpf_task_release(task); // Make sure to release task_struct reference
continue;
}
// Output the memory information
bpf_printk("Queue %d head pid %u:", i, pid);
bpf_printk(" RSS=%llu total_vm=%llu", mem_info->rss, mem_info->total_vm);
bpf_printk(" anon_rss=%llu file_rss=%llu", mem_info->anon_rss, mem_info->file_rss);
bpf_printk(" swap_usage=%llu pgfault=%llu pgmajfault=%llu",
mem_info->swap_usage, mem_info->pgfault, mem_info->pgmajfault);
// Ensure the task_struct reference is released after usage
bpf_task_release(task);
}
}*/
/*
 * Enqueue @p on the appropriate queue.
 *
 * @p:         task being enqueued
 * @enq_flags: SCX_ENQ_* flags for this enqueue
 *
 * The target FIFO index comes from the context-switch-based weight when
 * cpu_strat is set (falling back to p->scx.weight if that lookup fails), and
 * from p->scx.weight otherwise. The task is then either dispatched directly
 * (local DSQ or SHARED_DSQ) or its PID is pushed onto the selected FIFO.
 */
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
struct task_ctx *tctx;
u32 pid = p->pid;
int idx;
if(cpu_strat){
struct task_csw *t_csw = update_task_weight(p,enq_flags);
if(!t_csw){
idx = weight_to_idx(p->scx.weight);
}
else{
idx = weight_to_idx_task(t_csw->weight);
}
}else{
idx = weight_to_idx(p->scx.weight);
}
void *ring;
/*
 * Fault injection: when the matching stall_*_nth tunable is non-zero, every
 * Nth kernel-thread (or user-task) enqueue returns without queueing the task.
 */
if (p->flags & PF_KTHREAD) {
if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
return; // deliberately skip this kernel-thread enqueue
} else {
if (stall_user_nth && !(++user_cnt % stall_user_nth))
return; // deliberately skip this user-task enqueue
}
/* Fault injection: raise an error when test_error_cnt counts down to zero. */
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
/* Fetch the per-task context; it is created in qmap_init_task(). */
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
if (!tctx) {
scx_bpf_error("task_ctx lookup failed");
return;
}
/*
 * All enqueued tasks must have their core_sched_seq updated for correct
 * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in
 * qmap_ops.flags.
 */
/* Take the queue's current tail sequence number, then advance the tail. */
tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
/*
 * If qmap_select_cpu() is telling us to or this is the last runnable
 * task on the CPU, enqueue locally.
 */
if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) {
tctx->force_local = false;
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
/*
 * Tasks flagged SCX_ENQ_PREEMPT go straight onto SHARED_DSQ with a zero
 * slice, and an idle CPU (if any) is kicked so that it picks the task up.
 *
 * NOTE(review): this branch previously tested SCX_ENQ_REENQ, to handle tasks
 * re-enqueued after their CPU was taken by a higher priority scheduling
 * class. qmap_cpu_release() still re-enqueues local tasks and refers to
 * that handling; with the check changed, such tasks now take the normal FIFO
 * path below. Confirm which flag is intended.
 */
if (enq_flags & SCX_ENQ_PREEMPT) {
s32 cpu;
scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);// put the task on the shared DSQ
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);// look for an idle CPU the task may run on
if (cpu >= 0)
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
return;
}
/* Look up the FIFO for the chosen index. */
ring = bpf_map_lookup_elem(&queue_arr, &idx);
if (!ring) {
scx_bpf_error("failed to find ring %d", idx);
return;
}
/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
if (bpf_map_push_elem(ring, &pid, 0)) {
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
return;
}
__sync_fetch_and_add(&nr_enqueued, 1);// count the successful FIFO enqueue
}
/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics: every call
 * bumps nr_dequeued, and calls flagged SCX_DEQ_CORE_SCHED_EXEC additionally
 * bump nr_core_sched_execed.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	bool core_sched_exec = deq_flags & SCX_DEQ_CORE_SCHED_EXEC;

	__sync_fetch_and_add(&nr_dequeued, 1);
	if (core_sched_exec)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}
static void update_core_sched_head_seq(struct task_struct *p)
{
// 在map里找是否有任务p的上下文存储
struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
//int idx = weight_to_idx(p->scx.weight);// 获得任务所对应的队列索引
int idx;
if(cpu_strat){
struct task_csw *t_csw = bpf_task_storage_get(&task_csw_map, p, 0, 0);
if(!t_csw){
idx = weight_to_idx(p->scx.weight);
}
else{
idx = weight_to_idx_task(t_csw->weight);
}
}
else{
idx = weight_to_idx(p->scx.weight);
}
// 如果任务压根不在map中,就不需要进一步更新了
if (tctx)
core_sched_head_seqs[idx] = tctx->core_sched_seq;// 感觉这里更新逻辑有点问题,head指向的一直都是队列最前头的任务,而当前这个任务可能并不是队列中最前面的
else
scx_bpf_error("task_ctx lookup failed");
}
/*
 * Consume a task from SHARED_DSQ.
 *
 * With an empty exp_prefix this is a plain scx_bpf_consume(SHARED_DSQ).
 * Otherwise SHARED_DSQ is scanned first and tasks whose comm compares equal
 * to exp_prefix are consumed ahead of the rest.
 *
 * Returns true when at least one task was consumed.
 */
static bool consume_shared_dsq(void)
{
struct task_struct *p;
bool consumed;
/* No prefix configured: consume from the shared DSQ directly. */
if (exp_prefix[0] == '\0')
return scx_bpf_consume(SHARED_DSQ);
/*
 * To demonstrate the use of scx_bpf_consume_task(), implement silly
 * selective priority boosting mechanism by scanning SHARED_DSQ looking
 * for matching comms and consume them first. This makes difference only
 * when dsp_batch is larger than 1.
 */
consumed = false;
__COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, 0) {
char comm[sizeof(exp_prefix)];
/* Copy all but the last byte of the buffer from the task's comm. */
memcpy(comm, p->comm, sizeof(exp_prefix) - 1);
/* Consume the task first when its comm matches exp_prefix. */
if (!bpf_strncmp(comm, sizeof(exp_prefix),
(const char *)exp_prefix) &&
__COMPAT_scx_bpf_consume_task(BPF_FOR_EACH_ITER, p)) {
consumed = true;
__sync_fetch_and_add(&nr_expedited, 1);
}
}
/* Fall back to a regular consume when nothing matched. */
return consumed || scx_bpf_consume(SHARED_DSQ);
}
/*
 * Dispatch callback: refill SHARED_DSQ from the FIFOs and consume from it.
 *
 * @cpu:  CPU asking for work
 * @prev: previously running task (unused here)
 *
 * Tasks popped from the FIFOs are not run directly; they are dispatched to
 * SHARED_DSQ, and consume_shared_dsq() moves work onto this CPU.
 */
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx *cpuc;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
/* Try the shared DSQ first; if it yields a task there is nothing more to do. */
if (consume_shared_dsq())
return;
/* Test hook: once nr_dispatched exceeds dsp_inf_loop_after, keep dispatching PID 2. */
if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
/*
 * PID 2 should be kthreadd which should mostly be idle and off
 * the scheduler. Let's keep dispatching it to force the kernel
 * to call this function over and over again.
 */
p = bpf_task_from_pid(2);
if (p) {
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
bpf_task_release(p);
return;
}
}
/* Fetch this CPU's dispatch cursor. */
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx");
return;
}
/* Visit at most five FIFOs, starting from the current cursor. */
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
if (!cpuc->dsp_cnt) {
cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
/* The budget doubles with the queue index: 1, 2, 4, 8, 16. */
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
/* Resolve the FIFO for the current index. */
fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
if (!fifo) {
scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
return;
}
/* Dispatch or advance. */
/*
 * Pop PIDs until the FIFO runs dry or this queue's budget is used up.
 * The FIFO may hold fewer tasks than the budget allows.
 */
bpf_repeat(BPF_MAX_LOOPS) {
if (bpf_map_pop_elem(fifo, &pid))
break;
p = bpf_task_from_pid(pid);
/* The task is gone; nothing was dispatched, so the budget is untouched. */
if (!p)
continue;
update_core_sched_head_seq(p);
__sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
bpf_task_release(p);
batch--;
cpuc->dsp_cnt--;
/* Batch exhausted or no dispatch slots left: consume and stop. */
if (!batch || !scx_bpf_dispatch_nr_slots()) {
consume_shared_dsq();
return;
}
if (!cpuc->dsp_cnt)
break;
}
/*
 * Drop whatever budget is left so that the next iteration (or the next
 * call) moves on to the following FIFO instead of staying on this one.
 */
cpuc->dsp_cnt = 0;
}
}
/*
 * Tick callback: refresh this CPU's performance target.
 *
 * @p: task running on the ticking CPU
 *
 * Use the running avg of weights to select the target cpuperf level. This is
 * a demonstration of the cpuperf feature rather than a practical strategy to
 * regulate CPU frequency.
 *
 * The average is 3/4 of the previous value plus 1/4 of the current task's
 * weight. When cpu_strat is set and the task has a task_csw entry, that
 * entry's weight and weight_to_idx_task() are used; otherwise p->scx.weight
 * and weight_to_idx().
 */
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct task_csw *csw = NULL;
	struct cpu_ctx *cpuc;
	u32 key = 0;
	u32 sample;
	int qidx;

	cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &key);
	if (!cpuc) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	if (cpu_strat)
		csw = bpf_task_storage_get(&task_csw_map, p, 0, 0);

	sample = csw ? csw->weight : p->scx.weight;
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + sample / 4;

	if (csw)
		qidx = weight_to_idx_task(cpuc->avg_weight);
	else
		qidx = weight_to_idx(cpuc->avg_weight);

	/* Translate the queue index into a target and apply it to the task's CPU. */
	cpuc->cpuperf_target = qidx_to_cpuperf_target[qidx];
	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}
/*
* The distance from the head of the queue scaled by the weight of the queue.
* The lower the number, the older the task and the higher the priority.
*/
// 衡量任务在调度中的相对位置,相对体现在同样的实际长度,优先级低的值根据其优先级来翻倍
static s64 task_qdist(struct task_struct *p)
{
//int idx = weight_to_idx(p->scx.weight);
int idx;
if(cpu_strat){
struct task_csw *t_csw = bpf_task_storage_get(&task_csw_map, p, 0, 0);
if(!t_csw){
idx = weight_to_idx(p->scx.weight);
}
else{
idx = weight_to_idx_task(t_csw->weight);
}
}
else{
idx = weight_to_idx(p->scx.weight);
}
struct task_ctx *tctx;
s64 qdist;
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
if (!tctx) {
scx_bpf_error("task_ctx lookup failed");
return 0;
}
qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
* is dispatched twice more frequently than 2. Reflect the difference by
* scaling qdists accordingly. Note that the shift amount needs to be
* flipped depending on the sign to avoid flipping priority direction.
*/
if (qdist >= 0)
return qdist << (4 - idx);
else
return qdist << idx;
}
/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 *
 * Returns true when @a's scaled queue distance is greater than @b's.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	s64 dist_a = task_qdist(a);
	s64 dist_b = task_qdist(b);

	return dist_a > dist_b;
}
/*
 * Called when @cpu is taken by a higher priority scheduling class. This
 * makes @cpu no longer available for executing sched_ext tasks. As we
 * don't want the tasks in @cpu's local dsq to sit there until @cpu
 * becomes available again, re-enqueue them and add the number of
 * re-enqueued tasks to nr_reenqueued.
 *
 * NOTE(review): the %SCX_ENQ_REENQ handling this used to pair with in
 * qmap_enqueue() is currently commented out there.
 */
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 nr_moved = scx_bpf_reenqueue_local();

	if (!nr_moved)
		return;

	__sync_fetch_and_add(&nr_reenqueued, nr_moved);
}
/*
 * Prepare per-task state for a new task.
 *
 * @p:    the new task
 * @args: init arguments (unused here)
 *
 * Tasks whose tgid matches disallow_tgid are marked as disallowed. The task's
 * task_csw and task_ctx storage is then created. We can sleep in this
 * function and the allocations will automatically use GFP_KERNEL.
 *
 * Returns 0 on success, -ENOMEM when either storage entry cannot be created.
 */
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/* Context-switch bookkeeping used by update_task_weight(). */
	if (!bpf_task_storage_get(&task_csw_map, p, 0,
				  BPF_LOCAL_STORAGE_GET_F_CREATE))
		return -ENOMEM;

	/* @p is new. Let's ensure that its task_ctx is available. */
	if (!bpf_task_storage_get(&task_ctx_stor, p, 0,
				  BPF_LOCAL_STORAGE_GET_F_CREATE))
		return -ENOMEM;

	return 0;
}
/*
 * Dump callback: print the PIDs queued on each of the five FIFOs.
 *
 * Does nothing when suppress_dump is set.
 *
 * NOTE(review): the PIDs are read with bpf_map_pop_elem(), which removes them
 * from the FIFO, and they are not pushed back. An earlier reviewer flagged
 * this; presumably the dump only runs in exceptional situations where losing
 * the queue contents is acceptable - confirm.
 */
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
s32 i, pid;
if (suppress_dump)
return;
bpf_for(i, 0, 5) {
void *fifo;
if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
return;
scx_bpf_dump("QMAP FIFO[%d]:", i);
/* 4096 matches the FIFOs' max_entries, so a full queue is drained. */
bpf_repeat(4096) {
if (bpf_map_pop_elem(fifo, &pid))
break;
scx_bpf_dump(" %d", pid);
}
scx_bpf_dump("\n");
}
}
/*
 * Dump callback for one CPU: print its dispatch cursor, remaining dispatch
 * count, running average weight and cpuperf target.
 *
 * @dctx: dump context (unused here)
 * @cpu:  CPU whose cpu_ctx is printed
 * @idle: whether the CPU is idle
 *
 * Nothing is printed when suppress_dump is set, when the CPU is idle, or when
 * the CPU's cpu_ctx cannot be looked up.
 */
void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	struct cpu_ctx *cpuc;
	u32 key = 0;

	if (suppress_dump)
		return;
	if (idle)
		return;

	cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &key, cpu);
	if (!cpuc)
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
struct task_ctx *taskc;
u32 pid;
struct task_memory_info *mem_info;
if (suppress_dump)
return;
if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
return;
// 获取任务的 PID