From 3df0fcb84eec4aaac449a13d26a8e6ae393ac3da Mon Sep 17 00:00:00 2001
From: Dongli Zhang <[email protected]>
Date: Thu, 11 Jul 2019 10:03:15 +0800
Subject: [PATCH 1/1] linux bug report for 5.2
- hung_task, soft-lockup, hard-lockup
- rcu stall
- lockdep
- kmemleak, kasan
Signed-off-by: Dongli Zhang <[email protected]>
---
arch/x86/include/asm/preempt.h | 21 +++++++
include/linux/bottom_half.h | 14 +++++
include/linux/hardirq.h | 8 +++
include/linux/preempt.h | 86 ++++++++++++++++++++++++++++
kernel/hung_task.c | 99 ++++++++++++++++++++++++++++++++
kernel/locking/lockdep.c | 68 ++++++++++++++++++++++
kernel/sched/core.c | 83 +++++++++++++++++++++++++++
kernel/softirq.c | 34 +++++++++++
kernel/watchdog.c | 126 +++++++++++++++++++++++++++++++++++++++++
kernel/watchdog_hld.c | 34 +++++++++++
mm/kmemleak.c | 23 ++++++++
11 files changed, 596 insertions(+)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 99a7fa9..9110e8d 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -6,6 +6,15 @@
#include <asm/percpu.h>
#include <linux/thread_info.h>
+/*
+ * DECLARE_PER_CPU(int, __preempt_count);
+ * Layout of the per-cpu __preempt_count:
+ * - bits 0-7 are the preempt count (PREEMPT_OFFSET)
+ * - bits 8-15 are the softirq count (SOFTIRQ_OFFSET)
+ * - bits 16-19 are the hardirq count (HARDIRQ_OFFSET)
+ * - bit 20 is the nmi count (NMI_OFFSET)
+ * - bit 31 is used as PREEMPT_NEED_RESCHED
+ */
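+/*
+ * Illustrative decoding (not from the original patch), assuming the
+ * 8/8/4/1 bit split above. A raw count of 0x00010102 means: hardirq
+ * depth 1, softirq count 1, preempt-disable depth 2:
+ *
+ *   int pc = 0x00010102;
+ *   int preempt = pc & 0xff;          // 2
+ *   int softirq = (pc >> 8) & 0xff;   // 1
+ *   int hardirq = (pc >> 16) & 0xf;   // 1
+ *   int nmi     = (pc >> 20) & 0x1;   // 0
+ */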
DECLARE_PER_CPU(int, __preempt_count);
/* We use the MSB mostly because its available */
@@ -21,6 +30,16 @@ DECLARE_PER_CPU(int, __preempt_count);
* We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
* that think a non-zero value indicates we cannot preempt.
*/
+/*
+ * Returns the per-cpu __preempt_count with the PREEMPT_NEED_RESCHED bit (31) cleared to 0.
+ * DECLARE_PER_CPU(int, __preempt_count);
+ * Layout of the per-cpu __preempt_count:
+ * - bits 0-7 are the preempt count (PREEMPT_OFFSET)
+ * - bits 8-15 are the softirq count (SOFTIRQ_OFFSET)
+ * - bits 16-19 are the hardirq count (HARDIRQ_OFFSET)
+ * - bit 20 is the nmi count (NMI_OFFSET)
+ * - bit 31 is used as PREEMPT_NEED_RESCHED
+ */
static __always_inline int preempt_count(void)
{
return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
@@ -74,11 +93,13 @@ static __always_inline bool test_preempt_need_resched(void)
* The various preempt_count add/sub methods
*/
+/* Add val to the per-cpu __preempt_count */
static __always_inline void __preempt_count_add(int val)
{
raw_cpu_add_4(__preempt_count, val);
}
+/* Subtract val from the per-cpu __preempt_count */
static __always_inline void __preempt_count_sub(int val)
{
raw_cpu_add_4(__preempt_count, -val);
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index a19519f..b949a5e 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -7,6 +7,20 @@
#ifdef CONFIG_TRACE_IRQFLAGS
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
+/*
+ * called by:
+ * - include/linux/bottom_half.h|19| <<local_bh_disable>> __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
+ * - include/linux/rwlock_api_smp.h|175| <<__raw_read_lock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/rwlock_api_smp.h|202| <<__raw_write_lock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/spinlock_api_smp.h|134| <<__raw_spin_lock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/spinlock_api_smp.h|181| <<__raw_spin_trylock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/spinlock_api_up.h|34| <<__LOCK_BH>> do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0)
+ * - kernel/softirq.c|269| <<__do_softirq>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ *
+ * Both Oracle Linux and Ubuntu use this version.
+ *
+ * Adds cnt to the per-cpu __preempt_count.
+ */
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
preempt_count_add(cnt);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index da0af63..ddf8668 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -33,6 +33,10 @@ extern void rcu_nmi_exit(void);
* always balanced, so the interrupted value of ->hardirq_context
* will always be restored.
*/
+/*
+ * Adds HARDIRQ_OFFSET to the per-cpu __preempt_count.
+ * HARDIRQ_OFFSET: 1 shifted left by 16; bits 16-19 are the hardirq count.
+ */
#define __irq_enter() \
do { \
account_irq_enter_time(current); \
@@ -48,6 +52,10 @@ extern void irq_enter(void);
/*
* Exit irq context without processing softirqs:
*/
+/*
+ * Subtracts HARDIRQ_OFFSET from the per-cpu __preempt_count.
+ * HARDIRQ_OFFSET: 1 shifted left by 16; bits 16-19 are the hardirq count.
+ */
#define __irq_exit() \
do { \
trace_hardirq_exit(); \
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index dd92b1a..888e19e 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -11,6 +11,16 @@
#include <linux/list.h>
/*
+ * DECLARE_PER_CPU(int, __preempt_count);
+ * Layout of the per-cpu __preempt_count:
+ * - bits 0-7 are the preempt count (PREEMPT_OFFSET)
+ * - bits 8-15 are the softirq count (SOFTIRQ_OFFSET)
+ * - bits 16-19 are the hardirq count (HARDIRQ_OFFSET)
+ * - bit 20 is the nmi count (NMI_OFFSET)
+ * - bit 31 is used as PREEMPT_NEED_RESCHED
+ */
+
+/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
*
@@ -35,22 +45,71 @@
#define NMI_BITS 1
#define PREEMPT_SHIFT 0
+/* 0 + 8 = 8 */
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
+/* 8 + 8 = 16 */
#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+/* 16 + 4 = 20 */
#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
+/* 1"先"往左移x位再减1 */
#define __IRQ_MASK(x) ((1UL << (x))-1)
+/* Bits 0-7 of the per-cpu __preempt_count (the preempt part) all set to 1 */
#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
+/* Bits 8-15 of the per-cpu __preempt_count (the softirq part) all set to 1 */
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+/* Bits 16-19 of the per-cpu __preempt_count (the hardirq part) all set to 1 */
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+/* Bit 20 of the per-cpu __preempt_count (the nmi part) set to 1 */
#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
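+/*
+ * Worked values for the 8/8/4/1 split above (illustrative):
+ *   PREEMPT_MASK = 0x000000ff   PREEMPT_OFFSET = 0x00000001
+ *   SOFTIRQ_MASK = 0x0000ff00   SOFTIRQ_OFFSET = 0x00000100
+ *   HARDIRQ_MASK = 0x000f0000   HARDIRQ_OFFSET = 0x00010000
+ *   NMI_MASK     = 0x00100000   NMI_OFFSET     = 0x00100000
+ */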
+/*
+ * used by:
+ * - include/linux/preempt.h|71| <<INIT_PREEMPT_COUNT>> #define INIT_PREEMPT_COUNT PREEMPT_OFFSET
+ * - include/linux/preempt.h|117| <<PREEMPT_DISABLE_OFFSET>> #define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET
+ *
+ * 1 shifted left by 0 bits: bits 0-7 are the preempt count
+ */
#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
+/*
+ * used by:
+ * - include/linux/preempt.h|61| <<SOFTIRQ_DISABLE_OFFSET>> #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
+ * - include/linux/preempt.h|108| <<in_serving_softirq>> #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
+ * - include/linux/preempt.h|111| <<in_task>> (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ * - kernel/softirq.c|269| <<__do_softirq>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ * - kernel/softirq.c|319| <<__do_softirq>> __local_bh_enable(SOFTIRQ_OFFSET);
+ * - kernel/trace/ring_buffer.c|2706| <<trace_recursive_lock>> if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ * - kernel/trace/trace.c|2330| <<tracing_generic_entry_update>> ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
+ *
+ * 1 shifted left by 8 bits: bits 8-15 are the softirq count
+ */
#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
+/*
+ * used by:
+ * - include/linux/hardirq.h|43| <<__irq_enter>> preempt_count_add(HARDIRQ_OFFSET); \
+ * - include/linux/hardirq.h|63| <<__irq_exit>> preempt_count_sub(HARDIRQ_OFFSET); \
+ * - include/linux/hardirq.h|83| <<nmi_enter>> preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
+ * - include/linux/hardirq.h|93| <<nmi_exit>> preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
+ * - kernel/sched/cputime.c|498| <<account_process_tick>> else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ * - kernel/sched/cputime.c|499| <<account_process_tick>> account_system_time(p, HARDIRQ_OFFSET, cputime);
+ * - kernel/softirq.c|415| <<irq_exit>> preempt_count_sub(HARDIRQ_OFFSET);
+ *
+ * 1 shifted left by 16 bits: bits 16-19 are the hardirq count
+ */
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
+/* 1 shifted left by 20 bits: bit 20 is the nmi count */
#define NMI_OFFSET (1UL << NMI_SHIFT)
+/*
+ * used by:
+ * - include/linux/bottom_half.h|19| <<local_bh_disable>> __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
+ * - include/linux/bottom_half.h|27| <<local_bh_enable_ip>> __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
+ * - include/linux/bottom_half.h|32| <<local_bh_enable>> __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
+ * - include/linux/preempt.h|168| <<SOFTIRQ_LOCK_OFFSET>> #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)
+ * - kernel/softirq.c|162| <<_local_bh_enable>> __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
+ * - kernel/softirq.c|176| <<__local_bh_enable_ip>> if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
+ */
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
@@ -77,6 +136,17 @@
/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
#include <asm/preempt.h>
+/*
+ * preempt_count() returns the per-cpu __preempt_count with the PREEMPT_NEED_RESCHED bit (31) cleared to 0.
+ * DECLARE_PER_CPU(int, __preempt_count);
+ * Layout of the per-cpu __preempt_count:
+ * - bits 0-7 are the preempt count (PREEMPT_OFFSET)
+ * - bits 8-15 are the softirq count (SOFTIRQ_OFFSET)
+ * - bits 16-19 are the hardirq count (HARDIRQ_OFFSET)
+ * - bit 20 is the nmi count (NMI_OFFSET)
+ * - bit 31 is used as PREEMPT_NEED_RESCHED
+ */
+
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
@@ -97,6 +167,7 @@
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
+/* The softirq (bits 8-15), hardirq (bits 16-19) and nmi (bit 20) parts of the per-cpu __preempt_count */
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
#define in_nmi() (preempt_count() & NMI_MASK)
@@ -145,6 +216,21 @@
* Check whether we were atomic before we did preempt_disable():
* (used by the scheduler)
*/
+/*
+ * PREEMPT_DISABLE_OFFSET is 0 when !CONFIG_PREEMPT_COUNT,
+ * which means in_atomic_preempt_off() then returns true iff preempt_count() is non-zero.
+ *
+ * preempt_count() returns the per-cpu __preempt_count with the PREEMPT_NEED_RESCHED bit (31) cleared to 0.
+ * DECLARE_PER_CPU(int, __preempt_count);
+ * Layout of the per-cpu __preempt_count:
+ * - bits 0-7 are the preempt count (PREEMPT_OFFSET)
+ * - bits 8-15 are the softirq count (SOFTIRQ_OFFSET)
+ * - bits 16-19 are the hardirq count (HARDIRQ_OFFSET)
+ * - bit 20 is the nmi count (NMI_OFFSET)
+ * - bit 31 is used as PREEMPT_NEED_RESCHED
+ *
+ * That is, with !CONFIG_PREEMPT_COUNT this returns true only when we are in irq, softirq or nmi context!
+ */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
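+/*
+ * Worked example (assuming CONFIG_PREEMPT_COUNT): __schedule() runs with
+ * preemption disabled exactly once, so a legal schedule() sees
+ * preempt_count() == PREEMPT_DISABLE_OFFSET (1) and the check stays quiet;
+ * schedule() called under an extra spin_lock() sees 2 != 1, so
+ * in_atomic_preempt_off() is true and "scheduling while atomic" fires.
+ */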
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 14a625c..79da315 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -25,6 +25,61 @@
#include <trace/events/sched.h>
/*
+ * CONFIG_DETECT_HUNG_TASK must be enabled.
+ *
+ * hung_task: kernel/hung_task.c, 120 seconds by default!!!
+ * The detection function is check_hung_task().
+ *
+ * A task may stay in TASK_UNINTERRUPTIBLE sleep for a long time (120 seconds in the
+ * default system configuration), a state in which it does not respond to asynchronous
+ * signals. Example: a process interacting with peripheral hardware (such as read).
+ * This state is normally used to guarantee that the interaction with the device is
+ * not interrupted, since otherwise the device could be left in an uncontrollable
+ * state.
+ *
+ * Linux tasks have many states, such as the running state TASK_RUNNING, the dead
+ * state EXIT_DEAD and the signal-receiving wait state TASK_INTERRUPTIBLE (see
+ * include/linux/sched.h). One wait state is TASK_UNINTERRUPTIBLE, the so-called
+ * D state, in which a task receives no signals and can only be woken by wake_up.
+ * Many things put a task into this state: a mutex may, and so may waiting for an
+ * I/O resource to become ready (the wait_event mechanism). Normally a task does not
+ * stay in this state for long, but if an I/O device fails or the task deadlocks, it
+ * may remain in D state indefinitely and never return to TASK_RUNNING. To make such
+ * cases easy to detect, the kernel implements the hung task mechanism, dedicated to
+ * spotting tasks stuck in D state for too long and raising a warning.
+ *
+ * The core idea is a kernel monitoring thread that loops over every task in D state
+ * and counts how many times each was scheduled between two checks. If a task was not
+ * scheduled at all between two checks, it has been in D state the whole time and is
+ * very likely deadlocked, so a warning is logged with the task's basic information,
+ * a stack backtrace and the saved registers, to help kernel developers locate it.
+ *
+ * The core function is watchdog() in kernel/hung_task.c, which creates the kernel
+ * thread khungtaskd.
+ *
+ *
+ * Below is a hung_task example.
+ *
+ * static int task_hang_forever(void *data)
+ * {
+ * __set_current_state(TASK_UNINTERRUPTIBLE);
+ * schedule();
+ *
+ * return 0;
+ * }
+ *
+ * kthread_run(task_hang_forever, NULL, "task_hang_forever");
+ *
+ * This creates a task_hang_forever kernel thread that parks itself in D state forever.
+ *
+ * After running for 120+ seconds...
+ *
+ * [ 246.950946] INFO: task task_hang_forev:86 blocked for more than 122 seconds.
+ * [ 246.953136] Not tainted 5.2.0+ #9
+ * [ 246.954380] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ * [ 246.956795] task_hang_forev D15456 86 2 0x80004000
+ * [ 246.956802] Call Trace:
+ * [ 246.956855] ? __schedule+0x34a/0x530
+ * [ 246.956874] ? blk_mq_tag_to_rq+0x20/0x20
+ * [ 246.956877] schedule+0x2e/0x90
+ * [ 246.956879] task_hang_forever+0x22/0x40
+ * [ 246.956899] kthread+0xf3/0x130
+ * [ 246.956908] ? kthread_destroy_worker+0x40/0x40
+ * [ 246.956913] ret_from_fork+0x35/0x40
+ */
+
+/*
* The number of tasks checked:
*/
int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
@@ -41,6 +96,9 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Zero means infinite timeout - no checking done:
*/
+/*
+ * Many distributions report the error after 120 seconds.
+ */
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
/*
@@ -48,6 +106,9 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
*/
unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+/*
+ * By default at most 10 warnings are reported.
+ */
int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
@@ -85,6 +146,14 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
+/*
+ * called by:
+ * - kernel/hung_task.c|198| <<check_hung_uninterruptible_tasks>> check_hung_task(t, timeout);
+ *
+ * watchdog()
+ * -> check_hung_uninterruptible_tasks()
+ * -> check_hung_task()
+ */
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
@@ -149,6 +218,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* For preemptible RCU it is sufficient to call rcu_read_unlock in order
* to exit the grace period. For classic RCU, a reschedule is required.
*/
+/*
+ * called by:
+ * - kernel/hung_task.c|210| <<check_hung_uninterruptible_tasks>> if (!rcu_lock_break(g, t))
+ */
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
bool can_cont;
@@ -170,6 +243,10 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
+/*
+ * called by:
+ * - kernel/hung_task.c|309| <<watchdog>> check_hung_uninterruptible_tasks(timeout);
+ */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
@@ -189,6 +266,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
+ /*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * to exit the grace period. For classic RCU, a reschedule is required.
+ */
if (!rcu_lock_break(g, t))
goto unlock;
last_break = jiffies;
@@ -235,6 +319,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
+/*
+ * used by:
+ * - kernel/hung_task.c|260| <<reset_hung_task_detector>> atomic_set(&reset_hung_task, 1);
+ * - kernel/hung_task.c|315| <<watchdog>> if (!atomic_xchg(&reset_hung_task, 0) &&
+ */
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
@@ -243,8 +332,18 @@ void reset_hung_task_detector(void)
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);
+/*
+ * hung_detector_suspended is used by:
+ * - kernel/hung_task.c|255| <<hungtask_pm_notify>> hung_detector_suspended = true;
+ * - kernel/hung_task.c|260| <<hungtask_pm_notify>> hung_detector_suspended = false;
+ * - kernel/hung_task.c|288| <<watchdog>> !hung_detector_suspended)
+ */
static bool hung_detector_suspended;
+/*
+ * used by:
+ * - kernel/hung_task.c|304| <<hung_task_init>> pm_notifier(hungtask_pm_notify, 0);
+ */
static int hungtask_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c47788f..62456ee 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -62,6 +62,74 @@
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
+/*
+ * Example of taking the same spin_lock twice back to back. A slightly more complex
+ * case, taking the same spin_lock twice from two different places, was not detected.
+ *
+ * [ 53.270336] ============================================
+ * [ 53.270336] WARNING: possible recursive locking detected
+ * [ 53.270336] 5.2.0+ #14 Not tainted
+ * [ 53.270336] --------------------------------------------
+ * [ 53.270336] systemd-journal/157 is trying to acquire lock:
+ * [ 53.270336] 0000000095ca99d5 (&(&test_lock)->rlock){+.-.}, at: net_rx_action+0x3a7/0x470
+ * [ 53.270336]
+ * [ 53.270336] but task is already holding lock:
+ * [ 53.270336] 0000000095ca99d5 (&(&test_lock)->rlock){+.-.}, at: net_rx_action+0x39b/0x470
+ * [ 53.270336]
+ * [ 53.270336] other info that might help us debug this:
+ * [ 53.270336] Possible unsafe locking scenario:
+ * [ 53.270336]
+ * [ 53.270336] CPU0
+ * [ 53.270336] ----
+ * [ 53.270336] lock(&(&test_lock)->rlock);
+ * [ 53.270336] lock(&(&test_lock)->rlock);
+ * [ 53.270336]
+ * [ 53.270336] *** DEADLOCK ***
+ * [ 53.270336]
+ * [ 53.270336] May be due to missing lock nesting notation
+ * [ 53.270336]
+ * [ 53.270336] 1 lock held by systemd-journal/157:
+ * [ 53.270336] #0: 0000000095ca99d5 (&(&test_lock)->rlock){+.-.}, at: net_rx_action+0x39b/0x470
+ * [ 53.270336]
+ * [ 53.270336] stack backtrace:
+ * [ 53.270336] CPU: 2 PID: 157 Comm: systemd-journal Not tainted 5.2.0+ #14
+ * [ 53.270336] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
+ * [ 53.270336] Call Trace:
+ * [ 53.270336] <IRQ>
+ * [ 53.270336] dump_stack+0x5e/0x8b
+ * [ 53.270336] __lock_acquire+0x362/0x1eb0
+ * [ 53.270336] ? net_rx_action+0x39b/0x470
+ * [ 53.270336] ? sched_clock_local+0x12/0x80
+ * [ 53.270336] ? lock_acquire+0xb4/0x1b0
+ * [ 53.270336] lock_acquire+0xb4/0x1b0
+ * [ 53.270336] ? net_rx_action+0x3a7/0x470
+ * [ 53.270336] _raw_spin_lock+0x2b/0x60
+ * [ 53.270336] ? net_rx_action+0x3a7/0x470
+ * [ 53.270336] net_rx_action+0x3a7/0x470
+ * [ 53.270336] ? lock_acquire+0xb4/0x1b0
+ * [ 53.270336] __do_softirq+0xcb/0x437
+ * [ 53.270336] irq_exit+0xb6/0xc0
+ * [ 53.270336] do_IRQ+0x5b/0x110
+ * [ 53.270336] common_interrupt+0xf/0xf
+ * [ 53.270336] </IRQ>
+ * [ 53.270336] RIP: 0010:___bpf_prog_run+0x10/0x1390
+ * [ 53.270336] RSP: 0018:ffff99c200553d58 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdb
+ * [ 53.270336] RAX: 00000000000000ac RBX: ffff99c200391040 RCX: ffff99c200553d98
+ * [ 53.270336] RDX: 000000007fff0000 RSI: 00000000000000ac RDI: 0000000000000000
+ * [ 53.270336] RBP: ffff99c200553d98 R08: 0000000000000000 R09: 0000000000000000
+ * [ 53.270336] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
+ * [ 53.270336] R13: 000000007fff0000 R14: 0000000000000000 R15: ffff8e3a37b30300
+ * [ 53.270336] ? ___bpf_prog_run+0x35a/0x1390
+ * [ 53.270336] ? __bpf_prog_run32+0x34/0x60
+ * [ 53.270336] ? _raw_spin_unlock+0x1f/0x30
+ * [ 53.270336] ? __seccomp_filter+0x8e/0x6b0
+ * [ 53.270336] ? __handle_mm_fault+0x601/0xae0
+ * [ 53.270336] ? sched_clock_local+0x12/0x80
+ * [ 53.270336] ? __do_page_fault+0x2c6/0x500
+ * [ 53.270336] ? syscall_trace_enter+0xc0/0x350
+ * [ 53.270336] ? do_syscall_64+0x132/0x1b0
+ * [ 53.270336] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
+ */
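+/*
+ * The test code is not included in this patch; a minimal sketch consistent
+ * with the splat above (a double lock patched into net_rx_action()):
+ *
+ *   static DEFINE_SPINLOCK(test_lock);
+ *
+ *   spin_lock(&test_lock);
+ *   spin_lock(&test_lock);   // lockdep reports the recursion here,
+ *                            // then the CPU really deadlocks spinning
+ */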
+
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427..a1350bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3163,6 +3163,7 @@ static inline void preempt_latency_start(int val)
}
}
+/* Add val to the per-cpu __preempt_count */
void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
@@ -3172,6 +3173,7 @@ void preempt_count_add(int val)
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
+ /* Add val to the per-cpu __preempt_count */
__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -3195,6 +3197,7 @@ static inline void preempt_latency_stop(int val)
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
}
+/* Subtract val from the per-cpu __preempt_count */
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
@@ -3212,6 +3215,7 @@ void preempt_count_sub(int val)
#endif
preempt_latency_stop(val);
+ /* Subtract val from the per-cpu __preempt_count */
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
@@ -3232,8 +3236,59 @@ static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
}
/*
+ * Example of "scheduling while atomic", triggered by calling schedule() from net_rx_action():
+ *
+ * [ 36.981929] BUG: scheduling while atomic: sshd/707/0x00000101
+ * [ 36.986404] Modules linked in:
+ * [ 36.986410] CPU: 2 PID: 707 Comm: sshd Not tainted 5.2.0+ #6
+ * [ 36.986411] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
+ * [ 36.986413] Call Trace:
+ * [ 36.986429] <IRQ>
+ * [ 36.986455] dump_stack+0x46/0x5b
+ * [ 36.986460] __schedule_bug+0x47/0x60
+ * [ 36.986464] __schedule+0x474/0x530
+ * [ 36.986466] schedule+0x2e/0x90
+ * [ 36.986470] net_rx_action+0x37b/0x3d0
+ * [ 36.986474] __do_softirq+0xf2/0x2c7
+ * [ 36.986477] do_softirq_own_stack+0x2a/0x40
+ * [ 36.986478] </IRQ>
+ * [ 36.986482] do_softirq.part.19+0x26/0x30
+ * [ 36.986485] __local_bh_enable_ip+0x5b/0x60
+ * [ 36.986488] ip_finish_output2+0x1a3/0x530
+ * [ 36.986506] ? ip_output+0x69/0xe0
+ * [ 36.986509] ip_output+0x69/0xe0
+ * [ 36.986511] ? ip_finish_output2+0x530/0x530
+ * [ 36.986513] __ip_queue_xmit+0x14b/0x380
+ * [ 36.986516] __tcp_transmit_skb+0x574/0xaa0
+ * [ 36.986519] tcp_write_xmit+0x27e/0x11d0
+ * [ 36.986522] __tcp_push_pending_frames+0x29/0xb0
+ * [ 36.986525] tcp_sendmsg_locked+0x2c5/0xd90
+ * [ 36.986528] tcp_sendmsg+0x22/0x40
+ * [ 36.986531] sock_sendmsg+0x39/0x50
+ * [ 36.986533] sock_write_iter+0x82/0xf0
+ * [ 36.986537] new_sync_write+0x107/0x1a0
+ * [ 36.986540] vfs_write+0xae/0x1a0
+ * [ 36.986542] ksys_write+0x57/0xd0
+ * [ 36.986545] do_syscall_64+0x43/0x110
+ * [ 36.986549] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ * [ 36.986552] RIP: 0033:0x7fd167ed8154
+ * ... ...
+ * [ 36.986556] RSP: 002b:00007ffe12a19578 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+ * [ 36.986558] RAX: ffffffffffffffda RBX: 0000000000000054 RCX: 00007fd167ed8154
+ * [ 36.986559] RDX: 0000000000000054 RSI: 0000564ffeb94770 RDI: 0000000000000003
+ * [ 36.986561] RBP: 0000564ffeb79350 R08: 0000000000000000 R09: 0000000000000600
+ * [ 36.986562] R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000000
+ * [ 36.986563] R13: 0000564ffe8d3ad0 R14: 0000000000000003 R15: 00007ffe12a195ff
+ * [ 36.986585] softirq: huh, entered softirq 3 NET_RX 00000000687a3c6d with preempt_count 00000101, exited with 00000000?
+ */
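+/*
+ * The trigger itself is not part of this patch; a minimal sketch of what the
+ * trace above implies was patched into the NET_RX softirq handler:
+ *
+ *   // inside net_rx_action(), i.e. softirq context, preempt_count != 0
+ *   schedule();   // schedule_debug() -> __schedule_bug(): scheduling while atomic
+ */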
+
+/*
* Print scheduling while atomic bug:
*/
+/*
+ * called only by:
+ * - kernel/sched/core.c|3280| <<schedule_debug>> __schedule_bug(prev);
+ */
static noinline void __schedule_bug(struct task_struct *prev)
{
/* Save this before calling printk(), since that will clobber it */
@@ -3265,6 +3320,10 @@ static noinline void __schedule_bug(struct task_struct *prev)
/*
* Various schedule()-time debugging checks and statistics:
*/
+/*
+ * called only by:
+ * - kernel/sched/core.c|3385| <<__schedule>> schedule_debug(prev);
+ */
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
@@ -3272,6 +3331,21 @@ static inline void schedule_debug(struct task_struct *prev)
panic("corrupted stack end detected inside scheduler\n");
#endif
+ /*
+ * PREEMPT_DISABLE_OFFSET is 0 when !CONFIG_PREEMPT_COUNT,
+ * which means in_atomic_preempt_off() then returns true iff preempt_count() is non-zero.
+ *
+ * preempt_count() returns the per-cpu __preempt_count with the PREEMPT_NEED_RESCHED bit (31) cleared to 0.
+ * DECLARE_PER_CPU(int, __preempt_count);
+ * Layout of the per-cpu __preempt_count:
+ * - bits 0-7 are the preempt count (PREEMPT_OFFSET)
+ * - bits 8-15 are the softirq count (SOFTIRQ_OFFSET)
+ * - bits 16-19 are the hardirq count (HARDIRQ_OFFSET)
+ * - bit 20 is the nmi count (NMI_OFFSET)
+ * - bit 31 is used as PREEMPT_NEED_RESCHED
+ *
+ * That is, with !CONFIG_PREEMPT_COUNT this returns true only when we are in irq, softirq or nmi context!
+ */
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
@@ -3366,6 +3440,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* WARNING: must be called with preemption disabled!
*/
+/*
+ * called by:
+ * - kernel/sched/core.c|3466| <<do_task_dead>> __schedule(false);
+ * - kernel/sched/core.c|3513| <<schedule>> __schedule(false);
+ * - kernel/sched/core.c|3541| <<schedule_idle>> __schedule(false);
+ * - kernel/sched/core.c|3594| <<preempt_schedule_common>> __schedule(true);
+ * - kernel/sched/core.c|3668| <<preempt_schedule_notrace>> __schedule(true);
+ * - kernel/sched/core.c|3697| <<preempt_schedule_irq>> __schedule(true);
+ */
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a6b81c6..4e656d7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -107,6 +107,18 @@ static bool ksoftirqd_running(unsigned long pending)
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
+/*
+ * called by:
+ * - include/linux/bottom_half.h|19| <<local_bh_disable>> __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
+ * - include/linux/rwlock_api_smp.h|175| <<__raw_read_lock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/rwlock_api_smp.h|202| <<__raw_write_lock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/spinlock_api_smp.h|134| <<__raw_spin_lock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/spinlock_api_smp.h|181| <<__raw_spin_trylock_bh>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ * - include/linux/spinlock_api_up.h|34| <<__LOCK_BH>> do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0)
+ * - kernel/softirq.c|269| <<__do_softirq>> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ *
+ * Neither Oracle Linux nor Ubuntu uses this version.
+ */
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -139,6 +151,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
EXPORT_SYMBOL(__local_bh_disable_ip);
#endif /* CONFIG_TRACE_IRQFLAGS */
+/*
+ * called by:
+ * - kernel/softirq.c|162| <<_local_bh_enable>> __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
+ * - kernel/softirq.c|319| <<__do_softirq>> __local_bh_enable(SOFTIRQ_OFFSET);
+ */
static void __local_bh_enable(unsigned int cnt)
{
lockdep_assert_irqs_disabled();
@@ -149,6 +166,7 @@ static void __local_bh_enable(unsigned int cnt)
if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_on(_RET_IP_);
+ /* Subtract cnt from the per-cpu __preempt_count */
__preempt_count_sub(cnt);
}
@@ -266,6 +284,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
pending = local_softirq_pending();
account_irq_enter_time(current);
+ /*
+ * Adds SOFTIRQ_OFFSET to the per-cpu __preempt_count,
+ * i.e. the part that represents softirq.
+ */
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
@@ -355,6 +377,10 @@ void irq_enter(void)
_local_bh_enable();
}
+ /*
+ * Adds HARDIRQ_OFFSET to the per-cpu __preempt_count.
+ * HARDIRQ_OFFSET: 1 shifted left by 16; bits 16-19 are the hardirq count.
+ */
__irq_enter();
}
@@ -409,6 +435,14 @@ void irq_exit(void)
#endif
account_irq_exit_time(current);
preempt_count_sub(HARDIRQ_OFFSET);
+ /*
+ * in_interrupt():
+ * the softirq (bits 8-15), hardirq (bits 16-19) and nmi (bit 20)
+ * parts of the per-cpu __preempt_count.
+ *
+ * __do_softirq() calls __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET)
+ * to add SOFTIRQ_OFFSET to the per-cpu __preempt_count,
+ * i.e. the part that represents softirq.
+ */
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9e7b9..d1c0596 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,6 +27,81 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
+/*
+ * On a desktop/server, rcu stalls occur before the soft lockup.
+ *
+ * [ 37.521697] rcu: INFO: rcu_sched self-detected stall on CPU
+ * [ 53.857680] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 2-... } 21397 jiffies s: 41 root: 0x4/.
+ * [ 100.524703] rcu: INFO: rcu_sched self-detected stall on CPU
+ * [ 119.905726] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 2-... } 87445 jiffies s: 41 root: 0x4/.
+ * [ 163.527727] rcu: INFO: rcu_sched self-detected stall on CPU
+ * [ 185.441728] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 2-... } 152981 jiffies s: 41 root: 0x4/.
+ * [ 226.531736] rcu: INFO: rcu_sched self-detected stall on CPU
+ * [ 250.977743] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 2-... } 218517 jiffies s: 41 root: 0x4/.
+ * [ 289.535760] rcu: INFO: rcu_sched self-detected stall on CPU
+ * [ 316.513721] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 2-... } 284053 jiffies s: 41 root: 0x4/.
+ * [ 352.539749] rcu: INFO: rcu_sched self-detected stall on CPU
+ * [ 382.049760] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 2-... } 349589 jiffies s: 41 root: 0x4/.
+ *
+ * [ 64.268700] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [sshd:770]
+ * [ 64.269391] Modules linked in:
+ * [ 64.269393] CPU: 2 PID: 770 Comm: sshd Not tainted 5.2.0+ #4
+ * [ 64.269393] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
+ * [ 64.269396] RIP: 0010:net_rx_action+0xb2/0x3c0
+ * [ 64.269398] RSP: 0018:ffffa8a0c070cf10 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
+ * [ 64.269399] RAX: 0000000000000001 RBX: ffffa8a0c070cf38 RCX: ffff98ebb9e08b50
+ * [ 64.269399] RDX: ffff98ebb9e08b50 RSI: 0000000000000008 RDI: 00000000000007d0
+ * [ 64.269400] RBP: ffffffff83405118 R08: 0000000000000000 R09: ffff98ebb9e08840
+ * [ 64.269400] R10: 0000000000000000 R11: ffffa8a0c0cebab8 R12: ffffa8a0c070cf48
+ * [ 64.269400] R13: 0000000000000003 R14: 0000000000000008 R15: ffff98ebbbb29a40
+ * [ 64.269401] FS: 00007f0826c498c0(0000) GS:ffff98ebbbb00000(0000) knlGS:0000000000000000
+ * [ 64.269403] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ * [ 64.269404] CR2: 0000000001862438 CR3: 0000000174a1c000 CR4: 00000000000006e0
+ * [ 64.269404] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ * [ 64.269405] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ * [ 64.269405] Call Trace:
+ * [ 64.269406] <IRQ>
+ * [ 64.269408] ? e1000_intr_msix_rx+0x58/0x70
+ * [ 64.269410] __do_softirq+0xf2/0x2c7
+ * [ 64.269411] do_softirq_own_stack+0x2a/0x40
+ * [ 64.269412] </IRQ>
+ * [ 64.269414] do_softirq.part.19+0x26/0x30
+ * [ 64.269415] __local_bh_enable_ip+0x5b/0x60
+ * [ 64.269417] ip_finish_output2+0x1a3/0x530
+ * [ 64.269418] ? ip_output+0x69/0xe0
+ * [ 64.269419] ip_output+0x69/0xe0
+ * [ 64.269420] ? ip_finish_output2+0x530/0x530
+ * [ 64.269421] __ip_queue_xmit+0x14b/0x380
+ * [ 64.269423] __tcp_transmit_skb+0x574/0xaa0
+ * [ 64.269424] tcp_write_xmit+0x27e/0x11d0
+ * [ 64.269425] __tcp_push_pending_frames+0x29/0xb0
+ * [ 64.269426] tcp_sendmsg_locked+0x2c5/0xd90
+ * [ 64.269427] tcp_sendmsg+0x22/0x40
+ * [ 64.269429] sock_sendmsg+0x39/0x50
+ * [ 64.269430] sock_write_iter+0x82/0xf0
+ * [ 64.269431] new_sync_write+0x107/0x1a0
+ * [ 64.269433] vfs_write+0xae/0x1a0
+ * [ 64.269434] ksys_write+0x57/0xd0
+ * [ 64.269435] do_syscall_64+0x43/0x110
+ * [ 64.269436] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ * [ 64.269437] RIP: 0033:0x7f0824dde2c0
+ * [ 64.269438] RSP: 002b:00007ffcd58a94b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+ * [ 64.269439] RAX: ffffffffffffffda RBX: 0000000000000054 RCX: 00007f0824dde2c0
+ * [ 64.269439] RDX: 0000000000000054 RSI: 0000557092066c48 RDI: 0000000000000003
+ * [ 64.269440] RBP: 000055709204fe70 R08: 0000000000000000 R09: 0000000000002100
+ * [ 64.269440] R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffcd58a9570
+ * [ 64.269441] R13: 00007ffcd58a9574 R14: 000000000000e40c R15: 00007f0824dde2b0
+ *
+ *
+ * As for hard lockup: disabling irqs inside a kvm VM only yields the following (because qemu was started without "-cpu host"):
+ *
+ * [ 156.050437] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
+ * [ 219.055418] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
+ * [ 282.060416] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
+ * [ 345.065454] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
+ * [ 408.070453] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
+ */
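+/*
+ * The trigger is not part of this patch; a sketch of the sort of busy loop
+ * in softirq context (irqs still enabled) that produces the soft lockup above:
+ *
+ *   unsigned long end = jiffies + 600 * HZ;
+ *
+ *   // in net_rx_action(): hog the CPU without ever yielding; the timer
+ *   // tick still fires, so only the soft lockup detector triggers
+ *   while (time_before(jiffies, end))
+ *           cpu_relax();
+ */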
+
static DEFINE_MUTEX(watchdog_mutex);
#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
@@ -168,6 +243,14 @@ unsigned int __read_mostly softlockup_panic =
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
+/*
+ * used by:
+ * - kernel/watchdog.c|267| <<__touch_watchdog>> __this_cpu_write(watchdog_touch_ts, get_timestamp());
+ * - kernel/watchdog.c|284| <<touch_softlockup_watchdog_sched>> raw_cpu_write(watchdog_touch_ts, 0);
+ * - kernel/watchdog.c|308| <<touch_all_softlockup_watchdogs>> per_cpu(watchdog_touch_ts, cpu) = 0;
+ * - kernel/watchdog.c|315| <<touch_softlockup_watchdog_sync>> __this_cpu_write(watchdog_touch_ts, 0);
+ * - kernel/watchdog.c|375| <<watchdog_timer_fn>> unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
+ */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
@@ -255,6 +338,13 @@ static void set_sample_period(void)
}
/* Commands for resetting the watchdog */
+/*
+ * called by:
+ * - kernel/watchdog.c|355| <<softlockup_fn>> __touch_watchdog();
+ * - kernel/watchdog.c|398| <<watchdog_timer_fn>> __touch_watchdog();
+ * - kernel/watchdog.c|431| <<watchdog_timer_fn>> __touch_watchdog();
+ * - kernel/watchdog.c|499| <<watchdog_enable>> __touch_watchdog();
+ */
static void __touch_watchdog(void)
{
__this_cpu_write(watchdog_touch_ts, get_timestamp());
@@ -268,6 +358,13 @@ static void __touch_watchdog(void)
* entering idle state. This should only be used for scheduler events.
* Use touch_softlockup_watchdog() for everything else.
*/
+/*
+ * called by:
+ * - kernel/time/tick-sched.c|156| <<tick_sched_handle>> touch_softlockup_watchdog_sched();
+ * - kernel/time/tick-sched.c|508| <<tick_nohz_update_jiffies>> touch_softlockup_watchdog_sched();
+ * - kernel/time/tick-sched.c|841| <<tick_nohz_restart_sched_tick>> touch_softlockup_watchdog_sched();
+ * - kernel/watchdog.c|289| <<touch_softlockup_watchdog>> touch_softlockup_watchdog_sched();
+ */
notrace void touch_softlockup_watchdog_sched(void)
{
/*
@@ -308,6 +405,10 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_touch_ts, 0);
}
+/*
+ * called by:
+ * - kernel/watchdog.c|422| <<watchdog_timer_fn>> duration = is_softlockup(touch_ts);
+ */
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
@@ -321,6 +422,10 @@ static int is_softlockup(unsigned long touch_ts)
}
/* watchdog detector functions */
+/*
+ * called by:
+ * - kernel/watchdog_hld.c|131| <<watchdog_overflow_callback>> if (is_hardlockup()) {
+ */
bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
@@ -338,6 +443,10 @@ static void watchdog_interrupt_count(void)
}
static DEFINE_PER_CPU(struct completion, softlockup_completion);
+/*
+ * used by:
+ * - kernel/watchdog.c|394| <<watchdog_timer_fn>> this_cpu_ptr(&softlockup_stop_work));
+ */
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
/*
@@ -359,6 +468,10 @@ static int softlockup_fn(void *data)
}
/* watchdog kicker functions */
+/*
+ * used by:
+ * - kernel/watchdog.c|494| <<watchdog_enable>> hrtimer->function = watchdog_timer_fn;
+ */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
@@ -375,6 +488,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* kick the softlockup detector */
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
reinit_completion(this_cpu_ptr(&softlockup_completion));
+ /*
+ * stop a cpu but don't wait for completion
+ */
stop_one_cpu_nowait(smp_processor_id(),
softlockup_fn, NULL,
this_cpu_ptr(&softlockup_stop_work));
@@ -787,3 +903,13 @@ void __init lockup_detector_init(void)
nmi_watchdog_available = true;
lockup_detector_setup();
}
+
+/*
+ * This was changed by the following patch:
+ *
+ * watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work
+ *
+ * Oleg suggested to replace the "watchdog/%u" threads with
+ * cpu_stop_work. That removes one thread per CPU while at the same time
+ * fixes softlockup vs SCHED_DEADLINE.
+ */
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b..d00b583 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -20,6 +20,36 @@
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
+/*
+ * By disabling irqs in net_rx_action(), a hard lockup can be reproduced,
+ * mixed together with soft lockups and rcu stalls.
+ *
+ * [ 128.919716] NMI watchdog: Watchdog detected hard LOCKUP on cpu 2
+ * [ 128.919716] Modules linked in:
+ * [ 128.919717] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.2.0+ #6
+ * [ 128.919717] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
+ * [ 128.919717] RIP: 0010:net_rx_action+0xb3/0x3c0
+ * [ 128.919718] RSP: 0018:ffffaea2c070ced8 EFLAGS: 00000002
+ * [ 128.919718] RAX: 0000000000000001 RBX: ffffaea2c070cf00 RCX: ffffa3b2b9de0b50
+ * [ 128.919718] RDX: ffffa3b2b9de0b50 RSI: 0000000000000008 RDI: 00000000000007d0
+ * [ 128.919718] RBP: ffffffff97205118 R08: 0000000000000010 R09: 0000000000004000
+ * [ 128.919718] R10: 0000000000007ffe R11: 0000000000000000 R12: ffffaea2c070cf10
+ * [ 128.919719] R13: 0000000000000003 R14: 0000000000000008 R15: ffffa3b2bbb29a40
+ * [ 128.919719] FS: 0000000000000000(0000) GS:ffffa3b2bbb00000(0000) knlGS:0000000000000000
+ * [ 128.919719] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ * [ 128.919719] CR2: 0000557d15f31bd8 CR3: 000000017765c005 CR4: 00000000003606e0
+ * [ 128.919719] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ * [ 128.919720] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ * [ 128.919720] Call Trace:
+ * [ 128.919720] <IRQ>
+ * [ 128.919720] ? recalibrate_cpu_khz+0x10/0x10
+ * [ 128.919720] __do_softirq+0xf2/0x2c7
+ * [ 128.919720] irq_exit+0xa3/0xb0
+ * [ 128.919720] do_IRQ+0x45/0xd0
+ * [ 128.919721] common_interrupt+0xf/0xf
+ * [ 128.919721] </IRQ>
+ */
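+/*
+ * Sketch of the trigger implied above (not part of this patch): spin with
+ * irqs disabled, so the hrtimer tick cannot fire and hrtimer_interrupts
+ * stops advancing; the NMI-based detector then reports the hard lockup:
+ *
+ *   local_irq_disable();
+ *   while (1)
+ *           cpu_relax();
+ */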
+
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
@@ -107,6 +137,10 @@ static struct perf_event_attr wd_hw_attr = {
};
/* Callback function for perf event subsystem */
+/*
+ * used by:
+ * - kernel/watchdog_hld.c|207| <<hardlockup_detector_event_create>> watchdog_overflow_callback, NULL);
+ */
static void watchdog_overflow_callback(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 9dd581d..bc8f4d8 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -101,6 +101,29 @@
#include <linux/memory_hotplug.h>
/*
+ * How kmemleak works
+ *
+ * kmemleak works by hooking kmalloc(), vmalloc(), kmem_cache_alloc() and similar functions, putting the allocated memory