Zebra crash in route_node_delete() as the same route node is accessed in two different threads. #17047
Open
2 tasks done
Labels
triage
Needs further investigation
Description
We receive netlink notifications from the kernel on two different sockets, each handled by a different thread. Both threads access the same data structure, which eventually leads to a Zebra crash.
See the threads below:
Thread 5 (Thread 0x7f684e67a7c0 (LWP 32)):
#0 0x00007f684ea8a1dc in read () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684e96db34 in _Ux86_64_get_elf_image () from /lib/x86_64-linux-gnu/libunwind.so.8
#2 0x00007f684e97605a in ?? () from /lib/x86_64-linux-gnu/libunwind.so.8
#3 0x00007f684e96e8b4 in _ULx86_64_get_proc_name () from /lib/x86_64-linux-gnu/libunwind.so.8
#4 0x00007f684ec4168e in zlog_backtrace_sigsafe (priority=2, program_counter=0x7f684ea1ce3c) at ../lib/log.c:194
#5 0x00007f684ec413e6 in zlog_signal (signo=6, action=0x7f684ecfbf0b "aborting...", siginfo_v=0x7fff36d53e70, program_counter=0x7f684ea1ce3c) at ../lib/log.c:154
#6 0x00007f684ec84fbd in core_handler (signo=6, siginfo=0x7fff36d53e70, context=0x7fff36d53d40) at ../lib/sigevent.c:251
#7 <signal handler called>
#8 0x00007f684ea1ce3c in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#9 0x00007f684e9cdfb2 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#10 0x00007f684e9b8472 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#11 0x00007f684ecc3d76 in _zlog_assert_failed (xref=0x7f684ed6a220 <_xref.16>, extra=0x0) at ../lib/zlog.c:678
--Type <RET> for more, q to quit, c to continue without paging--
#12 0x00007f684ec9616a in route_node_delete (node=0x563ba9ef5bf0) at ../lib/table.c:352
#13 0x00007f684ec95005 in route_unlock_node (node=0x563ba9ef5bf0) at ../lib/table.h:258
#14 0x00007f684ec96383 in route_next (node=0x563ba9ef5bf0) at ../lib/table.c:436
#15 0x0000563ba8bfb378 in zl3vni_from_svi_ns (ns=0x563ba9b02e30, _in_param=0x7fff36d54680, _p_zl3vni=0x7fff36d54658) at ../zebra/zebra_vxlan.c:1863
#16 0x00007f684ec4d41f in ns_walk_func (func=0x563ba8bfb20c <zl3vni_from_svi_ns>, param_in=0x7fff36d54680, param_out=0x7fff36d54658) at ../lib/netns_linux.c:386
#17 0x0000563ba8bfb4f2 in zl3vni_from_svi (ifp=0x563ba9d72900, br_if=0x563ba9e8e450) at ../zebra/zebra_vxlan.c:1930
#18 0x0000563ba8bff5de in zebra_vxlan_handle_kernel_neigh_update (ifp=0x563ba9d72900, link_if=0x563ba9e8e450, ip=0x7fff36d547c0, macaddr=0x7fff36d547e0, state=4, is_ext=false, is_router=false, local_inactive=false, dp_static=false) at ../zebra/zebra_vxlan.c:3764
#19 0x0000563ba8b65024 in netlink_ipneigh_change (h=0x563ba9b0a0d0, len=60, ns_id=0) at ../zebra/rt_netlink.c:4195
#20 0x0000563ba8b65872 in netlink_neigh_change (h=0x563ba9b0a0d0, ns_id=0) at ../zebra/rt_netlink.c:4413
#21 0x0000563ba8b4fb62 in netlink_information_fetch (h=0x563ba9b0a0d0, ns_id=0, startup=0) at ../zebra/kernel_netlink.c:406
#22 0x0000563ba8b515d6 in netlink_parse_info (filter=0x563ba8b4fab0 <netlink_information_fetch>, nl=0x563ba9b036a8, zns=0x7fff36d54a70, count=5, startup=false) at ../zebra/kernel_netlink.c:985
#23 0x0000563ba8b4fd53 in kernel_read (thread=0x7fff36d54b80) at ../zebra/kernel_netlink.c:491
#24 0x00007f684ec9f138 in thread_call (thread=0x7fff36d54b80) at ../lib/thread.c:1990
#25 0x00007f684ec35d50 in frr_run (master=0x563ba98e8050) at ../lib/libfrr.c:1198
#26 0x0000563ba8b54ee5 in main (argc=10, argv=0x7fff36d54e58) at ../zebra/main.c:478
Thread 1 (Thread 0x7f68470a96c0 (LWP 41)):
#0 0x00007f684ea1ce3c in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684e9cdfb2 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007f684e9b8472 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#3 0x00007f684ecc3e89 in _zlog_assert_failed (xref=0x7f684ed6a220 <_xref.16>, extra=0x0) at ../lib/zlog.c:700
#4 0x00007f684ec9616a in route_node_delete (node=0x563ba9ef5bf0) at ../lib/table.c:352
#5 0x0000563ba8b3f9b0 in route_unlock_node (node=0x563ba9ef5bf0) at ../lib/table.h:258
#6 0x0000563ba8b40435 in if_lookup_by_index_per_ns (ns=0x563ba9b03680, ifindex=78) at ../zebra/interface.c:301
#7 0x0000563ba8b5b9a1 in parse_multipath_nexthops_unicast (ns_id=0, ng=0x7f6840007a30, rtm=0x7f6847098c60, rtnh=0x7f6847098c90, tb=0x7f6847098a60, prefsrc=0x7f6847098c88, vrf_id=0) at ../zebra/rt_netlink.c:611
#8 0x0000563ba8b5c9ab in netlink_route_change_read_unicast_internal (h=0x7f6847098c50, ns_id=0, startup=0, ctx=0x7f6840055f50) at ../zebra/rt_netlink.c:995
#9 0x00007f684e67159a in fpm_read (t=0x7f68470a8d60) at ../zebra/dplane_fpm_nl.c:820
#10 0x00007f684ec9f138 in thread_call (thread=0x7f68470a8d60) at ../lib/thread.c:1990
#11 0x00007f684ec21059 in fpt_run (arg=0x563ba9c75de0) at ../lib/frr_pthread.c:309
--Type <RET> for more, q to quit, c to continue without paging--
#12 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9c75de0) at ../lib/frr_pthread.c:158
#13 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#14 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
(gdb) quit
root@leaf0:/# exit
exit
Version
How to reproduce
We attempted a Graceful Restart (GR). The zebra crash occurred once the router came back up after the GR.
All the threads are given below:
(gdb) thread apply all bt
Thread 9 (Thread 0x7f681ffff6c0 (LWP 81)):
#0 0x00007f684ea8e256 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684ec9cb24 in fd_poll (m=0x563ba9ec2040, timer_wait=0x0, eintr_p=0x7f681fffeccb) at ../lib/thread.c:930
#2 0x00007f684ec9ed21 in thread_fetch (m=0x563ba9ec2040, fetch=0x7f681fffed60) at ../lib/thread.c:1830
#3 0x00007f684ec21045 in fpt_run (arg=0x563ba9ec1fb0) at ../lib/frr_pthread.c:308
#4 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9ec1fb0) at ../lib/frr_pthread.c:158
#5 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#6 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 8 (Thread 0x7f683e7a76c0 (LWP 42)):
#0 0x00007f684ea18113 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684ea1fbfe in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007f684ec98c8e in _frr_mtx_unlock (mutex=0x7f683e7a6a58) at ../lib/frr_pthread.h:262
#3 0x00007f684ec9d139 in _thread_add_timer_timeval (xref=0x7f684e6778a0 <_xref.13>, m=0x563ba9c75e70, func=0x7f684e672f2d <fpm_process_queue>, arg=0x563ba9c6b4e0, time_relative=0x7f683e7a6af0, t_ptr=0x563ba9c6b618) at ../lib/thread.c:1054
#4 0x00007f684ec9d21e in _thread_add_timer (xref=0x7f684e6778a0 <_xref.13>, m=0x563ba9c75e70, func=0x7f684e672f2d <fpm_process_queue>, arg=0x563ba9c6b4e0, timer=0, t_ptr=0x563ba9c6b618) at ../lib/thread.c:1098
#5 0x00007f684e673982 in fpm_nl_process (prov=0x563ba9c6b6a0) at ../zebra/dplane_fpm_nl.c:1784
#6 0x0000563ba8b8a8ae in dplane_thread_loop (event=0x7f683e7a6d60) at ../zebra/zebra_dplane.c:7412
#7 0x00007f684ec9f138 in thread_call (thread=0x7f683e7a6d60) at ../lib/thread.c:1990
#8 0x00007f684ec21059 in fpt_run (arg=0x563ba9c6f240) at ../lib/frr_pthread.c:309
#9 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9c6f240) at ../lib/frr_pthread.c:158
#10 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#11 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 7 (Thread 0x7f6826ffd6c0 (LWP 61)):
#0 0x00007f684ea8e256 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684ec9cb24 in fd_poll (m=0x563ba9d226a0, timer_wait=0x0, eintr_p=0x7f6826ffcccb) at ../lib/thread.c:930
#2 0x00007f684ec9ed21 in thread_fetch (m=0x563ba9d226a0, fetch=0x7f6826ffcd60) at ../lib/thread.c:1830
#3 0x00007f684ec21045 in fpt_run (arg=0x563ba9d22610) at ../lib/frr_pthread.c:308
#4 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9d22610) at ../lib/frr_pthread.c:158
#5 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#6 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 6 (Thread 0x7f682dffb6c0 (LWP 60)):
#0 0x00007f684ea8e256 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684ec9cb24 in fd_poll (m=0x563ba9ceac50, timer_wait=0x0, eintr_p=0x7f682dffaccb) at ../lib/thread.c:930
#2 0x00007f684ec9ed21 in thread_fetch (m=0x563ba9ceac50, fetch=0x7f682dffad60) at ../lib/thread.c:1830
#3 0x00007f684ec21045 in fpt_run (arg=0x563ba9ceabc0) at ../lib/frr_pthread.c:308
#4 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9ceabc0) at ../lib/frr_pthread.c:158
#5 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#6 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 5 (Thread 0x7f684e67a7c0 (LWP 32)):
#0 0x00007f684ea8a1dc in read () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684e96db34 in _Ux86_64_get_elf_image () from /lib/x86_64-linux-gnu/libunwind.so.8
#2 0x00007f684e97605a in ?? () from /lib/x86_64-linux-gnu/libunwind.so.8
#3 0x00007f684e96e8b4 in _ULx86_64_get_proc_name () from /lib/x86_64-linux-gnu/libunwind.so.8
#4 0x00007f684ec4168e in zlog_backtrace_sigsafe (priority=2, program_counter=0x7f684ea1ce3c) at ../lib/log.c:194
#5 0x00007f684ec413e6 in zlog_signal (signo=6, action=0x7f684ecfbf0b "aborting...", siginfo_v=0x7fff36d53e70, program_counter=0x7f684ea1ce3c) at ../lib/log.c:154
#6 0x00007f684ec84fbd in core_handler (signo=6, siginfo=0x7fff36d53e70, context=0x7fff36d53d40) at ../lib/sigevent.c:251
#7 <signal handler called>
#8 0x00007f684ea1ce3c in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#9 0x00007f684e9cdfb2 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#10 0x00007f684e9b8472 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#11 0x00007f684ecc3d76 in _zlog_assert_failed (xref=0x7f684ed6a220 <_xref.16>, extra=0x0) at ../lib/zlog.c:678
--Type <RET> for more, q to quit, c to continue without paging--
#12 0x00007f684ec9616a in route_node_delete (node=0x563ba9ef5bf0) at ../lib/table.c:352
#13 0x00007f684ec95005 in route_unlock_node (node=0x563ba9ef5bf0) at ../lib/table.h:258
#14 0x00007f684ec96383 in route_next (node=0x563ba9ef5bf0) at ../lib/table.c:436
#15 0x0000563ba8bfb378 in zl3vni_from_svi_ns (ns=0x563ba9b02e30, _in_param=0x7fff36d54680, _p_zl3vni=0x7fff36d54658) at ../zebra/zebra_vxlan.c:1863
#16 0x00007f684ec4d41f in ns_walk_func (func=0x563ba8bfb20c <zl3vni_from_svi_ns>, param_in=0x7fff36d54680, param_out=0x7fff36d54658) at ../lib/netns_linux.c:386
#17 0x0000563ba8bfb4f2 in zl3vni_from_svi (ifp=0x563ba9d72900, br_if=0x563ba9e8e450) at ../zebra/zebra_vxlan.c:1930
#18 0x0000563ba8bff5de in zebra_vxlan_handle_kernel_neigh_update (ifp=0x563ba9d72900, link_if=0x563ba9e8e450, ip=0x7fff36d547c0, macaddr=0x7fff36d547e0, state=4, is_ext=false, is_router=false, local_inactive=false, dp_static=false) at ../zebra/zebra_vxlan.c:3764
#19 0x0000563ba8b65024 in netlink_ipneigh_change (h=0x563ba9b0a0d0, len=60, ns_id=0) at ../zebra/rt_netlink.c:4195
#20 0x0000563ba8b65872 in netlink_neigh_change (h=0x563ba9b0a0d0, ns_id=0) at ../zebra/rt_netlink.c:4413
#21 0x0000563ba8b4fb62 in netlink_information_fetch (h=0x563ba9b0a0d0, ns_id=0, startup=0) at ../zebra/kernel_netlink.c:406
#22 0x0000563ba8b515d6 in netlink_parse_info (filter=0x563ba8b4fab0 <netlink_information_fetch>, nl=0x563ba9b036a8, zns=0x7fff36d54a70, count=5, startup=false) at ../zebra/kernel_netlink.c:985
#23 0x0000563ba8b4fd53 in kernel_read (thread=0x7fff36d54b80) at ../zebra/kernel_netlink.c:491
#24 0x00007f684ec9f138 in thread_call (thread=0x7fff36d54b80) at ../lib/thread.c:1990
#25 0x00007f684ec35d50 in frr_run (master=0x563ba98e8050) at ../lib/libfrr.c:1198
#26 0x0000563ba8b54ee5 in main (argc=10, argv=0x7fff36d54e58) at ../zebra/main.c:478
Thread 4 (Thread 0x7f683ffff6c0 (LWP 43)):
#0 0x00007f684ea8e256 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684ec9cb24 in fd_poll (m=0x563ba9c78820, timer_wait=0x0, eintr_p=0x7f683fffeccb) at ../lib/thread.c:930
#2 0x00007f684ec9ed21 in thread_fetch (m=0x563ba9c78820, fetch=0x7f683fffed60) at ../lib/thread.c:1830
#3 0x00007f684ec21045 in fpt_run (arg=0x563ba9c78790) at ../lib/frr_pthread.c:308
#4 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9c78790) at ../lib/frr_pthread.c:158
#5 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#6 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 3 (Thread 0x7f68478aa6c0 (LWP 40)):
#0 0x00007f684ea93719 in syscall () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684ec801b9 in sys_futex (addr1=0x7f684ed8e020 <rcu_seq>, op=0, val1=7, timeout=0x0, addr2=0x0, val3=0) at ../lib/seqlock.c:53
#2 0x00007f684ec802e1 in seqlock_wait (sqlo=0x7f684ed8e020 <rcu_seq>, val=5) at ../lib/seqlock.c:153
#3 0x00007f684ec20027 in rcu_main (arg=0x0) at ../lib/frrcu.c:429
#4 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#5 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 2 (Thread 0x7f683cfa46c0 (LWP 50)):
#0 0x00007f684ea8e256 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684ec9cb24 in fd_poll (m=0x563ba9cb3010, timer_wait=0x0, eintr_p=0x7f683cfa3ccb) at ../lib/thread.c:930
#2 0x00007f684ec9ed21 in thread_fetch (m=0x563ba9cb3010, fetch=0x7f683cfa3d60) at ../lib/thread.c:1830
#3 0x00007f684ec21045 in fpt_run (arg=0x563ba9cb2f80) at ../lib/frr_pthread.c:308
#4 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9cb2f80) at ../lib/frr_pthread.c:158
#5 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#6 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 1 (Thread 0x7f68470a96c0 (LWP 41)):
#0 0x00007f684ea1ce3c in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f684e9cdfb2 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007f684e9b8472 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#3 0x00007f684ecc3e89 in _zlog_assert_failed (xref=0x7f684ed6a220 <_xref.16>, extra=0x0) at ../lib/zlog.c:700
#4 0x00007f684ec9616a in route_node_delete (node=0x563ba9ef5bf0) at ../lib/table.c:352
#5 0x0000563ba8b3f9b0 in route_unlock_node (node=0x563ba9ef5bf0) at ../lib/table.h:258
#6 0x0000563ba8b40435 in if_lookup_by_index_per_ns (ns=0x563ba9b03680, ifindex=78) at ../zebra/interface.c:301
#7 0x0000563ba8b5b9a1 in parse_multipath_nexthops_unicast (ns_id=0, ng=0x7f6840007a30, rtm=0x7f6847098c60, rtnh=0x7f6847098c90, tb=0x7f6847098a60, prefsrc=0x7f6847098c88, vrf_id=0) at ../zebra/rt_netlink.c:611
#8 0x0000563ba8b5c9ab in netlink_route_change_read_unicast_internal (h=0x7f6847098c50, ns_id=0, startup=0, ctx=0x7f6840055f50) at ../zebra/rt_netlink.c:995
#9 0x00007f684e67159a in fpm_read (t=0x7f68470a8d60) at ../zebra/dplane_fpm_nl.c:820
#10 0x00007f684ec9f138 in thread_call (thread=0x7f68470a8d60) at ../lib/thread.c:1990
#11 0x00007f684ec21059 in fpt_run (arg=0x563ba9c75de0) at ../lib/frr_pthread.c:309
--Type <RET> for more, q to quit, c to continue without paging--
#12 0x00007f684ec20a8c in frr_pthread_inner (arg=0x563ba9c75de0) at ../lib/frr_pthread.c:158
#13 0x00007f684ea1b144 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#14 0x00007f684ea9b7dc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
(gdb) quit
root@leaf0:/# exit
exit
Expected behavior
No crash should be observed
Actual behavior
The same data structure (route node 0x563ba9ef5bf0) is accessed from two different threads, and zebra aborts on a failed assertion in route_node_delete().
Additional context
No response
Checklist
The text was updated successfully, but these errors were encountered: