
Kernel locks tracing only in process context #5114

Open
Anjali05 opened this issue Oct 2, 2024 · 4 comments

Anjali05 commented Oct 2, 2024

Hi,

I am using an extended version of the klockstat tool to trace spin locks, but because of interrupts the stack traces I am getting are quite messy. Ideally, I would like to trace only in process context. Is there a way to do this in eBPF currently? I tried using in_interrupt but I do not think it's supported. Can anyone suggest a workaround for this? Thanks in advance!

@yonghong-song
Collaborator

Currently BPF does not have helpers/kfuncs to identify the context, so I don't have a good way either. Is it possible for you to filter out some stack traces if their pattern suggests they come from non-process context?
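
For instance, here is a minimal userspace post-processing sketch of that idea (an illustration, not part of klockstat): it assumes each kernel stack is available as an array of symbolized frame strings, as in the JSON excerpt later in this thread, and the pattern list is only a starting point that would need tuning per workload.

#include <stdbool.h>
#include <string.h>

/* Frame-name fragments that suggest the acquisition happened in interrupt
 * context; illustrative only, extend for the workload being traced. */
static const char *irq_patterns[] = {
        "asm_sysvec_",
        "sysvec_apic_timer_interrupt",
        "__do_softirq",
        "hrtimer_interrupt",
};

/* Return true if any frame of a symbolized stack matches a pattern. */
static bool stack_is_interrupt(const char **frames, int nframes)
{
        for (int i = 0; i < nframes; i++)
                for (size_t j = 0; j < sizeof(irq_patterns) / sizeof(irq_patterns[0]); j++)
                        if (frames[i] && strstr(frames[i], irq_patterns[j]))
                                return true;
        return false;
}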

@Anjali05
Author

Anjali05 commented Oct 4, 2024

@yonghong-song There are patterns, but if I filter lock acquisitions like this, I will lose data on some locks. For example, in stacks like the one below, I do not have a separate stack trace giving me the name of the lock acquired by folio_lruvec_lock_irqsave:

   "locks": [
            "&head->lock"
        ],
        "kernel_stack": [
            "_raw_spin_lock+0x5 [kernel]",
            "__pcpu_freelist_pop+0xb3 [kernel]",
            "pcpu_freelist_pop+0x1a [kernel]",
            "__bpf_get_stackid+0x147 [kernel]",
            "bpf_get_stackid_raw_tp+0x56 [kernel]",
            "bpf_prog_6deef7357e7b4530+0x91a6 [bpf]",
            "ipmi_si_pci_init.cold+0x11741 [ipmi_si]",
            "_raw_spin_lock_nested+0x5 [kernel]",
            "raw_spin_rq_lock_nested+0x23 [kernel]",
            "try_to_wake_up+0x249 [kernel]",
            "hrtimer_wakeup+0x1e [kernel]",
            "__hrtimer_run_queues+0x137 [kernel]",
            "hrtimer_interrupt+0x118 [kernel]",
            "__sysvec_apic_timer_interrupt+0x97 [kernel]",
            "sysvec_apic_timer_interrupt+0x9d [kernel]",
            "asm_sysvec_apic_timer_interrupt+0x16 [kernel]",
            "unwind_next_frame+0x34 [kernel]",
            "__unwind_start+0x15b [kernel]",
            "perf_callchain_kernel+0x9d [kernel]",
            "get_perf_callchain+0xc5 [kernel]",
            "bpf_get_stackid+0x69 [kernel]",
            "bpf_get_stackid_raw_tp+0x56 [kernel]",
            "bpf_prog_6deef7357e7b4530+0x8b7e [bpf]",
            "ipmi_si_pci_init.cold+0xd73e [ipmi_si]",
            "_raw_spin_lock_irqsave+0x5 [kernel]",
            "folio_lruvec_lock_irqsave+0x5e [kernel]",
            "folio_batch_move_lru+0x58 [kernel]",
            "folio_add_lru+0x9c [kernel]",
            "shmem_get_folio_gfp+0x3b4 [kernel]",
            "shmem_fault+0x76 [kernel]",
            "__do_fault+0x31 [kernel]",
            "do_fault+0x21c [kernel]",
            "__handle_mm_fault+0x628 [kernel]",
            "handle_mm_fault+0x107 [kernel]",
            "__get_user_pages+0x231 [kernel]",
            "get_user_pages_unlocked+0xd3 [kernel]",
            "hva_to_pfn+0xfb [kvm]",
            "kvm_faultin_pfn+0x91 [kvm]",
            "direct_page_fault+0x22c [kvm]",
            "kvm_mmu_page_fault+0x10d [kvm]",
            "vmx_handle_exit+0x16b [kvm_intel]",
            "vcpu_enter_guest+0xa40 [kvm]",
            "kvm_arch_vcpu_ioctl_run+0x266 [kvm]",
            "kvm_vcpu_ioctl+0x27d [kvm]",
            "__x64_sys_ioctl+0x88 [kernel]",
            "do_syscall_64+0x38 [kernel]",
            "entry_SYSCALL_64_after_hwframe+0x63 [kernel]",
            ""
        ],
        "addresses": [
            "0xffffffff81cd0c35",
            "0xffffffff812559e3",
            "0xffffffff81255c2a",
            "0xffffffff812718c7",
            "0xffffffff81211ea6",
            "0xffffffffa0018192",
            "0xffffffffa09a0052",
            "0xffffffff81cd0cb5",
            "0xffffffff810e1013",
            "0xffffffff810ea3b9",
            "0xffffffff8116ad9e",
            "0xffffffff8116b057",
            "0xffffffff8116bc38",
            "0xffffffff8106eb67",
            "0xffffffff81cbb13d",
            "0xffffffff81e00e46",
            "0xffffffff81082c74",
            "0xffffffff810833eb",
            "0xffffffff810067dd",
            "0xffffffff81291115",
            "0xffffffff81271bf9",
            "0xffffffff81211ea6",
            "0xffffffffa0017b6a",
            "0xffffffffa099c04f",
            "0xffffffff81cd0e95",
            "0xffffffff8136847e",
            "0xffffffff812af818",
            "0xffffffff812af9cc",
            "0xffffffff812c07a4",
            "0xffffffff812c0b96",
            "0xffffffff812e3aa1",
            "0xffffffff812e4a1c",
            "0xffffffff812e8fe8",
            "0xffffffff812e9ac7",
            "0xffffffff812dbeb1",
            "0xffffffff812dc3d3",
            "0xffffffffa0e6196b",
            "0xffffffffa0eaf961",
            "0xffffffffa0eb631c",
            "0xffffffffa0eb6b1d",
            "0xffffffffa0df976b",
            "0xffffffffa0e843f0",
            "0xffffffffa0e863b6",
            "0xffffffffa0e5c11d",
            "0xffffffff813a0ea8",
            "0xffffffff81cb7148",
            "0xffffffff81e0009b"
        ]
    },

I manually added tracepoint/kfunc probes for some interrupt handlers like this:

BPF_PERCPU_ARRAY(irq_flags, u64, 1);

// Function to mark the start of interrupt handling
TRACEPOINT_PROBE(irq, irq_handler_entry) {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&cpu, &val);
    return 0;
}

// Function to mark the end of interrupt handling
TRACEPOINT_PROBE(irq, irq_handler_exit) {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&cpu, &val);
    return 0;
}

// Function to mark the start of interrupt handling
TRACEPOINT_PROBE(irq, softirq_entry) {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&cpu, &val);
    return 0;
}

// Function to mark the end of interrupt handling
TRACEPOINT_PROBE(irq, softirq_exit) {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&cpu, &val);
    return 0;
}

static void handle_hrtimer_interrupt_enter() {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&cpu, &val); 
}

// The probe for hrtimer interrupt entry
KFUNC_PROBE(hrtimer_interrupt) {
    handle_hrtimer_interrupt_enter(); 
    return 0;
}

static void handle_hrtimer_interrupt_exit() {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&cpu, &val); 
}

// The probe for hrtimer interrupt exit
KRETFUNC_PROBE(hrtimer_interrupt) {
    handle_hrtimer_interrupt_exit(); 
    return 0;
}

static void handle_sysvec_apic_timer_interrupt_enter() {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&cpu, &val);
}

KFUNC_PROBE(__sysvec_apic_timer_interrupt) {
    handle_sysvec_apic_timer_interrupt_enter(); 
    return 0;
}

static void handle_sysvec_apic_timer_interrupt_exit() {
    int cpu = bpf_get_smp_processor_id();
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&cpu, &val);
}

KRETFUNC_PROBE(__sysvec_apic_timer_interrupt) {
    handle_sysvec_apic_timer_interrupt_exit(); 
    return 0;
}

static int lock__NAME_(struct pt_regs *ctx, LOCK_TYPE *lock)
{
    u32 current_pid = bpf_get_current_pid_tgid();

    // Check if we're in an interrupt context
    int cpu = bpf_get_smp_processor_id();
    u64 *flag = irq_flags.lookup(&cpu);
    if (flag && *flag == 1) {
        //bpf_trace_printk("interrupt context\\n"); 
        return 0;  // In-interrupt, skip stack trace capture
    }

....
}

But I do not think it's doing anything. Do you have any other suggestions?

@Anjali05
Author

Anjali05 commented Oct 7, 2024

@yonghong-song Can you point me to any useful guide for creating a helper function to access the context?

@4ast
Member

4ast commented Oct 28, 2024

With libbpf, the following works:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

extern const struct pcpu_hot pcpu_hot __ksym __weak;
SEC("syscall")
int pcpu_hot_test(void *ctx)
{
        struct pcpu_hot *p = bpf_this_cpu_ptr(&pcpu_hot);

        bpf_printk("pc %x\n", p->preempt_count);
        return 0;
}

Then add your own macros on top (these match the kernel's definitions in include/linux/preempt.h):

#define irq_count()     (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
#define in_interrupt()  (irq_count())
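
Putting the two pieces together, a minimal sketch of a process-context-only lock probe could look like the following. This is an assumption-laden example, not the klockstat implementation: it assumes an x86-64 kernel new enough to expose struct pcpu_hot (v6.2+), the mask values are copied from include/linux/preempt.h and should be checked against the running kernel, and the kprobe attachment point _raw_spin_lock is only illustrative.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Mask values copied from include/linux/preempt.h; verify against your kernel. */
#define SOFTIRQ_MASK    0x0000ff00UL
#define HARDIRQ_MASK    0x000f0000UL
#define NMI_MASK        0x00f00000UL

extern const struct pcpu_hot pcpu_hot __ksym __weak;

/* Non-zero when the current CPU is in hardirq, softirq or NMI context. */
static __always_inline unsigned long in_interrupt_ctx(void)
{
        const struct pcpu_hot *p = bpf_this_cpu_ptr(&pcpu_hot);

        return p->preempt_count & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK);
}

SEC("kprobe/_raw_spin_lock")
int BPF_KPROBE(trace_raw_spin_lock, raw_spinlock_t *lock)
{
        if (in_interrupt_ctx())
                return 0;       /* acquired in interrupt context, skip */

        /* ... record stack trace / timing for process-context acquisitions ... */
        return 0;
}

char LICENSE[] SEC("license") = "GPL";

Note that struct pcpu_hot is an x86-specific internal structure whose layout can change between releases, so the __ksym __weak reference should be treated as version-dependent.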
