Issue metadata
Sign in to add a comment
|
kgdb: Can't backtrace (bt) past el1_irq in gdb |
||||||||||||||||||||||
Issue description
For some reason, when I hook up gdb to the kernel (AKA use kgdb) I can never bt past el1_irq. I _can_ trace past el1_irq with the normal kernel stack crawl though.
For instance, I put a bogus drop into the debugger in the IRQ handler of a driver I know probes on my system:
---
diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
index b8163b40bb92..419a5abfc70a 100644
--- a/drivers/spi/spi-qcom-qspi.c
+++ b/drivers/spi/spi-qcom-qspi.c
@@ -381,6 +381,9 @@ static irqreturn_t pio_write(struct qcom_qspi *ctrl)
return IRQ_HANDLED;
}
+#include <linux/kgdb.h>
+bool doug;
+
static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
{
u32 int_status;
@@ -390,6 +393,9 @@ static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
spin_lock_irqsave(&ctrl->lock, flags);
+ if (!doug)
+ kgdb_breakpoint();
+
int_status = readl(ctrl->base + MSTR_INT_STATUS);
writel(int_status, ctrl->base + MSTR_INT_STATUS);
---
In kdb I can trace all the way back to start_kernel:
[0]kdb> bt
Stack traceback for pid 0
0xffffff8009228e80 0 0 1 0 R 0xffffff800922a150 *swapper/0
Call trace:
dump_backtrace+0x0/0x150
show_stack+0x20/0x28
kdb_show_stack+0x5c/0x7c
kdb_bt1.isra.1+0xa0/0x108
kdb_bt+0x3ac/0x3e0
kdb_parse+0x53c/0x614
kdb_main_loop+0x568/0x6f0
kdb_stub+0x28c/0x3a8
kgdb_cpu_enter+0x1d0/0x62c
kgdb_handle_exception+0x184/0x1d8
kgdb_compiled_brk_fn+0x30/0x3c
brk_handler+0x134/0x178
do_debug_exception+0xfc/0x178
el1_dbg+0x18/0x78
kgdb_breakpoint+0x34/0x58
qcom_qspi_irq+0x40/0x28c
__handle_irq_event_percpu+0x1a4/0x3f4
handle_irq_event_percpu+0x38/0x88
handle_irq_event+0x4c/0x7c
handle_fasteoi_irq+0xb4/0x124
generic_handle_irq+0x30/0x44
__handle_domain_irq+0x90/0xbc
gic_handle_irq+0xf4/0x1a4
el1_irq+0xb4/0x130
arch_cpu_idle+0x100/0x1e4
default_idle_call+0x2c/0x34
do_idle+0x10c/0x264
cpu_startup_entry+0x28/0x2c
rest_init+0x254/0x264
start_kernel+0x434/0x48c
---
When I hook up gdb and try a backtrace, you can see it gets stuck at el1_irq:
(gdb) bt
#0 arch_kgdb_breakpoint () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/include/asm/kgdb.h:32
#1 kgdb_breakpoint () at /mnt/host/source/src/third_party/kernel/v4.19/kernel/debug/debug_core.c:1136
#2 0xffffff800865f36c in qcom_qspi_irq (irq=<optimized out>, dev_id=0xffffffc0f859bf50)
at /mnt/host/source/src/third_party/kernel/v4.19/drivers/spi/spi-qcom-qspi.c:397
#3 0xffffff8008134848 in __handle_irq_event_percpu (desc=0xffffffc0fa04f280, flags=0xffffff8008003ee4)
at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:149
#4 0xffffff8008134ad0 in handle_irq_event_percpu (desc=0xffffffc0fa04f280) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:189
#5 0xffffff8008134b6c in handle_irq_event (desc=0xffffffc0fa04f280) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:206
#6 0xffffff800813919c in handle_fasteoi_irq (desc=0xffffffc0fa04f280) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/chip.c:719
#7 0xffffff80081336c8 in generic_handle_irq_desc (desc=<optimized out>) at /mnt/host/source/src/third_party/kernel/v4.19/include/linux/irqdesc.h:154
#8 generic_handle_irq (irq=31) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/irqdesc.c:624
#9 0xffffff8008133e68 in __handle_domain_irq (domain=0xffffffc0fb019880, hwirq=<optimized out>, lookup=true, regs=<optimized out>)
at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/irqdesc.c:661
#10 0xffffff8008081138 in handle_domain_irq (regs=<optimized out>, hwirq=<optimized out>, domain=<optimized out>)
at /mnt/host/source/src/third_party/kernel/v4.19/include/linux/irqdesc.h:172
#11 gic_handle_irq (regs=0xffffff8009203d50) at /mnt/host/source/src/third_party/kernel/v4.19/drivers/irqchip/irq-gic-v3.c:362
#12 0xffffff8008082bf4 in el1_irq () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/kernel/entry.S:622
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
...but if I manually try to trace I can go further. AKA I enter these commands:
frame 12
frame ((void**)$x29)[0] ((void**)$x29)[1]
frame (*(void***)$x29)[0] (*(void***)$x29)[1]
frame (**(void****)$x29)[0] (**(void****)$x29)[1]
frame (***(void*****)$x29)[0] (***(void*****)$x29)[1]
frame (****(void******)$x29)[0] (****(void******)$x29)[1]
frame (*****(void*******)$x29)[0] (*****(void*******)$x29)[1]
frame (******(void********)$x29)[0] (******(void********)$x29)[1]
frame (*******(void*********)$x29)[0] (*******(void*********)$x29)[1]
frame (********(void**********)$x29)[0] (********(void**********)$x29)[1]
frame (*********(void***********)$x29)[0] (*********(void***********)$x29)[1]
...and I see:
(gdb) frame ((void**)$x29)[0] ((void**)$x29)[1]
#0 arch_cpu_idle () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/kernel/process.c:88
88 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
(gdb) frame (*(void***)$x29)[0] (*(void***)$x29)[1]
#0 0xffffff8008134848 in __handle_irq_event_percpu (desc=0xffffffc0fa04f280, flags=0xffffff8008003ee4)
at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:149
149 res = action->handler(irq, action->dev_id);
(gdb) frame (**(void****)$x29)[0] (**(void****)$x29)[1]
#0 0xffffff8008134ad0 in handle_irq_event_percpu (desc=0x80) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:189
189 retval = __handle_irq_event_percpu(desc, &flags);
(gdb) frame (***(void*****)$x29)[0] (***(void*****)$x29)[1]
#0 0xffffff8008134b6c in handle_irq_event (desc=0xffffffc0f859bf98) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:206
206 ret = handle_irq_event_percpu(desc);
(gdb) frame (****(void******)$x29)[0] (****(void******)$x29)[1]
#0 cond_unmask_eoi_irq (chip=<optimized out>, desc=<optimized out>) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/chip.c:665
665 if (!(desc->istate & IRQS_ONESHOT)) {
(gdb) frame (*****(void*******)$x29)[0] (*****(void*******)$x29)[1]
#0 generic_handle_irq (irq=4166631320) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/irqdesc.c:625
625 return 0;
(gdb) frame (******(void********)$x29)[0] (******(void********)$x29)[1]
#0 __handle_domain_irq (domain=0xffffffc0f859bf50, hwirq=<optimized out>, lookup=false, regs=<optimized out>)
at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/irqdesc.c:644
644 int ret = 0;
(gdb) frame (*******(void*********)$x29)[0] (*******(void*********)$x29)[1]
#0 gic_handle_irq (regs=0xffffff800921c000 <nf_conntrack_locks+55296>) at /mnt/host/source/src/third_party/kernel/v4.19/drivers/irqchip/irq-gic-v3.c:
363 if (err) {
(gdb) frame (********(void**********)$x29)[0] (********(void**********)$x29)[1]
#0 0xffffff8008082bf4 in el1_irq () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/kernel/entry.S:622
622 irq_handler
(gdb) frame (*********(void***********)$x29)[0] (*********(void***********)$x29)[1]
#0 arch_cpu_idle () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/kernel/process.c:88
88 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
...I could go further but it gets to be a hassle...
===
Presumably gdb somehow can't trace past el1_irq since it's in assembly. Is there any way we can adjust the assembly, add debug info, or make gdb smarter to handle this?
Here's what gdb knows about the frame above el1_irq and el1_irq:
(gdb) frame 11
#11 gic_handle_irq (regs=0xffffff8009203d50) at /mnt/host/source/src/third_party/kernel/v4.19/drivers/irqchip/irq-gic-v3.c:362
362 err = handle_domain_irq(gic_data.domain, irqnr, regs);
(gdb) info frame
Stack level 11, frame at 0xffffff8008004000:
pc = 0xffffff8008081138 in gic_handle_irq (/mnt/host/source/src/third_party/kernel/v4.19/drivers/irqchip/irq-gic-v3.c:362); saved pc = 0xffffff800808
called by frame at 0xffffff8008004000, caller of frame at 0xffffff8008004000
source language c.
Arglist at 0xffffff8008003fb0, args: regs=0xffffff8009203d50
Locals at 0xffffff8008003fb0, Previous frame's sp is 0xffffff8008004000
Saved registers:
x19 at 0xffffff8008003fc0, x20 at 0xffffff8008003fc8, x21 at 0xffffff8008003fd0, x22 at 0xffffff8008003fd8, x23 at 0xffffff8008003fe0, x24 at 0xffff
x25 at 0xffffff8008003ff0, x26 at 0xffffff8008003ff8, x29 at 0xffffff8008003fb0, x30 at 0xffffff8008003fb8
--
(gdb) frame 12
#12 0xffffff8008082bf4 in el1_irq () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/kernel/entry.S:622
622 irq_handler
(gdb) info frame
Stack level 12, frame at 0xffffff8008004000:
pc = 0xffffff8008082bf4 in el1_irq (/mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/kernel/entry.S:622); saved pc = <not saved>
Outermost frame: previous frame identical to this frame (corrupt stack?)
caller of frame at 0xffffff8008004000
source language asm.
Arglist at 0xffffff8008004000, args:
Locals at 0xffffff8008004000, Previous frame's sp is 0xffffff8008004000
Cannot access memory at address 0xffffff8008004000
,
Nov 28
,
Dec 1
I'm not sure what the issue is, or if there is a real bug here or not. It may be that kdb is just brute-forcing its way through the stack (only stopping if a dereference would literally segfault), while GDB (and kgdb) tends to stop as soon as it thinks the frames are starting to look like junk. In any case, I would need accurate instructions on how to reproduce your issue before I could begin to try to figure out what was really going on.
,
Dec 7
Almost certainly it just needs to be annotated like Luis says. Thanks for the pointer! --- Caroline: I'm happy to provide instructions to reproduce if you want to take a shot at helping w/ the annotations (or confirm that's the problem). It would involve building your own kernel and getting it onto some type of arm64 Chromebook (probably arm64-kevin). In general the set of steps is at <https://chromium.googlesource.com/chromiumos/docs/+/master/kernel_faq.md#Debugging-with-KGDB_KDB> ...I was thinking you'd have to modify a driver to break while in an interrupt like I did above, but then I realized I was being really dumb. If you just break into the debugger with SYSRQ-g over serial you can see this too. In such a case: kdb: [0]kdb> bt Stack traceback for pid 0 0xffffff8009228e40 0 0 1 0 R 0xffffff800922a110 *swapper/0 Call trace: dump_backtrace+0x0/0x130 show_stack+0x20/0x2c kdb_show_stack+0x60/0x84 kdb_bt1+0xbc/0x104 kdb_bt+0x230/0x3c8 kdb_parse+0x3cc/0x5bc kdb_main_loop+0x678/0x6e0 kdb_stub+0x26c/0x374 kgdb_cpu_enter+0x404/0x62c kgdb_handle_exception+0x134/0x1b8 kgdb_compiled_brk_fn+0x34/0x44 brk_handler+0xb8/0xf8 do_debug_exception+0xd4/0x170 el1_dbg+0x18/0x78 kgdb_breakpoint+0x2c/0x50 sysrq_handle_dbg+0x30/0x58 __handle_sysrq+0x170/0x1b0 handle_sysrq+0x38/0x44 qcom_geni_serial_isr+0x2f0/0x314 __handle_irq_event_percpu+0x12c/0x304 handle_irq_event_percpu+0x34/0x8c handle_irq_event+0x48/0x78 handle_fasteoi_irq+0x84/0xf4 generic_handle_irq+0x24/0x3c __handle_domain_irq+0x70/0xb0 gic_handle_irq+0x154/0x1a4 el1_irq+0xb4/0x130 arch_cpu_idle+0xbc/0x17c default_idle_call+0x1c/0x30 do_idle+0xec/0x22c cpu_startup_entry+0x24/0x28 rest_init+0x1f4/0x200 start_kernel+0x3fc/0x484 --- kgdb: (gdb) bt #0 arch_kgdb_breakpoint () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/include/asm/kgdb.h:32 #1 kgdb_breakpoint () at /mnt/host/source/src/third_party/kernel/v4.19/kernel/debug/debug_core.c:1135 #2 0xffffff80081a9088 in sysrq_handle_dbg (key=<optimized out>) at 
/mnt/host/source/src/third_party/kernel/v4.19/kernel/debug/debug_core.c:889 #3 0xffffff80084b7314 in __handle_sysrq (key=103, check_mask=true) at /mnt/host/source/src/third_party/kernel/v4.19/drivers/tty/sysrq.c:620 #4 0xffffff80084b738c in handle_sysrq (key=103) at /mnt/host/source/src/third_party/kernel/v4.19/drivers/tty/sysrq.c:649 #5 0xffffff80084d272c in uart_unlock_and_check_sysrq (port=<optimized out>, irqflags=<optimized out>) at /mnt/host/source/src/third_party/kernel/v4.19/include/linux/serial_core.h:511 #6 qcom_geni_serial_isr (isr=<optimized out>, dev=0xffffff80092b3198 <qcom_geni_console_port>) at /mnt/host/source/src/third_party/kernel/v4.19/drivers/tty/serial/qcom_geni_serial.c:834 #7 0xffffff800814a9d0 in __handle_irq_event_percpu (desc=0xffffffc0f8cc4280, flags=0xffffff8008003eb4) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:149 #8 0xffffff800814abdc in handle_irq_event_percpu (desc=0xffffffc0f8cc4280) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:189 #9 0xffffff800814ac7c in handle_irq_event (desc=0xffffffc0f8cc4280) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/handle.c:206 #10 0xffffff800814e89c in handle_fasteoi_irq (desc=0xffffffc0f8cc4280) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/chip.c:719 #11 0xffffff8008149ae8 in generic_handle_irq_desc (desc=0x67) at /mnt/host/source/src/third_party/kernel/v4.19/include/linux/irqdesc.h:154 #12 generic_handle_irq (irq=<optimized out>) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/irqdesc.c:624 #13 0xffffff8008149b70 in __handle_domain_irq (domain=0xffffffc0fb021880, hwirq=<optimized out>, lookup=true, regs=0xffffff8009203d60) at /mnt/host/source/src/third_party/kernel/v4.19/kernel/irq/irqdesc.c:661 #14 0xffffff8008081154 in handle_domain_irq (domain=0x67, hwirq=<optimized out>, regs=<optimized out>) at /mnt/host/source/src/third_party/kernel/v4.19/include/linux/irqdesc.h:172 #15 gic_handle_irq 
(regs=0xffffff8009203d60) at /mnt/host/source/src/third_party/kernel/v4.19/drivers/irqchip/irq-gic-v3.c:362 #16 0xffffff8008082bf4 in el1_irq () at /mnt/host/source/src/third_party/kernel/v4.19/arch/arm64/kernel/entry.S:622 Backtrace stopped: previous frame identical to this frame (corrupt stack?) --- In case it helps, the assembly code can be found at: https://chromium.googlesource.com/chromiumos/third_party/kernel/+/chromeos-4.19/arch/arm64/kernel/entry.S ...and here's a disassembly while sitting in el1_irq (gdb) disass Dump of assembler code for function el1_irq: 0xffffff8008082b40 <+0>: stp x0, x1, [sp] 0xffffff8008082b44 <+4>: stp x2, x3, [sp, #16] 0xffffff8008082b48 <+8>: stp x4, x5, [sp, #32] 0xffffff8008082b4c <+12>: stp x6, x7, [sp, #48] 0xffffff8008082b50 <+16>: stp x8, x9, [sp, #64] 0xffffff8008082b54 <+20>: stp x10, x11, [sp, #80] 0xffffff8008082b58 <+24>: stp x12, x13, [sp, #96] 0xffffff8008082b5c <+28>: stp x14, x15, [sp, #112] 0xffffff8008082b60 <+32>: stp x16, x17, [sp, #128] 0xffffff8008082b64 <+36>: stp x18, x19, [sp, #144] 0xffffff8008082b68 <+40>: stp x20, x21, [sp, #160] 0xffffff8008082b6c <+44>: stp x22, x23, [sp, #176] 0xffffff8008082b70 <+48>: stp x24, x25, [sp, #192] 0xffffff8008082b74 <+52>: stp x26, x27, [sp, #208] 0xffffff8008082b78 <+56>: stp x28, x29, [sp, #224] 0xffffff8008082b7c <+60>: add x21, sp, #0x140 0xffffff8008082b80 <+64>: mrs x28, sp_el0 0xffffff8008082b84 <+68>: ldr x20, [x28, #8] 0xffffff8008082b88 <+72>: str x20, [sp, #288] 0xffffff8008082b8c <+76>: mov x20, #0x7fffffffff // #549755813887 0xffffff8008082b90 <+80>: str x20, [x28, #8] 0xffffff8008082b94 <+84>: mrs x22, elr_el1 0xffffff8008082b98 <+88>: mrs x23, spsr_el1 0xffffff8008082b9c <+92>: stp x30, x21, [sp, #240] 0xffffff8008082ba0 <+96>: stp x29, x22, [sp, #304] 0xffffff8008082ba4 <+100>: add x29, sp, #0x130 0xffffff8008082ba8 <+104>: stp x22, x23, [sp, #256] 0xffffff8008082bac <+108>: msr daifclr, #0xd 0xffffff8008082bb0 <+112>: bl 0xffffff80081d013c 
<trace_hardirqs_off> 0xffffff8008082bb4 <+116>: adrp x1, 0xffffff8008d7b000 <resource_string.mem_spec+5> 0xffffff8008082bb8 <+120>: ldr x1, [x1, #432] 0xffffff8008082bbc <+124>: mov x0, sp 0xffffff8008082bc0 <+128>: mov x19, sp 0xffffff8008082bc4 <+132>: ldr x25, [x28, #56] 0xffffff8008082bc8 <+136>: eor x25, x25, x19 0xffffff8008082bcc <+140>: and x25, x25, #0xffffffffffffc000 0xffffff8008082bd0 <+144>: cbnz x25, 0xffffff8008082bf0 <el1_irq+176> 0xffffff8008082bd4 <+148>: adrp x25, 0xffffff8009149000 <bp_hardening_data> 0xffffff8008082bd8 <+152>: add x25, x25, #0x50 0xffffff8008082bdc <+156>: mrs x26, tpidr_el2 0xffffff8008082be0 <+160>: ldr x25, [x25, x26] 0xffffff8008082be4 <+164>: mov x26, #0x4000 // #16384 0xffffff8008082be8 <+168>: add x26, x25, x26 0xffffff8008082bec <+172>: mov sp, x26 0xffffff8008082bf0 <+176>: blr x1 => 0xffffff8008082bf4 <+180>: mov sp, x19 0xffffff8008082bf8 <+184>: ldr w24, [x28, #16] 0xffffff8008082bfc <+188>: cbnz w24, 0xffffff8008082c0c <el1_irq+204> 0xffffff8008082c00 <+192>: ldr x0, [x28] 0xffffff8008082c04 <+196>: tbz w0, #1, 0xffffff8008082c0c <el1_irq+204> 0xffffff8008082c08 <+200>: bl 0xffffff8008082c70 <el1_preempt> 0xffffff8008082c0c <+204>: bl 0xffffff80081d0024 <trace_hardirqs_on> 0xffffff8008082c10 <+208>: msr daifset, #0xf 0xffffff8008082c14 <+212>: ldr x20, [sp, #288] 0xffffff8008082c18 <+216>: str x20, [x28, #8] 0xffffff8008082c1c <+220>: ldp x21, x22, [sp, #256] 0xffffff8008082c20 <+224>: msr elr_el1, x21 0xffffff8008082c24 <+228>: msr spsr_el1, x22 0xffffff8008082c28 <+232>: ldp x0, x1, [sp] 0xffffff8008082c2c <+236>: ldp x2, x3, [sp, #16] 0xffffff8008082c30 <+240>: ldp x4, x5, [sp, #32] 0xffffff8008082c34 <+244>: ldp x6, x7, [sp, #48] 0xffffff8008082c38 <+248>: ldp x8, x9, [sp, #64] 0xffffff8008082c3c <+252>: ldp x10, x11, [sp, #80] 0xffffff8008082c40 <+256>: ldp x12, x13, [sp, #96] 0xffffff8008082c44 <+260>: ldp x14, x15, [sp, #112] 0xffffff8008082c48 <+264>: ldp x16, x17, [sp, #128] 0xffffff8008082c4c <+268>: ldp 
x18, x19, [sp, #144] 0xffffff8008082c50 <+272>: ldp x20, x21, [sp, #160] 0xffffff8008082c54 <+276>: ldp x22, x23, [sp, #176] 0xffffff8008082c58 <+280>: ldp x24, x25, [sp, #192] 0xffffff8008082c5c <+284>: ldp x26, x27, [sp, #208] 0xffffff8008082c60 <+288>: ldp x28, x29, [sp, #224] 0xffffff8008082c64 <+292>: ldr x30, [sp, #240] 0xffffff8008082c68 <+296>: add sp, sp, #0x140 0xffffff8008082c6c <+300>: eret End of assembler dump. ...note: the beginning of the disassembly looks really weird, but I think that's because I think we got jumped to from "Exception vectors" and thus ran "kernel_ventry". |
|||||||||||||||||||||||
►
Sign in to add a comment |
|||||||||||||||||||||||
Comment 1 by lloz...@google.com
, Nov 28