debugd: crash when closing fd: bad file descriptor |
|||||||||||||
Issue descriptionChrome Version: 60.0.3096.0 Chrome OS Version: 9544.0.0 No specific steps to repro. crash report:https://crash.corp.google.com/browse?stbtiq=cc4a0ed8d0000000#0 Stack Quality91%Show frame trust levels 0xe7433b86 (libc-2.23.so + 0x00016b86 ) __libc_do_syscall 0xe7442de3 (libc-2.23.so -raise.c:54 ) raise 0xe7443d55 (libc-2.23.so -abort.c:89 ) abort 0xe775c547 (libbase-core-395517.so -debugger_posix.cc:249 ) base::debug::BreakDebugger() 0xe776e8ed (libbase-core-395517.so -logging.cc:755 ) logging::LogMessage::~LogMessage() 0xe776ebe9 (libbase-core-395517.so -logging.cc:877 ) logging::ErrnoLogMessage::~ErrnoLogMessage() 0xe7768c95 (libbase-core-395517.so -scoped_file.cc:30 ) base::internal::ScopedFDCloseTraits::Free(int) 0xe7763de3 (libbase-core-395517.so -scoped_generic.h:149 ) base::File::Close() 0xe775dea7 (libbase-core-395517.so -file.cc:61 ) base::File::~File() 0xe76da24f (libbase-dbus-395517.so -file_descriptor.cc:30 ) <name omitted> 0xad1adfff (debugd -dbus_param_reader.h:112 ) brillo::dbus_utils::DBusParamReader<true, const dbus::FileDescriptor &>::InvokeHelper<const dbus::FileDescriptor &, (lambda at ../../../../../../../usr/include/brillo/dbus/dbus_object_internal_impl.h:162:28), unsigned int, std::vector<std::basic_string<char>, std::allocator<std::basic_string<char> > > > 0xad1adb73 (debugd -dbus_param_reader.h:59 ) brillo::dbus_utils::DBusParamReader<true, const std::vector<std::basic_string<char>, std::allocator<std::basic_string<char> > > &, const dbus::FileDescriptor &>::InvokeHelper<const std::vector<std::basic_string<char>, std::allocator<std::basic_string<char> > > &, (lambda at ../../../../../../../usr/include/brillo/dbus/dbus_object_internal_impl.h:162:28), unsigned int> 0xad1ad889 (debugd -dbus_param_reader.h:59 ) brillo::dbus_utils::DBusParamReader<true, unsigned int, const std::vector<std::basic_string<char>, std::allocator<std::basic_string<char> > > &, const dbus::FileDescriptor &>::InvokeHelper<unsigned int, (lambda at ../../../../../../../usr/include/brillo/dbus/dbus_object_internal_impl.h:162:28)> 0xad1ad80b (debugd -dbus_param_reader.h:59 ) brillo::dbus_utils::SimpleDBusInterfaceMethodHandlerWithError<unsigned int, std::vector<std::string, std::allocator<std::string> > const&, dbus::FileDescriptor const&>::HandleMethod(dbus::MethodCall*, base::Callback<void (std::unique_ptr<dbus::Response, std::default_delete<dbus::Response> >), (base::internal::CopyMode)1>) 0xe7819c13 (libbrillo-core-395517.so -dbus_object.cc:129 ) brillo::dbus_utils::DBusInterface::HandleMethodCall(dbus::MethodCall*, base::Callback<void (std::unique_ptr<dbus::Response, std::default_delete<dbus::Response> >), (base::internal::CopyMode)1>) 0xe781bb0d (libbrillo-core-395517.so -bind_internal.h:186 ) base::internal::Invoker<base::IndexSequence<0u>, base::internal::BindState<base::internal::RunnableAdapter<void (brillo::dbus_utils::DBusInterface::*)(dbus::MethodCall*, base::Callback<void (std::unique_ptr<dbus::Response, std::default_delete<dbus::Response> >), (base::internal::CopyMode)1>)>, void (brillo::dbus_utils::DBusInterface*, dbus::MethodCall*, base::Callback<void (std::unique_ptr<dbus::Response, std::default_delete<dbus::Response> >), (base::internal::CopyMode)1>), base::internal::UnretainedWrapper<brillo::dbus_utils::DBusInterface> >, base::internal::InvokeHelper<false, void, base::internal::RunnableAdapter<void (brillo::dbus_utils::DBusInterface::*)(dbus::MethodCall*, base::Callback<void (std::unique_ptr<dbus::Response, std::default_delete<dbus::Response> >), (base::internal::CopyMode)1>)> >, void (dbus::MethodCall*, base::Callback<void (std::unique_ptr<dbus::Response, std::default_delete<dbus::Response> >), (base::internal::CopyMode)1>)>::Run(base::internal::BindStateBase*, dbus::MethodCall*&&, base::Callback<void (std::unique_ptr<dbus::Response, std::default_delete<dbus::Response> >), (base::internal::CopyMode)1>&&) 0xe76d8e8f (libbase-dbus-395517.so -callback.h:397 ) dbus::ExportedObject::HandleMessage(DBusConnection*, DBusMessage*) 0xe72523f7 (libdbus-1.so.3.14.8 -dbus-object-tree.c:1020 ) _dbus_object_tree_dispatch_and_unlock 0xe7244b6d (libdbus-1.so.3.14.8 -dbus-connection.c:4750 ) dbus_connection_dispatch 0xe76d2625 (libbase-dbus-395517.so -bus.cc:815 ) dbus::Bus::ProcessAllIncomingDataIfAny() 0xe775d503 (libbase-core-395517.so -callback.h:397 ) base::debug::TaskAnnotator::RunTask(char const*, base::PendingTask const&) 0xe77733c9 (libbase-core-395517.so -message_loop.cc:478 ) base::MessageLoop::RunTask(base::PendingTask const&) 0xe7773659 (libbase-core-395517.so -message_loop.cc:487 ) base::MessageLoop::DeferOrRunPendingTask(base::PendingTask const&) 0xe77738b5 (libbase-core-395517.so -message_loop.cc:604 ) base::MessageLoop::DoWork() 0xe7775527 (libbase-core-395517.so -message_pump_libevent.cc:219 ) base::MessagePumpLibevent::Run(base::MessagePump::Delegate*) 0xe778f487 (libbase-core-395517.so -run_loop.cc:35 ) base::RunLoop::Run() 0xe77fbbb3 (libbrillo-core-395517.so -base_message_loop.cc:212 ) brillo::BaseMessageLoop::Run() 0xe77f66ff (libbrillo-core-395517.so -daemon.cc:29 ) brillo::Daemon::Run() 0xad1a3cff (debugd -main.cc:77 ) main 0xe74338b7 (libc-2.23.so -libc-start.c:289 ) __libc_start_main 0xad1a3adb (debugd + 0x0000aadb ) _start 0xad1ec673 (debugd -elf-init.c:87 ) __libc_csu_init 0xe7843f3b (ld-2.23.so + 0x0000bf3b ) _dl_sort_fini 0xad1a3aa7 (debugd -key.pb.cc:70 ) <name omitted>
,
May 15 2017
Issue 701368 has been merged into this issue.
,
May 15 2017
Issue 694903 has been merged into this issue.
,
May 15 2017
Issue 698586 has been merged into this issue.
,
May 18 2017
Repro'd this crash on Kevin 9563.0.0,60.0.3102.0 while moving app window to external monitor Crash reports: https://crash.corp.google.com/browse?stbtiq=0ca97345a8000000 https://crash.corp.google.com/browse?stbtiq=cbaaf53970000000 Device logs: https://storage.cloud.google.com/chromiumos-test-logs/bugfiles/cr/722501/DebugLogsKevinCrashComment%235.tgz?_ga=1.34481697.1204488054.1493153045
,
May 24 2017
i suspect this is related to the recent debugd cleanups
,
May 24 2017
example log from my local VM: 2017-05-23T23:50:29.363820+00:00 CRIT debugd[29599]: Check failed: 0 == IGNORE_EINTR(close(fd)). : Bad file descriptor#012/usr/lib64/libbase-core-395517.so(base::debug::StackTrace::StackTrace()+0x13) [0x7d2abe18af03]#012
,
May 24 2017
Moving an app window to an external monitor seems unrelated. vapier, did you see which D-Bus method is causing this issue?
,
May 24 2017
I was able to repro with: $ dbus-send --print-reply --system --dest=org.chromium.debugd /org/chromium/debugd org.chromium.debugd.GetPerfOutputFd uint32:60 array:string:perf,stat fd:1 It does start the process, which eventually returns garbled output, but also crashes debugd with the message given in c#7. Time to figure out what went wrong.
,
May 24 2017
This is super weird. The file descriptor is only closed once, so getting EBADF would imply that it was never open to begin with. However, FileDescriptor::CheckValidity seems to work fine, and looking at the table of open FDs *right after this error occurs* shows that it's open and refers to a socket. I'm not sure if this socket-ness has something to do with it, but it might be an artifact of how FDs are passed through unix sockets, or some D-Bus internal details. Either way, EBADF seems like a spurious and incorrect error here, but it's an extremely reliable repro and doesn't seem to show up with the other process-spawning functions.
,
May 24 2017
afaik, when you send a fd via sockets, it's as if you dup-ed them. that is, both sides have their own copy of the fd, and are free to close/whatever them w/out impacting the other side.
,
May 24 2017
OK, strace is shedding more light on the issue.
epoll_wait(3, [{EPOLLIN, {u32=9, u64=9}}], 32, -1) = 1
recvmsg(9, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="l\1\0\1$\0\0\0\2\0\0\0\236\0\0\0\1\1o\0\24\0\0\0/org/chr"..., iov_len=2048}], msg_iovlen=1, msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, cmsg_data=[10]}], msg_controllen=24, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 212
fcntl(10, F_GETFD) = 0x1 (flags FD_CLOEXEC)
fcntl(10, F_SETFD, FD_CLOEXEC) = 0
recvmsg(9, {msg_namelen=0}, MSG_CMSG_CLOEXEC) = -1 EAGAIN (Resource temporarily unavailable)
gettid() = 1282
write(7, "\0", 1) = 1
gettid() = 1282
gettid() = 1282
gettid() = 1282
fcntl(10, F_DUPFD_CLOEXEC, 3) = 11
fstat(11, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
fstat(1, {st_mode=S_IFCHR|0666, st_rdev=makedev(1, 3), ...}) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x79da3e938a10) = 3260
wait4(3260, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 3260
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=3260, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
close(11) = 0
close(-1) = -1 EBADF (Bad file descriptor)
gettid() = 1282
gettid() = 1282
sendmsg(9, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="l\2\1\1\0\0\0\0\t\0\0\0\30\0\0\0\6\1s\0\5\0\0\0:1.52\0\0\0"..., iov_len=40}, {iov_base="", iov_len=0}], msg_iovlen=2, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 40
gettid() = 1282
close(11) = -1 EBADF (Bad file descriptor)
The first recvmsg is debugd reading a D-Bus message from the socket. We can tell from the cmsg_data that the descriptor passed as part of the D-Bus message looks like it's actually fd 10, which is /dev/pts/0 and thus correctly refers to the calling terminal:
3962 debugd 0 /dev/null
3962 debugd 1 /dev/null
3962 debugd 2 /dev/null
3962 debugd 3 anon_inode:[eventpoll]
3962 debugd 4 pipe:[16959]
3962 debugd 5 pipe:[16959]
3962 debugd 6 pipe:[16960]
3962 debugd 7 pipe:[16960]
3962 debugd 8 anon_inode:[signalfd]
3962 debugd 9 socket:[16961]
3962 debugd 10 /dev/pts/0
3962 debugd 11 socket:[17051]
fd 11 is actually being used to send the logs to /dev/log and thus it's kind of a red herring here. There are two places where we close a socket and get EBADF and one of them looks like a double close that happens after the D-Bus response is sent, which means this stack trace doesn't actually make much sense?
There are a few places where this close call could happen. ProcessImpl::Start closes fds that are not bound in the parent, but since we used ProcessImpl::BindFd to bind them they should be bound and we don't have to worry about this. ProcessImpl::~ProcessImpl calls ProcessImpl::Reset which does close all file descriptors, however.
We run into this problem in GetPerfOutputFd and not the Ping, Tracepath, Memtester etc. tools because the latter tools bind the FD and leave the process running to be cleaned up later, so the dbus::FileDescriptor destructor closes them first. The ProcessImpl destructor does not care if it runs into EBADF errors, and doesn't even check the return value save for wrapping it in IGNORE_EINTR, so when we clean up during PingStop or the like, the "failed" close calls are just dropped on the floor.
This can be fixed by releasing (TakeValue) the fd into the ProcessImpl pipe map if the process does not outlive the D-Bus method handler. I'll put up a CL to fix this shortly.
,
May 24 2017
With dbus::FileDescriptor that may take ownership of the file descriptor received over DBus (while DBusFileDescriptor from dbus-c++ doesn't), debugd::SandboxedProcess shouldn't assume ownership of the same file descriptor that is passed to debugd::SandboxedProcess::BindFd(). debugd::SandboxedProcess was changed to close all bound file descriptors in https://chromium-review.googlesource.com/c/346762/, which is no longer correct after the migration to dbus::FileDescriptor.
,
May 30 2017
The following revision refers to this bug: https://chromium.googlesource.com/chromiumos/platform2/+/a6f1adba577727b6148d8505530c21b11a90e285 commit a6f1adba577727b6148d8505530c21b11a90e285 Author: Eric Caruso <ejcaruso@chromium.org> Date: Tue May 30 23:29:02 2017 debugd: don't close bound fds in SandboxedProcess dbus::FileDescriptor contains a base::ScopedFD, which fails a CHECK on destruction if the close call fails with EBADF. This means you can't double-close a ScopedFD. This is a problem if it's stored in a SandboxedProcess that doesn't outlive the dbus::FileDescriptor because the ProcessImpl closes all FDs in its pipe map on destruction. Unfortunately, the dbus::FileDescriptors are passed as const & from the D-Bus bindings, so TakeValue() is not an option without a const cast. Additionally, if BindFd was supposed to take ownership of the FD its API should take a ScopedFD instead to signal this. BUG= chromium:722501 TEST=use SystraceTool, PerfTool and ensure they don't crash debugd via the double-close CHECK; also watch the FD table and make sure no FDs are leaked Change-Id: I0b748e34f311f04dda33287f4e85ce4bf6970e27 Reviewed-on: https://chromium-review.googlesource.com/514522 Commit-Ready: Eric Caruso <ejcaruso@chromium.org> Tested-by: Eric Caruso <ejcaruso@chromium.org> Reviewed-by: Ben Chan <benchan@chromium.org> [modify] https://crrev.com/a6f1adba577727b6148d8505530c21b11a90e285/debugd/src/sandboxed_process.cc [modify] https://crrev.com/a6f1adba577727b6148d8505530c21b11a90e285/debugd/src/sandboxed_process.h
,
May 31 2017
,
May 31 2017
i think this landed after the M60 branch ? this would be good to cherry pick back so we don't get known noise in M60 crash stats.
,
May 31 2017
,
Jun 1 2017
Your change meets the bar and is auto-approved for M60. Please go ahead and merge the CL to branch 3112 manually. Please contact milestone owner if you have questions. Owners: amineer@(Android), cmasso@(iOS), josafat@(ChromeOS), bustamante@(Desktop) For more details visit https://www.chromium.org/issue-tracking/autotriage - Your friendly Sheriffbot
,
Jun 1 2017
The following revision refers to this bug: https://chromium.googlesource.com/chromiumos/platform2/+/fa44df88d8a48e08c97f7ea5e84b911d5e44bd55 commit fa44df88d8a48e08c97f7ea5e84b911d5e44bd55 Author: Eric Caruso <ejcaruso@chromium.org> Date: Thu Jun 01 20:36:48 2017 debugd: don't close bound fds in SandboxedProcess dbus::FileDescriptor contains a base::ScopedFD, which fails a CHECK on destruction if the close call fails with EBADF. This means you can't double-close a ScopedFD. This is a problem if it's stored in a SandboxedProcess that doesn't outlive the dbus::FileDescriptor because the ProcessImpl closes all FDs in its pipe map on destruction. Unfortunately, the dbus::FileDescriptors are passed as const & from the D-Bus bindings, so TakeValue() is not an option without a const cast. Additionally, if BindFd was supposed to take ownership of the FD its API should take a ScopedFD instead to signal this. BUG= chromium:722501 TEST=use SystraceTool, PerfTool and ensure they don't crash debugd via the double-close CHECK; also watch the FD table and make sure no FDs are leaked Change-Id: I0b748e34f311f04dda33287f4e85ce4bf6970e27 Reviewed-on: https://chromium-review.googlesource.com/514522 Commit-Ready: Eric Caruso <ejcaruso@chromium.org> Tested-by: Eric Caruso <ejcaruso@chromium.org> Reviewed-by: Ben Chan <benchan@chromium.org> (cherry picked from commit a6f1adba577727b6148d8505530c21b11a90e285) Reviewed-on: https://chromium-review.googlesource.com/521786 Reviewed-by: Eric Caruso <ejcaruso@chromium.org> Commit-Queue: Eric Caruso <ejcaruso@chromium.org> Trybot-Ready: Eric Caruso <ejcaruso@chromium.org> [modify] https://crrev.com/fa44df88d8a48e08c97f7ea5e84b911d5e44bd55/debugd/src/sandboxed_process.cc [modify] https://crrev.com/fa44df88d8a48e08c97f7ea5e84b911d5e44bd55/debugd/src/sandboxed_process.h
,
Jun 5 2017
This issue has been approved for a merge. Please merge the fix to any appropriate branches as soon as possible! If all merges have been completed, please remove any remaining Merge-Approved labels from this issue. Thanks for your time! To disable nags, add the Disable-Nags label. For more details visit https://www.chromium.org/issue-tracking/autotriage - Your friendly Sheriffbot
,
Jun 5 2017
,
Nov 20 2017
No crash reports on recent builds. closing it now.
,
Dec 27 2017
Observing via M64-BETA-CHROMEOS-2 (10176.22.0, 64.0.3282.41) Re-Opening https://crash.corp.google.com/browse?q=product.name%3D%27ChromeOS%27%20AND%20stable_signature%3D%27raise-f2b94ca6%27%20AND%20product.Version%3D%2710176.22.0%27&sql_dialect=googlesql&ignore_case=false&enable_rewrite=true&omit_field_name=&omit_field_value=&omit_field_opt=%3D&stbtiq=&reportid=&index=0
,
Dec 27 2017
,
Dec 27 2017
both of those are unrelated to this bug, and the stack traces look nothing alike each other. please file new bugs.
,
Jan 5 2018
Will do; was associated by crash reporter |
|||||||||||||
►
Sign in to add a comment |
|||||||||||||
Comment 1 by abod...@chromium.org
, May 15 2017