diff --git a/CMakeLists.txt b/CMakeLists.txt index 01e8b9434fa..d2599e0c60d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1312,6 +1312,7 @@ set(BASIC_TESTS timer timerfd times + tracee_unmap_vdso truncate_temp tun two_signals_with_mask diff --git a/src/AddressSpace.cc b/src/AddressSpace.cc index 13ba0988623..f10279a3b9c 100644 --- a/src/AddressSpace.cc +++ b/src/AddressSpace.cc @@ -269,12 +269,8 @@ static uint32_t find_offset_of_syscall_instruction_in(SupportedArch arch, uint32_t AddressSpace::offset_to_syscall_in_vdso[SupportedArch_MAX + 1]; -remote_code_ptr AddressSpace::find_syscall_instruction(Task* t) { +remote_code_ptr AddressSpace::find_syscall_instruction_in_vdso(Task* t) { SupportedArch arch = t->arch(); - // This assert passes even if --unmap-vdso is passed because this only ever - // gets called at the start of process_execve before we unmap the vdso. After - // the rr page is mapped in, we use the syscall instructions contained therein - ASSERT(t, has_vdso()) << "Kernel with vDSO disabled?"; if (!offset_to_syscall_in_vdso[arch]) { auto vdso_data = t->read_mem(vdso().start().cast(), vdso().size()); offset_to_syscall_in_vdso[arch] = find_offset_of_syscall_instruction_in( @@ -595,7 +591,7 @@ void AddressSpace::read_mm_map(Task* t, NativeArch::prctl_mm_map* map) { void AddressSpace::post_exec_syscall(Task* t) { // First locate a syscall instruction we can use for remote syscalls. 
- traced_syscall_ip_ = find_syscall_instruction(t); + traced_syscall_ip_ = find_syscall_instruction_in_vdso(t); privileged_traced_syscall_ip_ = nullptr; do_breakpoint_fault_addr_ = nullptr; @@ -1316,12 +1312,22 @@ void AddressSpace::unmap(Task* t, remote_ptr addr, ssize_t num_bytes) { return unmap_internal(t, addr, num_bytes); } -void AddressSpace::unmap_internal(Task*, remote_ptr addr, +void AddressSpace::did_unmap_rr_page(Task* t, const Mapping& m) { + if (m.map.contains(traced_syscall_ip_.to_data_ptr())) { + traced_syscall_ip_ = find_syscall_instruction_in_vdso(t); + } + privileged_traced_syscall_ip_ = nullptr; +} + +void AddressSpace::unmap_internal(Task* t, remote_ptr addr, ssize_t num_bytes) { LOG(debug) << "munmap(" << addr << ", " << num_bytes << ")"; - auto unmapper = [this](Mapping m, MemoryRange rem) { + auto unmapper = [this, t](Mapping m, MemoryRange rem) { LOG(debug) << " unmapping (" << rem << ") ..."; + if (m.map.start() == rr_page_start()) { + did_unmap_rr_page(t, m); + } remove_from_map(m.map); diff --git a/src/AddressSpace.h b/src/AddressSpace.h index c7f3b33a731..4902afbe97d 100644 --- a/src/AddressSpace.h +++ b/src/AddressSpace.h @@ -759,12 +759,12 @@ class AddressSpace : public HasTaskSet { static remote_ptr rr_page_record_ff_bytes() { return RR_PAGE_FF_BYTES; } /** - * Locate a syscall instruction in t's VDSO. + * Locate a syscall instruction in t's VDSO (the real one, not our fake one). * This gives us a way to execute remote syscalls without having to write * a syscall instruction into executable tracee memory (which might not be * possible with some kernels, e.g. PaX). */ - remote_code_ptr find_syscall_instruction(Task* t); + remote_code_ptr find_syscall_instruction_in_vdso(Task* t); /** * Task |t| just forked from this address space. 
Apply dont_fork and @@ -922,6 +922,7 @@ class AddressSpace : public HasTaskSet { void populate_address_space(Task* t); void unmap_internal(Task* t, remote_ptr addr, ssize_t num_bytes); + void did_unmap_rr_page(Task* t, const Mapping& m); bool update_watchpoint_value(const MemoryRange& range, Watchpoint& watchpoint); diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc index 263a245fec0..701d2f392a8 100644 --- a/src/ReplaySession.cc +++ b/src/ReplaySession.cc @@ -1654,7 +1654,13 @@ static void end_task(ReplayTask* t) { t->destroy_buffers(); Registers r = t->regs(); - r.set_ip(t->vm()->privileged_traced_syscall_ip()); + remote_code_ptr syscall_ip = t->vm()->privileged_traced_syscall_ip(); + if (!syscall_ip) { + // Fall back to unprivileged. If someone uses a seccomp policy to + // block `exit` *and* unmaps the rr page, they lose. + syscall_ip = t->vm()->traced_syscall_ip(); + } + r.set_ip(syscall_ip); r.set_syscallno(syscall_number_for_exit(t->arch())); t->set_regs(r); // Enter the syscall. 
diff --git a/src/test/tracee_unmap_vdso.c b/src/test/tracee_unmap_vdso.c new file mode 100644 index 00000000000..19278bd84d9 --- /dev/null +++ b/src/test/tracee_unmap_vdso.c @@ -0,0 +1,22 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#include "util.h" + +int main(void) { + char* vdso = (char*)getauxval(AT_SYSINFO_EHDR); + size_t page_size = sysconf(_SC_PAGESIZE); + munmap(vdso, 4*page_size); + + pid_t child = fork(); + if (!child) { + return 77; + } + int status; + int ret = waitpid(child, &status, 0); + test_assert(ret == child); + test_assert(WIFEXITED(status)); + test_assert(WEXITSTATUS(status) == 77); + + atomic_puts("EXIT-SUCCESS"); + return 0; +} diff --git a/src/test/tracee_unmap_vdso.run b/src/test/tracee_unmap_vdso.run new file mode 100644 index 00000000000..2285e9a6f7f --- /dev/null +++ b/src/test/tracee_unmap_vdso.run @@ -0,0 +1,6 @@ +source `dirname $0`/util.sh +# Unmapping the rr page breaks syscallbuf; we don't support that. +skip_if_syscall_buf +# Unmapping the VDSO breaks all syscalls on 32-bit x86. +skip_if_32_bit +compare_test EXIT-SUCCESS