From 6fb5368d34ceb0293f0a64d4af45e8dc7617a492 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Tue, 17 Oct 2023 13:22:01 +1300 Subject: [PATCH] Add support for using Intel PT to collect control flow data to debug rr divergences --- CMakeLists.txt | 11 ++ src/AddressSpace.cc | 62 +++++++--- src/AddressSpace.h | 1 + src/PerfCounters.cc | 149 ++++++++++++++++++++++- src/PerfCounters.h | 33 +++++- src/ProcessorTraceDecoder.cc | 222 +++++++++++++++++++++++++++++++++++ src/ProcessorTraceDecoder.h | 80 +++++++++++++ src/RecordCommand.cc | 16 ++- src/RecordSession.cc | 11 +- src/RecordSession.h | 6 +- src/RecordTask.cc | 8 ++ src/ReplayCommand.cc | 15 ++- src/ReplaySession.cc | 166 +++++++++++++++++++++++++- src/ReplaySession.h | 7 +- src/Session.cc | 6 +- src/Session.h | 8 ++ src/Task.cc | 3 +- src/log.cc | 5 + src/util.cc | 45 +++++++ src/util.h | 18 +++ 20 files changed, 834 insertions(+), 38 deletions(-) create mode 100644 src/ProcessorTraceDecoder.cc create mode 100644 src/ProcessorTraceDecoder.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d91bc0a7bc..449741b1fe2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -657,6 +657,12 @@ else() set(CMAKE_INSTALL_INCLUDEDIR "include") endif() +option(pt_decoding "Build with Intel PT decoding enabled.") +if (pt_decoding) + set(RR_SOURCES ${RR_SOURCES} src/ProcessorTraceDecoder.cc) + add_definitions(-DINTEL_PT_DECODING=1) +endif() + add_executable(rr ${RR_SOURCES}) set_target_properties(rr PROPERTIES ENABLE_EXPORTS true) post_build_executable(rr) @@ -683,6 +689,11 @@ if(LOWERCASE_CMAKE_BUILD_TYPE STREQUAL "release") endif() endif() +if (pt_decoding) + find_library(LIBIPT ipt) + target_link_libraries(rr ${LIBIPT}) +endif() + if(LIBRT) target_link_libraries(rr ${LIBRT}) endif() diff --git a/src/AddressSpace.cc b/src/AddressSpace.cc index 56fd97ce1e9..fcaf1935e82 100644 --- a/src/AddressSpace.cc +++ b/src/AddressSpace.cc @@ -286,16 +286,10 @@ remote_code_ptr AddressSpace::find_syscall_instruction_in_vdso(Task* t) { .as_int()); } -void AddressSpace::map_rr_page(AutoRemoteSyscalls& remote) { - int prot = PROT_EXEC | PROT_READ; - int flags = MAP_PRIVATE | MAP_FIXED; - +static string rr_page_file_name(SupportedArch arch, const char** fname_out) { string file_name; - Task* t = remote.task(); - SupportedArch arch = t->arch(); - const char *fname = nullptr; - switch (t->arch()) { + switch (arch) { case x86_64: case aarch64: fname = RRPAGE_LIB_FILENAME; @@ -308,13 +302,29 @@ void AddressSpace::map_rr_page(AutoRemoteSyscalls& remote) { #endif break; } + *fname_out = fname; string path = find_helper_library(fname); if (path.empty()) { - FATAL() << "Failed to locate " << fname << "; needed by " - << t->exe_path() << " (" << arch_name(t->arch()) << ")"; + return path; } path += fname; + return path; +} + +void AddressSpace::map_rr_page(AutoRemoteSyscalls& remote) { + int prot = PROT_EXEC | PROT_READ; + int flags = MAP_PRIVATE | MAP_FIXED; + + Task* t = remote.task(); + SupportedArch arch = t->arch(); + const char* fname; + string path = rr_page_file_name(arch, &fname); + if (path.empty()) { + FATAL() << "Failed to locate " << fname << "; needed by " + << t->exe_path() << " (" << arch_name(arch) << ")"; + } + size_t offset_pages = t->session().is_recording() ? RRPAGE_RECORD_PAGE_OFFSET : RRPAGE_REPLAY_PAGE_OFFSET; size_t offset_bytes = offset_pages * PRELOAD_LIBRARY_PAGE_SIZE; @@ -332,7 +342,7 @@ void AddressSpace::map_rr_page(AutoRemoteSyscalls& remote) { child_fd, offset_bytes); struct stat fstat = t->stat_fd(child_fd); - file_name = t->file_name_of_fd(child_fd); + string file_name = t->file_name_of_fd(child_fd); remote.infallible_close_syscall_if_alive(child_fd); @@ -356,6 +366,25 @@ void AddressSpace::map_rr_page(AutoRemoteSyscalls& remote) { } } +vector AddressSpace::read_rr_page_for_recording(SupportedArch arch) { + const char* fname; + string path = rr_page_file_name(arch, &fname); + if (path.empty()) { + FATAL() << "Failed to locate " << fname; + } + + ScopedFd page(path.c_str(), O_RDONLY); + char buf[PRELOAD_LIBRARY_PAGE_SIZE]; + ssize_t ret = read_to_end(page, + RRPAGE_RECORD_PAGE_OFFSET * PRELOAD_LIBRARY_PAGE_SIZE, buf, sizeof(buf)); + if (ret != PRELOAD_LIBRARY_PAGE_SIZE) { + FATAL() << "Failed to read full page from " << path; + } + vector result; + result.insert(result.end(), buf, buf + sizeof(buf)); + return result; +} + void AddressSpace::unmap_all_but_rr_mappings(AutoRemoteSyscalls& remote, UnmapOptions options) { vector unmaps; @@ -693,14 +722,9 @@ bool AddressSpace::is_breakpoint_in_private_read_only_memory( void AddressSpace::replace_breakpoints_with_original_values( uint8_t* dest, size_t length, remote_ptr addr) { for (auto& it : breakpoints) { - remote_ptr bkpt_location = it.first.to_data_ptr(); - remote_ptr start = max(addr, bkpt_location); - remote_ptr end = - min(addr + length, bkpt_location + bkpt_instruction_length(arch())); - if (start < end) { - memcpy(dest + (start - addr), - it.second.original_data() + (start - bkpt_location), end - start); - } + replace_in_buffer(MemoryRange(it.first.to_data_ptr(), bkpt_instruction_length(arch())), + it.second.original_data(), + MemoryRange(addr, length), dest); } } diff --git a/src/AddressSpace.h b/src/AddressSpace.h index 1e14c2c0b2e..0203b83a1a6 100644 --- a/src/AddressSpace.h +++ b/src/AddressSpace.h @@ -867,6 +867,7 @@ class AddressSpace : public HasTaskSet { }; void map_rr_page(AutoRemoteSyscalls& remote); + static std::vector read_rr_page_for_recording(SupportedArch arch); struct UnmapOptions { bool exclude_vdso_vvar; UnmapOptions() : exclude_vdso_vvar(false) {} diff --git a/src/PerfCounters.cc b/src/PerfCounters.cc index 8fb715c754c..fce76630e76 100644 --- a/src/PerfCounters.cc +++ b/src/PerfCounters.cc @@ -608,12 +608,16 @@ uint32_t PerfCounters::skid_size() { } PerfCounters::PerfCounters(pid_t tid, int cpu_binding, - TicksSemantics ticks_semantics, bool enable) + TicksSemantics ticks_semantics, bool enable, + bool enable_pt) : tid(tid), pmu_index(get_pmu_index(cpu_binding)), ticks_semantics_(ticks_semantics), enable(enable), started(false), counting(false) { if (!supports_ticks_semantics(ticks_semantics)) { FATAL() << "Ticks semantics " << ticks_semantics << " not supported"; } + if (enable_pt) { + pt_state = make_unique(); + } } static void make_counter_async(ScopedFd& fd, int signal) { @@ -647,6 +651,131 @@ static void infallible_perf_event_disable_if_open(ScopedFd& fd) { } } +static uint32_t pt_event_type() { + static const char file_name[] = "/sys/bus/event_source/devices/intel_pt/type"; + ScopedFd fd(file_name, O_RDONLY); + if (!fd.is_open()) { + FATAL() << "Can't open " << file_name << ", PT events not available"; + } + char buf[6]; + ssize_t ret = read(fd, buf, sizeof(buf)); + if (ret < 1 || ret > 5) { + FATAL() << "Invalid value in " << file_name; + } + char* end_ptr; + unsigned long value = strtoul(buf, &end_ptr, 10); + if (end_ptr < buf + ret && *end_ptr && *end_ptr != '\n') { + FATAL() << "Invalid value in " << file_name; + } + return value; +} + +static const size_t PT_PERF_DATA_SIZE = 8*1024*1024; +static const size_t PT_PERF_AUX_SIZE = 32*1024*1024; + +// See https://github.com/intel/libipt/blob/master/doc/howto_capture.md +static void start_pt(pid_t tid, PerfCounters::PTState& state) { + static uint32_t event_type = pt_event_type(); + + struct perf_event_attr attr; + init_perf_event_attr(&attr, event_type, 0); + state.pt_perf_event_fd = start_counter(tid, -1, &attr); + + size_t page_size = sysconf(_SC_PAGESIZE); + void* base = mmap(NULL, page_size + PT_PERF_DATA_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, state.pt_perf_event_fd, 0); + if (base == MAP_FAILED) { + FATAL() << "Can't allocate memory for PT DATA area"; + } + auto header = static_cast(base); + state.mmap_header = header; + + header->aux_offset = header->data_offset + header->data_size; + header->aux_size = PT_PERF_AUX_SIZE; + + void* aux = mmap(NULL, header->aux_size, PROT_READ | PROT_WRITE, MAP_SHARED, + state.pt_perf_event_fd, header->aux_offset); + if (aux == MAP_FAILED) { + FATAL() << "Can't allocate memory for PT AUX area"; + } + state.mmap_aux_buffer = static_cast(aux); +} + +// I wish I knew why this type isn't defined in perf_event.h but is just +// commented out there... +struct PerfEventAux { + struct perf_event_header header; + uint64_t aux_offset; + uint64_t aux_size; + uint64_t flags; + uint64_t sample_id; +}; + +static void flush_pt(PerfCounters::PTState& state) { + uint64_t data_end = state.mmap_header->data_head; + __sync_synchronize(); + char* data_buf = reinterpret_cast(state.mmap_header) + + state.mmap_header->data_offset; + + vector packet; + while (state.mmap_header->data_tail < data_end) { + uint64_t data_start = state.mmap_header->data_tail; + size_t start_offset = data_start % state.mmap_header->data_size; + auto header = reinterpret_cast(data_buf + start_offset); + packet.resize(header->size); + size_t first_chunk_size = min(header->size, + state.mmap_header->data_size - start_offset); + memcpy(packet.data(), header, first_chunk_size); + memcpy(packet.data() + first_chunk_size, data_buf, header->size - first_chunk_size); + state.mmap_header->data_tail += header->size; + + switch (header->type) { + case PERF_RECORD_LOST: + FATAL() << "PT records lost!"; + break; + case PERF_RECORD_ITRACE_START: + break; + case PERF_RECORD_AUX: { + auto aux_packet = reinterpret_cast(packet.data()); + if (aux_packet->flags) { + FATAL() << "Unexpected AUX packet flags " << aux_packet->flags; + } + size_t current_size = state.pt_data.data.size(); + state.pt_data.data.resize(current_size + aux_packet->aux_size); + uint8_t* current = state.pt_data.data.data() + current_size; + size_t aux_start_offset = aux_packet->aux_offset % PT_PERF_AUX_SIZE; + first_chunk_size = min(aux_packet->aux_size, PT_PERF_AUX_SIZE - aux_start_offset); + memcpy(current, state.mmap_aux_buffer + aux_start_offset, first_chunk_size); + memcpy(current + first_chunk_size, state.mmap_aux_buffer, + aux_packet->aux_size - first_chunk_size); + break; + } + default: + FATAL() << "Unknown record " << header->type; + break; + } + } +} + +PTData PerfCounters::extract_pt_data() { + PTData result; + if (pt_state) { + result = std::move(pt_state->pt_data); + } + return result; +} + +void PerfCounters::PTState::stop() { + pt_perf_event_fd.close(); + if (mmap_aux_buffer) { + size_t page_size = sysconf(_SC_PAGESIZE); + munmap(mmap_aux_buffer, mmap_header->aux_size); + mmap_aux_buffer = nullptr; + munmap(mmap_header, page_size + PT_PERF_DATA_SIZE); + mmap_header = nullptr; + } +} + void PerfCounters::reset(Ticks ticks_period) { if (!enable) { return; @@ -689,6 +818,10 @@ void PerfCounters::reset(Ticks ticks_period) { FATAL() << "Failed to SETOWN_EX ticks event fd"; } make_counter_async(fd_ticks_interrupt, PerfCounters::TIME_SLICE_SIGNAL); + + if (pt_state) { + start_pt(tid, *pt_state); + } } else { LOG(debug) << "Resetting counters with period " << ticks_period; @@ -707,6 +840,10 @@ void PerfCounters::reset(Ticks ticks_period) { infallible_perf_event_reset_if_open(fd_ticks_in_transaction); infallible_perf_event_enable_if_open(fd_ticks_in_transaction); + + if (pt_state) { + infallible_perf_event_enable_if_open(pt_state->pt_perf_event_fd); + } } started = true; @@ -724,6 +861,9 @@ void PerfCounters::stop() { return; } started = false; + if (pt_state) { + pt_state->stop(); + } fd_ticks_interrupt.close(); fd_ticks_measure.close(); @@ -744,6 +884,9 @@ void PerfCounters::stop_counting() { infallible_perf_event_disable_if_open(fd_minus_ticks_measure); infallible_perf_event_disable_if_open(fd_ticks_measure); infallible_perf_event_disable_if_open(fd_ticks_in_transaction); + if (pt_state) { + infallible_perf_event_disable_if_open(pt_state->pt_perf_event_fd); + } } } @@ -768,6 +911,10 @@ Ticks PerfCounters::read_ticks(Task* t) { return 0; } + if (pt_state) { + flush_pt(*pt_state); + } + if (fd_ticks_in_transaction.is_open()) { uint64_t transaction_ticks = read_counter(fd_ticks_in_transaction); if (transaction_ticks > 0) { diff --git a/src/PerfCounters.h b/src/PerfCounters.h index 4dc2a3e5e69..1cce8a99590 100644 --- a/src/PerfCounters.h +++ b/src/PerfCounters.h @@ -9,13 +9,16 @@ #include #include +#include #include +#include +#include +#include + #include "ScopedFd.h" #include "Ticks.h" -struct perf_event_attr; - namespace rr { class Task; @@ -25,6 +28,16 @@ enum TicksSemantics { TICKS_TAKEN_BRANCHES, }; +/** + * A buffer of Intel PT control-flow data. + */ +struct PTData { + PTData() {} + explicit PTData(std::vector data) + : data(std::move(data)) {} + std::vector data; +}; + /** * A class encapsulating the performance counters we use to monitor * each task during recording and replay. @@ -41,9 +54,19 @@ class PerfCounters { * When enable is false, we always report 0 and don't do any interrupts. */ PerfCounters(pid_t tid, int cpu_binding, TicksSemantics ticks_semantics, - bool enable); + bool enable, bool enable_pt = false); ~PerfCounters() { stop(); } + struct PTState { + PTData pt_data; + ScopedFd pt_perf_event_fd; + struct perf_event_mmap_page* mmap_header; + char* mmap_aux_buffer = nullptr; + + void stop(); + ~PTState() { stop(); } + }; + void set_tid(pid_t tid); /** @@ -130,6 +153,8 @@ class PerfCounters { */ uint32_t recording_skid_size() { return skid_size() * 5; } + PTData extract_pt_data(); + private: // Only valid while 'counting' is true Ticks counting_period; @@ -150,6 +175,8 @@ class PerfCounters { // aarch64 specific counter to detect use of ll/sc instructions ScopedFd fd_strex_counter; + std::unique_ptr pt_state; + TicksSemantics ticks_semantics_; bool enable; bool started; diff --git a/src/ProcessorTraceDecoder.cc b/src/ProcessorTraceDecoder.cc new file mode 100644 index 00000000000..f36b5f926d8 --- /dev/null +++ b/src/ProcessorTraceDecoder.cc @@ -0,0 +1,222 @@ +/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#include + +#include +#include +#include + +#include "AddressSpace.h" +#include "ProcessorTraceDecoder.h" +#include "Session.h" +#include "log.h" +#include "preload/preload_interface.h" +#include "util.h" + +using namespace std; + +namespace rr { + +static string pt_err(int status) { + char buf[1024]; + sprintf(buf, "%s (%d)", pt_errstr(pt_errcode(status)), status); + return buf; +} + +ProcessorTraceDecoder::~ProcessorTraceDecoder() { + if (decoder) { + pt_insn_free_decoder(decoder); + } +} + +int read_mem_callback(uint8_t *buffer, size_t size, + const pt_asid *, + uint64_t ip, void *context) { + return static_cast(context)-> + read_mem(ip, buffer, size); +} + +static constexpr uint8_t injected_header_packets[] = + { /*PSB*/ 0x02, 0x82, 0x02, 0x82, 0x02, 0x82, 0x02, 0x82, + 0x02, 0x82, 0x02, 0x82, 0x02, 0x82, 0x02, 0x82, + /*CBR 31*/ 0x02, 0x03, 0x1f, 0x00, + /*PSBEND*/ 0x02, 0x23 }; +static constexpr size_t PSB_LEN = 16; + +void ProcessorTraceDecoder::init(const PTData& trace_data) { + if (trace_data.data.empty()) { + return; + } + + if (trace_data.data.size() < PSB_LEN || + memcmp(injected_header_packets, trace_data.data.data(), PSB_LEN)) { + full_trace_data.reserve(sizeof(injected_header_packets) + trace_data.data.size()); + full_trace_data.insert(full_trace_data.end(), injected_header_packets, + injected_header_packets + sizeof(injected_header_packets)); + } + full_trace_data.insert(full_trace_data.end(), trace_data.data.begin(), + trace_data.data.end()); + + init_decoder(); +} + +void ProcessorTraceDecoder::dump_full_trace_data_to_file() { + ScopedFd out("/tmp/ptdata", O_WRONLY | O_CREAT | O_TRUNC, 0700); + write_all(out, full_trace_data.data(), full_trace_data.size()); +} + +void ProcessorTraceDecoder::init_decoder() { + pt_config config; + memset(&config, 0, sizeof(config)); + config.size = sizeof(config); + config.begin = const_cast(full_trace_data.data()); + config.end = const_cast(full_trace_data.data() + full_trace_data.size()); + + decoder = pt_insn_alloc_decoder(&config); + if (!decoder) { + FATAL() << "Cannot create PT decoder"; + } + + pt_image* image = pt_insn_get_image(decoder); + int status = pt_image_set_callback(image, read_mem_callback, this); + if (status < 0) { + FATAL() << "Can't set PT mem callback: " << pt_errstr(pt_errcode(status)); + } + + need_sync = true; +} + +ProcessorTraceDecoder::ProcessorTraceDecoder(Task* task, std::vector full_trace_data, + Mode mode) + : task(task), full_trace_data(full_trace_data), mode(mode), need_sync(false) { + init_decoder(); +} + +static vector* cached_rr_page_for_recording[2]; + +static const vector& rr_page_for_recording(SupportedArch arch) { + if (arch != x86 && arch != x86_64) { + FATAL() << "Unsupported arch " << arch_name(arch); + } + int index = arch == x86_64; + vector* page = cached_rr_page_for_recording[index]; + if (page) { + return *page; + } + page = new vector(); + *page = AddressSpace::read_rr_page_for_recording(arch); + cached_rr_page_for_recording[index] = page; + return *page; +} + +int ProcessorTraceDecoder::read_mem(uint64_t ip, uint8_t *buffer, size_t size) { + ssize_t ret = task->read_bytes_fallible(ip, size, buffer); + if (ret <= 0) { + return ret; + } + if (mode == IS_RECORDING) { + // The rr page instructions differ between recording and replay. The + // task's rr page contains the replay values; if we're analyzing + // PT data from the recording, make sure to use the rr page instructions + // from the recording. + const vector& rr_page_data = rr_page_for_recording(task->arch()); + remote_ptr rr_page_addr(RR_PAGE_ADDR); + remote_ptr ip_addr(ip); + replace_in_buffer(MemoryRange(rr_page_addr, rr_page_data.size()), + rr_page_data.data(), MemoryRange(ip, ret), buffer); + } + replace_in_buffer(MemoryRange(patch_addr, patch_data.size()), + patch_data.data(), MemoryRange(ip, ret), buffer); + return ret; +} + +void ProcessorTraceDecoder::maybe_process_events(int status) { + while (status & pts_event_pending) { + pt_event event; + status = pt_insn_event(decoder, &event, sizeof(event)); + if (status < 0) { + FATAL() << "Cannot get PT event: " << pt_err(status); + } + switch (event.type) { + case ptev_enabled: + case ptev_disabled: + case ptev_async_disabled: + case ptev_exec_mode: + case ptev_cbr: + case ptev_tsx: + break; + default: + FATAL() << "Unhandled event type: " << event.type; + break; + } + } +} + +string ProcessorTraceDecoder::internal_error_context_string() { + ProcessorTraceDecoder helper(task, full_trace_data, mode); + vector instructions; + while (true) { + int pt_status = 0; + Instruction instruction; + if (helper.next_instruction(&instruction, &pt_status)) { + instructions.push_back(instruction); + } else { + break; + } + } + size_t start_index = 0; + stringstream out; + if (instructions.size() > 10000) { + start_index = instructions.size() - 10000; + out << "\n... skipped " << start_index << " instructions"; + } + for (size_t i = start_index; i < instructions.size(); ++i) { + out << "\n" << instructions[i].address; + } + return out.str(); +} + +bool ProcessorTraceDecoder::next_instruction(Instruction* out, int* pt_status) { + if (!decoder) { + return false; + } + + while (true) { + if (need_sync) { + int status = pt_insn_sync_forward(decoder); + if (pt_errcode(status) == pte_eos) { + return false; + } + if (status < 0) { + if (pt_status) { + *pt_status = status; + return false; + } + FATAL() << "Can't sync forward: " << pt_err(status) + << internal_error_context_string(); + } + need_sync = false; + maybe_process_events(status); + } + + pt_insn insn; + int status = pt_insn_next(decoder, &insn, sizeof(insn)); + if (pt_errcode(status) == pte_eos) { + need_sync = true; + continue; + } + if (status < 0) { + if (pt_status) { + *pt_status = status; + return false; + } + FATAL() << "Can't read next instruction: " << pt_err(status) + << internal_error_context_string(); + } + maybe_process_events(status); + out->address = insn.ip; + return true; + } +} + +} \ No newline at end of file diff --git a/src/ProcessorTraceDecoder.h b/src/ProcessorTraceDecoder.h new file mode 100644 index 00000000000..46b7b5e8dce --- /dev/null +++ b/src/ProcessorTraceDecoder.h @@ -0,0 +1,80 @@ +/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#ifndef RR_PROCESSOR_TRACE_DECODER_H_ +#define RR_PROCESSOR_TRACE_DECODER_H_ + +#include + +#include "log.h" +#include "PerfCounters.h" +#include "Task.h" +#include "remote_code_ptr.h" + +struct pt_insn_decoder; + +namespace rr { + +class ProcessorTraceDecoder { +public: + enum Mode { + IS_RECORDING, + IS_REPLAY + }; + + struct Instruction { + remote_code_ptr address; + }; + + ProcessorTraceDecoder(Task* t, const PTData& trace_data, Mode mode) + : task(t), decoder(nullptr), mode(mode), need_sync(false) { + init(trace_data); + } +#ifdef INTEL_PT_DECODING + ~ProcessorTraceDecoder(); + bool next_instruction(Instruction* out, int* pt_status = nullptr); +#else + bool next_instruction(Instruction*, int* = nullptr) + { + FATAL() << "Intel PT support not built"; + return false; + } +#endif + + // Override a memory range with specific data to be seen by libipt. + void set_patch(remote_ptr addr, const std::vector& data) { + patch_addr = addr; + patch_data = data; + } + int read_mem(uint64_t ip, uint8_t *buffer, size_t size); + +private: +#ifdef INTEL_PT_DECODING + void init(const PTData& trace_data); +#else + void init(const PTData&) + { + FATAL() << "Intel PT support not built"; + } +#endif + void init_decoder(); + + void dump_full_trace_data_to_file(); + + ProcessorTraceDecoder(Task* task, std::vector full_trace_data, + Mode mode); + + void maybe_process_events(int status); + std::string internal_error_context_string(); + + Task* const task; + std::vector full_trace_data; + pt_insn_decoder* decoder; + remote_ptr patch_addr; + std::vector patch_data; + Mode mode; + bool need_sync; +}; + +} // namespace rr + +#endif /* RR_PROCESSOR_TRACE_DECODER_H_ */ diff --git a/src/RecordCommand.cc b/src/RecordCommand.cc index c569360137b..aaa5b642384 100644 --- a/src/RecordCommand.cc +++ b/src/RecordCommand.cc @@ -71,6 +71,8 @@ RecordCommand RecordCommand::singleton( " instead of terminating the program. The\n" " signal will still be delivered for user\n" " handlers and debugging.\n" + " --intel-pt Enable PT collection of control flow\n" + " (for debugging rr)\n" " -u, --cpu-unbound allow tracees to run on any virtual CPU.\n" " Default is to bind to a random CPU. This " "option\n" @@ -180,6 +182,10 @@ struct RecordFlags { /* True if we should always enable TSAN compatibility. */ bool tsan; + /* True if we should enable collection of control flow + with PT. */ + bool intel_pt; + RecordFlags() : max_ticks(Scheduler::DEFAULT_MAX_TICKS), ignore_sig(0), @@ -203,7 +209,8 @@ struct RecordFlags { stap_sdt(false), unmap_vdso(false), asan(false), - tsan(false) {} + tsan(false), + intel_pt(false) {} }; static void parse_signal_name(ParsedOption& opt) { @@ -265,6 +272,7 @@ static bool parse_record_arg(vector& args, RecordFlags& flags) { { 16, "disable-avx-512", NO_PARAMETER }, { 17, "asan", NO_PARAMETER }, { 18, "tsan", NO_PARAMETER }, + { 19, "intel-pt", NO_PARAMETER }, { 'c', "num-cpu-ticks", HAS_PARAMETER }, { 'h', "chaos", NO_PARAMETER }, { 'i', "ignore-signal", HAS_PARAMETER }, @@ -478,6 +486,9 @@ static bool parse_record_arg(vector& args, RecordFlags& flags) { case 18: flags.tsan = true; break; + case 19: + flags.intel_pt = true; + break; case 's': flags.always_switch = true; break; @@ -667,7 +678,8 @@ static WaitStatus record(const vector& args, const RecordFlags& flags) { flags.use_syscall_buffer, flags.syscallbuf_desched_sig, flags.bind_cpu, flags.output_trace_dir, flags.trace_id.get(), - flags.stap_sdt, flags.unmap_vdso, flags.asan, flags.tsan); + flags.stap_sdt, flags.unmap_vdso, flags.asan, flags.tsan, + flags.intel_pt); setup_session_from_flags(*session, flags); static_session = session.get(); diff --git a/src/RecordSession.cc b/src/RecordSession.cc index 68ceb57ab33..49d34af2ee9 100644 --- a/src/RecordSession.cc +++ b/src/RecordSession.cc @@ -2291,7 +2291,8 @@ static string lookup_by_path(const string& name) { bool use_audit, bool unmap_vdso, bool force_asan_active, - bool force_tsan_active) { + bool force_tsan_active, + bool intel_pt) { TraceeAttentionSet::initialize(); // The syscallbuf library interposes some critical @@ -2419,7 +2420,8 @@ static string lookup_by_path(const string& name) { shr_ptr session( new RecordSession(full_path, argv, env, disable_cpuid_features, syscallbuf, syscallbuf_desched_sig, bind_cpu, - output_trace_dir, trace_id, use_audit, unmap_vdso)); + output_trace_dir, trace_id, use_audit, unmap_vdso, + intel_pt)); session->excluded_ranges_ = std::move(exe_info.sanitizer_exclude_memory_ranges); session->fixed_global_exclusion_range_ = std::move(exe_info.fixed_global_exclusion_range); return session; @@ -2435,7 +2437,8 @@ RecordSession::RecordSession(const std::string& exe_path, const string& output_trace_dir, const TraceUuid* trace_id, bool use_audit, - bool unmap_vdso) + bool unmap_vdso, + bool intel_pt) : trace_out(argv[0], output_trace_dir, ticks_semantics_), scheduler_(*this), trace_id(trace_id), @@ -2452,6 +2455,8 @@ RecordSession::RecordSession(const std::string& exe_path, wait_for_all_(false), use_audit_(use_audit), unmap_vdso_(unmap_vdso) { + set_intel_pt(intel_pt); + if (!has_cpuid_faulting() && disable_cpuid_features.any_features_disabled()) { FATAL() << "CPUID faulting required to disable CPUID features"; diff --git a/src/RecordSession.h b/src/RecordSession.h index 05fe2eaa684..ed9998ba4e0 100644 --- a/src/RecordSession.h +++ b/src/RecordSession.h @@ -71,7 +71,8 @@ class RecordSession final : public Session { bool use_audit = false, bool unmap_vdso = false, bool force_asan_active = false, - bool force_tsan_active = false); + bool force_tsan_active = false, + bool intel_pt = false); ~RecordSession() override; @@ -219,7 +220,8 @@ class RecordSession final : public Session { const std::string& output_trace_dir, const TraceUuid* trace_id, bool use_audit, - bool unmap_vdso); + bool unmap_vdso, + bool intel_pt); virtual void on_create(Task* t) override; diff --git a/src/RecordTask.cc b/src/RecordTask.cc index d54ddd30d44..e608c7f8f4c 100644 --- a/src/RecordTask.cc +++ b/src/RecordTask.cc @@ -1992,6 +1992,14 @@ void RecordTask::record_event(Event ev, FlushSyscallbuf flush, if (should_checksum(ev, current_time)) { checksum_process_memory(this, current_time); } + if (!ev.has_ticks_slop()) { + // Only associate PT data with events whose timing is + // exactly the same between recording and replay. + PTData pt_data = hpc.extract_pt_data(); + if (!pt_data.data.empty()) { + write_pt_data(this, current_time, pt_data.data); + } + } if (trace_writer().clear_fip_fdp()) { const ExtraRegisters* maybe_extra = extra_regs_fallible(); diff --git a/src/ReplayCommand.cc b/src/ReplayCommand.cc index 84c18fa368a..c4519be2f9f 100644 --- a/src/ReplayCommand.cc +++ b/src/ReplayCommand.cc @@ -69,6 +69,8 @@ ReplayCommand ReplayCommand::singleton( " to run on the CPU stored in the trace.\n" " Note that this may diverge from the recording\n" " in some cases.\n" + " --intel-pt-start-checking-event verify control flow using Intel PT\n" + " (used for debugging rr)\n" " -x, --gdb-x= execute gdb commands from \n" " --stats= display brief stats every N steps (eg 10000).\n" " --serve-files Serve all files from the trace rather than\n" @@ -135,6 +137,8 @@ struct ReplayFlags { string tty; + FrameTime intel_pt_start_checking_event; + ReplayFlags() : goto_event(0), singlestep_to_event(0), @@ -149,7 +153,8 @@ struct ReplayFlags { cpu_unbound(false), share_private_mappings(false), dump_interval(0), - serve_files(false) {} + serve_files(false), + intel_pt_start_checking_event(-1) {} }; static bool parse_replay_arg(vector& args, ReplayFlags& flags) { @@ -176,6 +181,7 @@ static bool parse_replay_arg(vector& args, ReplayFlags& flags) { { 2, "stats", HAS_PARAMETER }, { 3, "serve-files", NO_PARAMETER }, { 4, "tty", HAS_PARAMETER }, + { 5, "intel-pt-start-checking-event", HAS_PARAMETER }, { 'u', "cpu-unbound", NO_PARAMETER }, { 'i', "interpreter", HAS_PARAMETER } }; @@ -267,6 +273,12 @@ static bool parse_replay_arg(vector& args, ReplayFlags& flags) { case 4: flags.tty = opt.value; break; + case 5: + if (!opt.verify_valid_int(1, INT64_MAX)) { + return false; + } + flags.intel_pt_start_checking_event = opt.int_value; + break; case 'u': flags.cpu_unbound = true; break; @@ -344,6 +356,7 @@ static ReplaySession::Flags session_flags(const ReplayFlags& flags) { result.redirect_stdio_file = flags.tty; result.share_private_mappings = flags.share_private_mappings; result.cpu_unbound = flags.cpu_unbound; + result.intel_pt_start_checking_event = flags.intel_pt_start_checking_event; return result; } diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc index 701d2f392a8..8db97c540df 100644 --- a/src/ReplaySession.cc +++ b/src/ReplaySession.cc @@ -9,9 +9,12 @@ #include #include +#include +#include #include "AutoRemoteSyscalls.h" #include "Flags.h" +#include "ProcessorTraceDecoder.h" #include "ReplayTask.h" #include "ThreadGroup.h" #include "core.h" @@ -39,6 +42,156 @@ static void debug_memory(ReplayTask* t) { } } +static string intel_pt_context(ReplayTask* t, const PTData& record_pt_data, + const PTData& replay_pt_data, int instruction_count, + remote_ptr patch_addr, + const vector& patch_data) { + stringstream out; + ProcessorTraceDecoder record_pt_decoder(t, record_pt_data, + ProcessorTraceDecoder::IS_RECORDING); + record_pt_decoder.set_patch(patch_addr, patch_data); + ProcessorTraceDecoder replay_pt_decoder(t, replay_pt_data, + ProcessorTraceDecoder::IS_REPLAY); + replay_pt_decoder.set_patch(patch_addr, patch_data); + int start_at = max(0, instruction_count - 100); + out << "\nrecord and replay (" << start_at << " instructions skipped)"; + for (int i = 0; i < instruction_count; ++i) { + ProcessorTraceDecoder::Instruction tmp; + record_pt_decoder.next_instruction(&tmp); + replay_pt_decoder.next_instruction(&tmp); + if (i >= start_at) { + out << "\n" << tmp.address; + } + } + out << "\nrecord"; + for (int i = 0; i < 10; ++i) { + ProcessorTraceDecoder::Instruction tmp; + if (record_pt_decoder.next_instruction(&tmp)) { + out << "\n" << tmp.address; + } else { + break; + } + } + out << "\nreplay"; + for (int i = 0; i < 10; ++i) { + ProcessorTraceDecoder::Instruction tmp; + if (replay_pt_decoder.next_instruction(&tmp)) { + out << "\n" << tmp.address; + } else { + break; + } + } + return out.str(); +} + +static bool get_next_instruction(ReplayTask* t, ProcessorTraceDecoder& decoder, + ProcessorTraceDecoder::Instruction* out, + int* instruction_count) { + while (true) { + bool result = decoder.next_instruction(out); + if (!result) { + return false; + } + if (out->address == t->vm()->do_breakpoint_fault_addr()) { + ++*instruction_count; + continue; + } + return true; + } +} + +static bool check_intel_pt_internal(ReplayTask* t, + const PTData& replay_pt_data, ostream& stream) { + remote_ptr patch_addr; + vector patch_data; + const Event& ev = t->session().current_trace_frame().event(); + if (ev.type() == EV_PATCH_SYSCALL) { + // The task now has the syscallbuf patch applied. + // But when we executed this code, there was a syscall + // there. + patch_addr = t->ip().to_data_ptr(); + patch_data = syscall_instruction(t->arch()); + } + + FrameTime current_time = t->current_frame_time(); + PTData record_pt_data(read_pt_data(t, current_time)); + ProcessorTraceDecoder record_pt_decoder(t, record_pt_data, + ProcessorTraceDecoder::IS_RECORDING); + record_pt_decoder.set_patch(patch_addr, patch_data); + ProcessorTraceDecoder replay_pt_decoder(t, replay_pt_data, + ProcessorTraceDecoder::IS_REPLAY); + replay_pt_decoder.set_patch(patch_addr, patch_data); + + int instruction_count = 0; + while (true) { + ProcessorTraceDecoder::Instruction record_instruction; + bool got_record_instruction = + get_next_instruction(t, record_pt_decoder, &record_instruction, + &instruction_count); + ProcessorTraceDecoder::Instruction replay_instruction; + bool got_replay_instruction = + get_next_instruction(t, replay_pt_decoder, &replay_instruction, + &instruction_count); + if (got_record_instruction != got_replay_instruction) { + stream << "Instruction sequence ended early: got_record_instruction " + << got_record_instruction << " got_replay_instruction " + << got_replay_instruction + << intel_pt_context(t, record_pt_data, replay_pt_data, instruction_count, + patch_addr, patch_data); + return false; + } + if (!got_record_instruction) { + break; + } + if (record_instruction.address != replay_instruction.address) { + stream << "Control flow diverged" + << intel_pt_context(t, record_pt_data, replay_pt_data, instruction_count, + patch_addr, patch_data); + return false; + } + ++instruction_count; + } + if (instruction_count) { + LOG(debug) << "check_intel_pt verified " << instruction_count << " instructions"; + } + return true; +} + +static void maybe_check_intel_pt(ReplayTask* t) { + const Event& ev = t->session().current_trace_frame().event(); + if (ev.has_ticks_slop()) { + // Only check PT data at events whose timing is exactly + // the same between record and replay. + return; + } + + // Always extract PT data if there is any, and throw it + // away if we don't want to check it. + PTData replay_pt_data = t->hpc.extract_pt_data(); + + FrameTime start_checking_event = + t->session().flags().intel_pt_start_checking_event; + if (start_checking_event < 0) { + return; + } + FrameTime current_time = t->current_frame_time(); + if (start_checking_event > current_time || + !t->session().done_initial_exec()) { + return; + } + stringstream stream; + bool ok = check_intel_pt_internal(t, replay_pt_data, stream); + ASSERT(t, ok) << "Intel PT detected control flow divergence: " + << stream.str(); +} + +void emergency_check_intel_pt(ReplayTask* t, ostream& stream) { + PTData replay_pt_data = t->hpc.extract_pt_data(); + if (!replay_pt_data.data.empty()) { + check_intel_pt_internal(t, replay_pt_data, stream); + } +} + static void split_at_address(ReplaySession::MemoryRanges& ranges, remote_ptr addr) { ReplaySession::MemoryRanges::iterator it = @@ -209,6 +362,8 @@ ReplaySession::ReplaySession(const std::string& dir, const Flags& flags) check_xsave_compatibility(trace_in); } } + + set_intel_pt(flags.intel_pt_start_checking_event >= 0); } ReplaySession::ReplaySession(const ReplaySession& other) @@ -595,12 +750,16 @@ Completion ReplaySession::enter_syscall(ReplayTask* t, // in writeable or shared memory, since in those cases it could be // overwritten by the tracee. It could even be dynamically generated and // not generated yet. + // Also disable the breakpoint optimization when intel_pt is enabled + // since we want to see the execution of the syscall instruction. if (!syscall_bp_vm && t->vm()->is_breakpoint_in_private_read_only_memory( syscall_instruction) && - t->vm()->add_breakpoint(syscall_instruction, BKPT_INTERNAL)) { - syscall_bp_vm = t->vm(); - syscall_bp_addr = syscall_instruction; + !t->session().intel_pt()) { + if (t->vm()->add_breakpoint(syscall_instruction, BKPT_INTERNAL)) { + syscall_bp_vm = t->vm(); + syscall_bp_addr = syscall_instruction; + } } } @@ -2018,6 +2177,7 @@ ReplayResult ReplaySession::replay_step(const StepConstraints& constraints) { } debug_memory(t); + maybe_check_intel_pt(t); check_for_watchpoint_changes(t, result.break_status); check_approaching_ticks_target(t, constraints, result.break_status); diff --git a/src/ReplaySession.h b/src/ReplaySession.h index 36878d879e4..b17092461bc 100644 --- a/src/ReplaySession.h +++ b/src/ReplaySession.h @@ -4,6 +4,7 @@ #define RR_REPLAY_SESSION_H_ #include +#include #include #include "AddressSpace.h" @@ -241,13 +242,15 @@ class ReplaySession final : public Session { : redirect_stdio(false) , share_private_mappings(false) , replay_stops_at_first_execve(false) - , cpu_unbound(false) {} + , cpu_unbound(false) + , intel_pt_start_checking_event(-1) {} Flags(const Flags&) = default; bool redirect_stdio; std::string redirect_stdio_file; bool share_private_mappings; bool replay_stops_at_first_execve; bool cpu_unbound; + FrameTime intel_pt_start_checking_event; }; /** @@ -416,6 +419,8 @@ class ReplaySession final : public Session { remote_code_ptr syscall_bp_addr; }; +void emergency_check_intel_pt(ReplayTask* t, std::ostream& stream); + } // namespace rr #endif // RR_REPLAY_SESSION_H_ diff --git a/src/Session.cc b/src/Session.cc index 7cffc04877b..1e384256aa4 100644 --- a/src/Session.cc +++ b/src/Session.cc @@ -51,7 +51,8 @@ Session::Session() syscall_seccomp_ordering_(PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN), ticks_semantics_(PerfCounters::default_ticks_semantics()), done_initial_exec_(false), - visible_execution_(true) { + visible_execution_(true), + intel_pt_(false) { LOG(debug) << "Session " << this << " created"; } @@ -76,7 +77,8 @@ Session::Session(const Session& other) ticks_semantics_(other.ticks_semantics_), original_affinity_(other.original_affinity_), done_initial_exec_(other.done_initial_exec_), - visible_execution_(other.visible_execution_) {} + visible_execution_(other.visible_execution_), + intel_pt_(other.intel_pt_) {} void Session::on_create(ThreadGroup* tg) { thread_group_map_[tg->tguid()] = tg; } void Session::on_destroy(ThreadGroup* tg) { diff --git a/src/Session.h b/src/Session.h index e51421e1983..a8d38719f6d 100644 --- a/src/Session.h +++ b/src/Session.h @@ -418,6 +418,9 @@ class Session { return dflt; } + void set_intel_pt(bool intel_pt) { intel_pt_ = intel_pt; } + bool intel_pt() const { return intel_pt_; } + protected: Session(); virtual ~Session(); @@ -477,6 +480,11 @@ class Session { * True while the execution of this session is visible to users. */ bool visible_execution_; + + /** + * True while we're collecting Intel PT data. + */ + bool intel_pt_; }; } // namespace rr diff --git a/src/Task.cc b/src/Task.cc index 4bc6136309a..e8c454862d1 100644 --- a/src/Task.cc +++ b/src/Task.cc @@ -75,7 +75,8 @@ Task::Task(Session& session, pid_t _tid, pid_t _rec_tid, uint32_t serial, // This will be initialized when the syscall buffer is. cloned_file_data_fd_child(-1), hpc(_tid, session.cpu_binding(), session.ticks_semantics(), - session.need_performance_counters()), + session.need_performance_counters(), + session.intel_pt()), tid(_tid), rec_tid(_rec_tid > 0 ? _rec_tid : _tid), own_namespace_rec_tid(_rec_tid > 0 ? _rec_tid: _tid), diff --git a/src/log.cc b/src/log.cc index 8ae84be08ce..4ae16b8d68f 100644 --- a/src/log.cc +++ b/src/log.cc @@ -16,6 +16,8 @@ #include "GdbConnection.h" #include "GdbServer.h" #include "RecordSession.h" +#include "ReplaySession.h" +#include "ReplayTask.h" #include "core.h" #include "ftrace.h" #include "kernel_abi.h" @@ -444,6 +446,9 @@ static void emergency_debug(Task* t) { if (record_session) { record_session->close_trace_writer(TraceWriter::CLOSE_ERROR); } + if (t->session().is_replaying()) { + emergency_check_intel_pt(static_cast(t), log_stream()); + } // Capture the log buffer now to prevent the log messages from the trace // stream read below from overwriting any data from the actual failure. diff --git a/src/util.cc b/src/util.cc index 059fd660530..36250e41ce1 100644 --- a/src/util.cc +++ b/src/util.cc @@ -315,6 +315,39 @@ void dump_process_memory(Task* t, FrameTime global_time, const char* tag) { fclose(dump_file); } +void write_pt_data(Task* t, FrameTime global_time, const vector& data) { + char filename[PATH_MAX]; + format_dump_filename(t, global_time, "pt", filename, sizeof(filename)); + FILE* dump_file = fopen64(filename, "w"); + ASSERT(t, dump_file) << "Can't write to file " << filename; + size_t written = fwrite(data.data(), 1, data.size(), dump_file); + ASSERT(t, written == data.size()) << "Failed to write PT data to " << filename; + fclose(dump_file); +} + +vector read_pt_data(Task* t, FrameTime global_time) { + char filename[PATH_MAX]; + format_dump_filename(t, global_time, "pt", filename, sizeof(filename)); + FILE* dump_file = fopen64(filename, "r"); + vector result; + if (!dump_file) { + return result; + } + + while (true) { + char buf[1024*1024]; + size_t bytes_read = fread(buf, 1, sizeof(buf), dump_file); + size_t current_size = result.size(); + result.resize(current_size + bytes_read); + memcpy(result.data() + current_size, buf, bytes_read); + if (bytes_read < sizeof(buf)) { + break; + } + } + fclose(dump_file); + return result; +} + static void notify_checksum_error(ReplayTask* t, FrameTime global_time, unsigned checksum, unsigned rec_checksum, const string& raw_map_line) { @@ -2537,4 +2570,16 @@ int parse_tid_from_proc_path(const std::string& pathname, } return -1; } + +void replace_in_buffer(MemoryRange src, const uint8_t* src_data, + MemoryRange dst, uint8_t* dst_data) { + remote_ptr overlap_start = max(src.start(), dst.start()); + remote_ptr overlap_end = min(src.end(), dst.end()); + if (overlap_start < overlap_end) { + memcpy(dst_data + (overlap_start - dst.start()), + src_data + (overlap_start - src.start()), + overlap_end - overlap_start); + } +} + } // namespace rr diff --git a/src/util.h b/src/util.h index c220849ebc4..fa271ad9524 100644 --- a/src/util.h +++ b/src/util.h @@ -17,6 +17,7 @@ #include #endif +#include "MemoryRange.h" #include "ScopedFd.h" #include "TraceFrame.h" #include "remote_ptr.h" @@ -129,6 +130,16 @@ void checksum_process_memory(Task* t, FrameTime global_time); */ void validate_process_memory(Task* t, FrameTime global_time); +/** + * Write raw PT data to a file in the trace dir. + */ +void write_pt_data(Task* t, FrameTime global_time, const std::vector& data); + +/** + * Read raw PT data to a file in the trace dir. Returns an empty vector if none found. + */ +std::vector read_pt_data(Task* t, FrameTime global_time); + /** * Return nonzero if the rr session is probably not interactive (that * is, there's probably no user watching or interacting with rr), and @@ -630,6 +641,13 @@ inline unsigned long long dczid_el0_block_size(void) { #endif } +/** + * If `src` overlaps `dst`, replace the bytes in `dst_data` from the range `dst` + * with the corresponding bytes in `src_data` from the range `src`. + */ +void replace_in_buffer(MemoryRange src, const uint8_t* src_data, + MemoryRange dst, uint8_t* dst_data); + } // namespace rr #endif /* RR_UTIL_H_ */