diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 107136e6..050c5398 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,8 +16,8 @@ endif() if(UNIX) # LINUX, FREE_BSD, APPLE if (NOT APPLE) set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} -s") # --strip-unneeded for packaging - list(APPEND PROJECT_NAMES pcm-sensor-server) endif() + list(APPEND PROJECT_NAMES pcm-sensor-server) list(APPEND PROJECT_NAMES pcm-sensor) # libpcm.a diff --git a/src/cpucounters.cpp b/src/cpucounters.cpp index e2b4609e..0e7e36fa 100644 --- a/src/cpucounters.cpp +++ b/src/cpucounters.cpp @@ -889,8 +889,13 @@ constexpr auto perfBadSpecPath = "/sys/bus/event_source/devices/cpu/events/topdo constexpr auto perfBackEndPath = "/sys/bus/event_source/devices/cpu/events/topdown-be-bound"; constexpr auto perfFrontEndPath = "/sys/bus/event_source/devices/cpu/events/topdown-fe-bound"; constexpr auto perfRetiringPath = "/sys/bus/event_source/devices/cpu/events/topdown-retiring"; +// L2 extensions: +constexpr auto perfBrMispred = "/sys/bus/event_source/devices/cpu/events/topdown-br-mispredict"; +constexpr auto perfFetchLat = "/sys/bus/event_source/devices/cpu/events/topdown-fetch-lat"; +constexpr auto perfHeavyOps = "/sys/bus/event_source/devices/cpu/events/topdown-heavy-ops"; +constexpr auto perfMemBound = "/sys/bus/event_source/devices/cpu/events/topdown-mem-bound"; -bool perfSupportsTopDown() +bool PCM::perfSupportsTopDown() { static int yes = -1; if (-1 == yes) @@ -900,7 +905,16 @@ bool perfSupportsTopDown() const auto be = readSysFS(perfBackEndPath, true); const auto fe = readSysFS(perfFrontEndPath, true); const auto ret = readSysFS(perfRetiringPath, true); - yes = (slots.size() && bad.size() && be.size() && fe.size() && ret.size()) ? 1 : 0; + bool supported = slots.size() && bad.size() && be.size() && fe.size() && ret.size(); + if (isHWTMAL2Supported()) + { + supported = supported && + readSysFS("/sys/bus/event_source/devices/cpu/events/topdown-br-mispredict", true).size() && + readSysFS("/sys/bus/event_source/devices/cpu/events/topdown-fetch-lat", true).size() && + readSysFS("/sys/bus/event_source/devices/cpu/events/topdown-heavy-ops", true).size() && + readSysFS("/sys/bus/event_source/devices/cpu/events/topdown-mem-bound", true).size(); + } + yes = supported ? 1 : 0; } return 1 == yes; } @@ -1535,6 +1549,10 @@ bool PCM::discoverSystemTopology() BackendBoundSlots.resize(num_cores, 0); RetiringSlots.resize(num_cores, 0); AllSlotsRaw.resize(num_cores, 0); + MemBoundSlots.resize(num_cores, 0); + FetchLatSlots.resize(num_cores, 0); + BrMispredSlots.resize(num_cores, 0); + HeavyOpsSlots.resize(num_cores, 0); #if 0 std::cerr << "Socket reference cores:\n"; @@ -1602,7 +1620,12 @@ void PCM::printSystemTopology() const bool PCM::initMSR() { -#ifndef __APPLE__ +#ifdef __APPLE__ + for (size_t i=0; i < MSR.size(); ++i) + { + systemTopology->addMSRHandleToOSThread(MSR[i], (uint32)i); + } +#else try { for (int i = 0; i < (int)num_cores; ++i) @@ -3918,11 +3941,18 @@ PCM::ErrorCode PCM::programCoreCounters(const int i /* core */, { if (isFixedCounterSupported(3) && isHWTMAL1Supported() && perfSupportsTopDown()) { - const auto topDownEvents = { std::make_pair(perfSlotsPath, PERF_TOPDOWN_SLOTS_POS), + std::vector > topDownEvents = { std::make_pair(perfSlotsPath, PERF_TOPDOWN_SLOTS_POS), std::make_pair(perfBadSpecPath, PERF_TOPDOWN_BADSPEC_POS), std::make_pair(perfBackEndPath, PERF_TOPDOWN_BACKEND_POS), std::make_pair(perfFrontEndPath, PERF_TOPDOWN_FRONTEND_POS), std::make_pair(perfRetiringPath, PERF_TOPDOWN_RETIRING_POS)}; + if (isHWTMAL2Supported()) + { + topDownEvents.push_back(std::make_pair(perfMemBound, PERF_TOPDOWN_MEM_BOUND_POS)); + topDownEvents.push_back(std::make_pair(perfFetchLat, PERF_TOPDOWN_FETCH_LAT_POS)); + topDownEvents.push_back(std::make_pair(perfBrMispred, PERF_TOPDOWN_BR_MISPRED_POS)); + topDownEvents.push_back(std::make_pair(perfHeavyOps, PERF_TOPDOWN_HEAVY_OPS_POS)); + } int readPos = core_fixed_counter_num_used + core_gen_counter_num_used; leader_counter = -1; for (const auto & event : topDownEvents) @@ -5058,8 +5088,9 @@ void PCM::readPerfData(uint32 core, std::vector & outData) if (isHWTMAL1Supported() && perfSupportsTopDown()) { std::vector outTopDownData(outData.size(), 0); - readPerfDataHelper(core, outTopDownData, PERF_TOPDOWN_GROUP_LEADER_COUNTER, PERF_TOPDOWN_COUNTERS); - std::copy(outTopDownData.begin(), outTopDownData.begin() + PERF_TOPDOWN_COUNTERS, outData.begin() + core_fixed_counter_num_used + core_gen_counter_num_used); + const auto topdownCtrNum = isHWTMAL2Supported() ? PERF_TOPDOWN_COUNTERS : PERF_TOPDOWN_COUNTERS_L1; + readPerfDataHelper(core, outTopDownData, PERF_TOPDOWN_GROUP_LEADER_COUNTER, topdownCtrNum); + std::copy(outTopDownData.begin(), outTopDownData.begin() + topdownCtrNum, outData.begin() + core_fixed_counter_num_used + core_gen_counter_num_used); } } #endif @@ -5089,6 +5120,7 @@ void BasicCounterState::readAndAggregateTSC(std::shared_ptr msr) void BasicCounterState::readAndAggregate(std::shared_ptr msr) { + assert(msr.get()); uint64 cInstRetiredAny = 0, cCpuClkUnhaltedThread = 0, cCpuClkUnhaltedRef = 0; uint64 cL3Occupancy = 0; uint64 cCustomEvents[PERF_MAX_CUSTOM_COUNTERS] = {0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL }; @@ -5101,6 +5133,10 @@ void BasicCounterState::readAndAggregate(std::shared_ptr msr) uint64 cBackendBoundSlots = 0; uint64 cRetiringSlots = 0; uint64 cAllSlotsRaw = 0; + uint64 cMemBoundSlots = 0; + uint64 cFetchLatSlots = 0; + uint64 cBrMispredSlots = 0; + uint64 cHeavyOpsSlots = 0; const int32 core_id = msr->getCoreId(); TemporalThreadAffinity tempThreadAffinity(core_id); // speedup trick for Linux @@ -5130,7 +5166,7 @@ void BasicCounterState::readAndAggregate(std::shared_ptr msr) { cCustomEvents[i] = perfData[PCM::PERF_GEN_EVENT_0_POS + i]; } - if (m->isHWTMAL1Supported() && perfSupportsTopDown()) + if (m->isHWTMAL1Supported() && m->perfSupportsTopDown()) { cFrontendBoundSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_FRONTEND_POS]]; cBadSpeculationSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_BADSPEC_POS]]; @@ -5138,6 +5174,13 @@ void BasicCounterState::readAndAggregate(std::shared_ptr msr) cRetiringSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_RETIRING_POS]]; cAllSlotsRaw = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_SLOTS_POS]]; // if (core_id == 0) std::cout << "DEBUG: All: "<< cAllSlotsRaw << " FE: " << cFrontendBoundSlots << " BAD-SP: " << cBadSpeculationSlots << " BE: " << cBackendBoundSlots << " RET: " << cRetiringSlots << std::endl; + if (m->isHWTMAL2Supported()) + { + cMemBoundSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_MEM_BOUND_POS]]; + cFetchLatSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_FETCH_LAT_POS]]; + cBrMispredSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_BR_MISPRED_POS]];; + cHeavyOpsSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_HEAVY_OPS_POS]]; + } } } else @@ -5170,6 +5213,13 @@ void BasicCounterState::readAndAggregate(std::shared_ptr msr) cBadSpeculationSlots = extract_bits(perfMetrics, 8, 15); cBackendBoundSlots = extract_bits(perfMetrics, 24, 31); cRetiringSlots = extract_bits(perfMetrics, 0, 7); + if (m->isHWTMAL2Supported()) + { + cMemBoundSlots = extract_bits(perfMetrics, 32 + 3*8, 32 + 3*8 + 7); + cFetchLatSlots = extract_bits(perfMetrics, 32 + 2*8, 32 + 2*8 + 7); + cBrMispredSlots = extract_bits(perfMetrics, 32 + 1*8, 32 + 1*8 + 7); + cHeavyOpsSlots = extract_bits(perfMetrics, 32 + 0*8, 32 + 0*8 + 7); + } const double total = double(cFrontendBoundSlots + cBadSpeculationSlots + cBackendBoundSlots + cRetiringSlots); if (total != 0) { @@ -5177,6 +5227,13 @@ void BasicCounterState::readAndAggregate(std::shared_ptr msr) cBadSpeculationSlots = m->BadSpeculationSlots[core_id] += uint64((double(cBadSpeculationSlots) / total) * double(slots)); cBackendBoundSlots = m->BackendBoundSlots[core_id] += uint64((double(cBackendBoundSlots) / total) * double(slots)); cRetiringSlots = m->RetiringSlots[core_id] += uint64((double(cRetiringSlots) / total) * double(slots)); + if (m->isHWTMAL2Supported()) + { + cMemBoundSlots = m->MemBoundSlots[core_id] += uint64((double(cMemBoundSlots) / total) * double(slots)); + cFetchLatSlots = m->FetchLatSlots[core_id] += uint64((double(cFetchLatSlots) / total) * double(slots)); + cBrMispredSlots = m->BrMispredSlots[core_id] += uint64((double(cBrMispredSlots) / total) * double(slots)); + cHeavyOpsSlots = m->HeavyOpsSlots[core_id] += uint64((double(cHeavyOpsSlots) / total) * double(slots)); + } } cAllSlotsRaw = m->AllSlotsRaw[core_id] += slots; // std::cout << "DEBUG: "<< slots << " " << cFrontendBoundSlots << " " << cBadSpeculationSlots << " " << cBackendBoundSlots << " " << cRetiringSlots << std::endl; @@ -5250,6 +5307,10 @@ void BasicCounterState::readAndAggregate(std::shared_ptr msr) BackendBoundSlots += cBackendBoundSlots; RetiringSlots += cRetiringSlots; AllSlotsRaw += cAllSlotsRaw; + MemBoundSlots += cMemBoundSlots; + FetchLatSlots += cFetchLatSlots; + BrMispredSlots += cBrMispredSlots; + HeavyOpsSlots += cHeavyOpsSlots; if (freezeUnfreeze) { diff --git a/src/cpucounters.h b/src/cpucounters.h index 367d6939..8dffcad4 100644 --- a/src/cpucounters.h +++ b/src/cpucounters.h @@ -687,6 +687,7 @@ class PCM_API PCM bool forceRTMAbortMode; std::vector FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw; + std::vector MemBoundSlots, FetchLatSlots, BrMispredSlots, HeavyOpsSlots; bool isFixedCounterSupported(unsigned c); bool vm = false; bool linux_arch_perfmon = false; @@ -941,10 +942,14 @@ class PCM_API PCM PERF_TOPDOWN_FRONTEND_POS = PERF_TOPDOWN_SLOTS_POS + 1, PERF_TOPDOWN_BADSPEC_POS = PERF_TOPDOWN_SLOTS_POS + 2, PERF_TOPDOWN_BACKEND_POS = PERF_TOPDOWN_SLOTS_POS + 3, - PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4 + PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4, + PERF_TOPDOWN_MEM_BOUND_POS = PERF_TOPDOWN_SLOTS_POS + 5, + PERF_TOPDOWN_FETCH_LAT_POS = PERF_TOPDOWN_SLOTS_POS + 6, + PERF_TOPDOWN_BR_MISPRED_POS = PERF_TOPDOWN_SLOTS_POS + 7, + PERF_TOPDOWN_HEAVY_OPS_POS = PERF_TOPDOWN_SLOTS_POS + 8 }; - std::array perfTopDownPos; + std::array perfTopDownPos; enum { PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_POS, @@ -1091,12 +1096,25 @@ class PCM_API PCM void initUncorePMUsPerf(); bool isRDTDisabled() const; +#ifdef __linux__ + bool perfSupportsTopDown(); +#endif + public: static bool isInitialized() { return instance != nullptr; } //! check if TMA level 1 metrics are supported bool isHWTMAL1Supported() const; + //! check if TMA level 2 metrics are supported + bool isHWTMAL2Supported() const + { + return isHWTMAL1Supported() && + ( + SPR == cpu_model + ); + } + enum EventPosition { TOR_OCCUPANCY = 0, @@ -2596,6 +2614,22 @@ class BasicCounterState template friend double getRetiring(const CounterStateType & before, const CounterStateType & after); template + friend double getFetchLatencyBound(const CounterStateType & before, const CounterStateType & after); + template + friend double getFetchBandwidthBound(const CounterStateType & before, const CounterStateType & after); + template + friend double getBranchMispredictionBound(const CounterStateType & before, const CounterStateType & after); + template + friend double getMachineClearsBound(const CounterStateType & before, const CounterStateType & after); + template + friend double getMemoryBound(const CounterStateType & before, const CounterStateType & after); + template + friend double getCoreBound(const CounterStateType & before, const CounterStateType & after); + template + friend double getHeavyOperationsBound(const CounterStateType & before, const CounterStateType & after); + template + friend double getLightOperationsBound(const CounterStateType & before, const CounterStateType & after); + template friend uint64 getMSREvent(const uint64 & index, const PCM::MSRType & type, const CounterStateType& before, const CounterStateType& after); protected: checked_uint64 InstRetiredAny{}; @@ -2623,6 +2657,7 @@ class BasicCounterState uint64 MemoryBWTotal; uint64 SMICount; uint64 FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw; + uint64 MemBoundSlots, FetchLatSlots, BrMispredSlots, HeavyOpsSlots; std::unordered_map MSRValues; public: @@ -2637,7 +2672,11 @@ class BasicCounterState BadSpeculationSlots(0), BackendBoundSlots(0), RetiringSlots(0), - AllSlotsRaw(0) + AllSlotsRaw(0), + MemBoundSlots(0), + FetchLatSlots(0), + BrMispredSlots(0), + HeavyOpsSlots(0) { std::fill(CStateResidency, CStateResidency + PCM::MAX_C_STATE + 1, 0); } @@ -2671,11 +2710,19 @@ class BasicCounterState BackendBoundSlots += o.BackendBoundSlots; RetiringSlots += o.RetiringSlots; AllSlotsRaw += o.AllSlotsRaw; + MemBoundSlots += o.MemBoundSlots; + FetchLatSlots += o.FetchLatSlots; + BrMispredSlots += o.BrMispredSlots; + HeavyOpsSlots += o.HeavyOpsSlots; //std::cout << "after PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <= old.FrontendBoundSlots); assert(BadSpeculationSlots >= old.BadSpeculationSlots); assert(BackendBoundSlots >= old.BackendBoundSlots); assert(RetiringSlots >= old.RetiringSlots); + assert(MemBoundSlots >= old.MemBoundSlots); + assert(FetchLatSlots >= old.FetchLatSlots); + assert(BrMispredSlots >= old.BrMispredSlots); + assert(HeavyOpsSlots >= old.HeavyOpsSlots); return *this; } @@ -4536,6 +4583,24 @@ inline double getBackendBound(const CounterStateType & before, const CounterStat return 0.; } +//! \brief Returns unutilized pipeline slots where no uop was delivered due to stalls on buffer, cache or memory resources as range 0..1 +template +inline double getMemoryBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return double(after.MemBoundSlots - before.MemBoundSlots)/double(getAllSlots(before, after)); + return 0.; +} + +//! \brief Returns unutilized pipeline slots where no uop was delivered due to lack of core resources as range 0..1 +template +inline double getCoreBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return getBackendBound(before, after) - getMemoryBound(before, after); + return 0.; +} + //! \brief Returns unutilized pipeline slots where Front-end did not deliver a uop while back-end is ready as range 0..1 template inline double getFrontendBound(const CounterStateType & before, const CounterStateType & after) @@ -4546,6 +4611,24 @@ inline double getFrontendBound(const CounterStateType & before, const CounterSta return 0.; } +//! \brief Returns unutilized pipeline slots where Front-end due to fetch latency constraints did not deliver a uop while back-end is ready as range 0..1 +template +inline double getFetchLatencyBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return double(after.FetchLatSlots - before.FetchLatSlots)/double(getAllSlots(before, after)); + return 0.; +} + +//! \brief Returns unutilized pipeline slots where Front-end due to fetch bandwidth constraints did not deliver a uop while back-end is ready as range 0..1 +template +inline double getFetchBandwidthBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return getFrontendBound(before, after) - getFetchLatencyBound(before, after); + return 0.; +} + //! \brief Returns wasted pipeline slots due to incorrect speculation, covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1 template inline double getBadSpeculation(const CounterStateType & before, const CounterStateType & after) @@ -4556,6 +4639,24 @@ inline double getBadSpeculation(const CounterStateType & before, const CounterSt return 0.; } +//! \brief Returns wasted pipeline slots due to incorrect speculation (branch misprediction), covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1 +template +inline double getBranchMispredictionBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return double(after.BrMispredSlots - before.BrMispredSlots)/double(getAllSlots(before, after)); + return 0.; +} + +//! \brief Returns wasted pipeline slots due to incorrect speculation (machine clears), covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1 +template +inline double getMachineClearsBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return getBadSpeculation(before, after) - getBranchMispredictionBound(before, after); + return 0.; +} + //! \brief Returns pipeline slots utilized by uops that eventually retire (commit) template inline double getRetiring(const CounterStateType & before, const CounterStateType & after) @@ -4566,6 +4667,24 @@ inline double getRetiring(const CounterStateType & before, const CounterStateTyp return 0.; } +//! \brief Returns pipeline slots utilized by uops that eventually retire (commit) - heavy operations +template +inline double getHeavyOperationsBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return double(after.HeavyOpsSlots - before.HeavyOpsSlots)/double(getAllSlots(before, after)); + return 0.; +} + +//! \brief Returns pipeline slots utilized by uops that eventually retire (commit) - light operations +template +inline double getLightOperationsBound(const CounterStateType & before, const CounterStateType & after) +{ + if (PCM::getInstance()->isHWTMAL2Supported()) + return getRetiring(before, after) - getHeavyOperationsBound(before, after); + return 0.; +} + template inline std::vector getRegisterEvent(const PCM::RawEventEncoding& eventEnc, const ValuesType& beforeValues, const ValuesType& afterValues) { diff --git a/src/pcm-raw.cpp b/src/pcm-raw.cpp index 888592dc..7c61d27c 100644 --- a/src/pcm-raw.cpp +++ b/src/pcm-raw.cpp @@ -1179,7 +1179,8 @@ uint64 nullFixedMetricFunc(const uint32, const ServerUncoreCounterState&, const } const char* fixedCoreEventNames[] = { "InstructionsRetired" , "Cycles", "RefCycles", "TopDownSlots" }; -const char* topdownEventNames[] = { "PERF_METRICS.FRONTEND_BOUND" , "PERF_METRICS.BAD_SPECULATION", "PERF_METRICS.BACKEND_BOUND", "PERF_METRICS.RETIRING" }; +const char* topdownEventNames[] = { "PERF_METRICS.FRONTEND_BOUND" , "PERF_METRICS.BAD_SPECULATION", "PERF_METRICS.BACKEND_BOUND", "PERF_METRICS.RETIRING", + "PERF_METRICS.HEAVY_OPERATIONS", "PERF_METRICS.BRANCH_MISPREDICTS", "PERF_METRICS.FETCH_LATENCY", "PERF_METRICS.MEMORY_BOUND"}; constexpr uint32 PerfMetricsConfig = 2; constexpr uint64 PerfMetricsMask = 1ULL; constexpr uint64 maxPerfMetricsValue = 255ULL; @@ -1233,6 +1234,11 @@ enum MSRScope Package }; +uint32 numTMAEvents(PCM* m) +{ + return (m->isHWTMAL2Supported() ? 8 : 4); +} + void printTransposed(const PCM::RawPMUConfigs& curPMUConfigs, PCM* m, SystemCounterState& SysBeforeState, SystemCounterState& SysAfterState, @@ -1470,7 +1476,15 @@ void printTransposed(const PCM::RawPMUConfigs& curPMUConfigs, static FuncType funcTopDown[] = { [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getFrontendBound(before, after) * maxPerfMetricsValue); }, [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getBadSpeculation(before, after) * maxPerfMetricsValue); }, [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getBackendBound(before, after) * maxPerfMetricsValue); }, - [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getRetiring(before, after) * maxPerfMetricsValue); } + [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getRetiring(before, after) * maxPerfMetricsValue); }, + // "PERF_METRICS.HEAVY_OPERATIONS" : + [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getHeavyOperationsBound(before, after) * maxPerfMetricsValue); }, + // "PERF_METRICS.BRANCH_MISPREDICTS" : + [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getBranchMispredictionBound(before, after) * maxPerfMetricsValue); }, + // "PERF_METRICS.FETCH_LATENCY" : + [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getFetchLatencyBound(before, after) * maxPerfMetricsValue); }, + // "PERF_METRICS.MEMORY_BOUND" : + [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getMemoryBound(before, after) * maxPerfMetricsValue); } }; for (const auto& event : fixedEvents) { @@ -1488,7 +1502,7 @@ void printTransposed(const PCM::RawPMUConfigs& curPMUConfigs, if (cnt == 3 && (event.first[PerfMetricsConfig] & PerfMetricsMask)) { - for (uint32 t = 0; t < 4; ++t) + for (uint32 t = 0; t < numTMAEvents(m); ++t) { printRow(topdownEventNames[t], funcTopDown[t], BeforeState, AfterState, m, outputType, printOffset, coreType, type); } @@ -1766,7 +1780,11 @@ void print(const PCM::RawPMUConfigs& curPMUConfigs, uint64(getFrontendBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue), uint64(getBadSpeculation(BeforeState[core], AfterState[core]) * maxPerfMetricsValue), uint64(getBackendBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue), - uint64(getRetiring(BeforeState[core], AfterState[core]) * maxPerfMetricsValue) + uint64(getRetiring(BeforeState[core], AfterState[core]) * maxPerfMetricsValue), + uint64(getHeavyOperationsBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue), + uint64(getBranchMispredictionBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue), + uint64(getFetchLatencyBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue), + uint64(getMemoryBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue) }; for (const auto& event : fixedEvents) { @@ -1784,7 +1802,7 @@ void print(const PCM::RawPMUConfigs& curPMUConfigs, print(event.second.empty() ? fixedCoreEventNames[cnt] : event.second, fixedCtrValues[cnt]); if (cnt == 3 && (event.first[PerfMetricsConfig] & PerfMetricsMask)) { - for (uint32 t = 0; t < 4; ++t) + for (uint32 t = 0; t < numTMAEvents(m); ++t) { print(topdownEventNames[t], topdownCtrValues[t]); } diff --git a/src/pcm-sensor-server.cpp b/src/pcm-sensor-server.cpp index 6e7aa692..866c9724 100644 --- a/src/pcm-sensor-server.cpp +++ b/src/pcm-sensor-server.cpp @@ -3181,7 +3181,9 @@ void printHelpText( std::string const & programName ) { std::cout << " -p portnumber : Run on port (default port is " << DEFAULT_HTTP_PORT << ")\n"; std::cout << " -r|--reset : Reset programming of the performance counters.\n"; std::cout << " -D|--debug level : level = 0: no debug info, > 0 increase verbosity.\n"; +#ifndef __APPLE__ std::cout << " -R|--real-time : If possible the daemon will run with real time\n"; +#endif std::cout << " priority, could be useful under heavy load to \n"; std::cout << " stabilize the async counter fetching.\n"; #if defined (USE_SSL) @@ -3209,7 +3211,9 @@ int mainThrows(int argc, char * argv[]) { bool useSSL = false; #endif bool forcedProgramming = false; +#ifndef __APPLE__ bool useRealtimePriority = false; +#endif bool forceRTMAbortMode = false; unsigned short port = 0; unsigned short debug_level = 0; @@ -3270,10 +3274,12 @@ int mainThrows(int argc, char * argv[]) { throw std::runtime_error( "main: Error no debug level argument given" ); } } +#ifndef __APPLE__ else if ( check_argument_equals( argv[i], {"-R", "--real-time"} ) ) { useRealtimePriority = true; } +#endif else if ( check_argument_equals( argv[i], {"--help", "-h", "/h"} ) ) { printHelpText( argv[0] ); @@ -3397,6 +3403,7 @@ int mainThrows(int argc, char * argv[]) { } #endif +#ifndef __APPLE__ if ( useRealtimePriority ) { int priority = sched_get_priority_min( SCHED_RR ); if ( priority == -1 ) { @@ -3414,6 +3421,7 @@ int mainThrows(int argc, char * argv[]) { } } } +#endif pid_t pid; if ( daemonMode ) diff --git a/src/pcm.cpp b/src/pcm.cpp index b8880e8f..77a60cfd 100644 --- a/src/pcm.cpp +++ b/src/pcm.cpp @@ -317,13 +317,26 @@ void print_output(PCM * m, cout << "\n PHYSICAL CORE IPC : " << getCoreIPC(sstate1, sstate2) << " => corresponds to " << 100. * (getCoreIPC(sstate1, sstate2) / double(m->getMaxIPC())) << " % utilization for cores in active state"; cout << "\n Instructions per nominal CPU cycle: " << getTotalExecUsage(sstate1, sstate2) << " => corresponds to " << 100. * (getTotalExecUsage(sstate1, sstate2) / double(m->getMaxIPC())) << " % core utilization over time interval\n"; } - if (m->isHWTMAL1Supported()) + if (m->isHWTMAL2Supported()) + { + cout << " Pipeline stalls: Frontend (fetch latency: " << int(100. * getFetchLatencyBound(sstate1, sstate2)) <<" %, fetch bandwidth: " << int(100. * getFetchBandwidthBound(sstate1, sstate2)) << + " %)\n bad Speculation (branch misprediction: " << int(100. * getBranchMispredictionBound(sstate1, sstate2)) << + " %, machine clears: " << int(100. * getMachineClearsBound(sstate1, sstate2)) << + " %)\n Backend (buffer/cache/memory: " << int(100. * getMemoryBound(sstate1, sstate2)) << + " %, core: " << int(100. * getCoreBound(sstate1, sstate2)) << + " %)\n Retiring (heavy operations: " << int(100. * getHeavyOperationsBound(sstate1, sstate2)) << + " %, light operations: " << int(100. * getLightOperationsBound(sstate1, sstate2)) << " %)\n"; + } + else if (m->isHWTMAL1Supported()) { cout << " Pipeline stalls: Frontend bound: " << int(100. * getFrontendBound(sstate1, sstate2)) << " %, bad Speculation: " << int(100. * getBadSpeculation(sstate1, sstate2)) << " %, Backend bound: " << int(100. * getBackendBound(sstate1, sstate2)) << " %, Retiring: " << int(100. * getRetiring(sstate1, sstate2)) << " %\n"; + } + if (m->isHWTMAL1Supported()) + { std::vector TMAStackedBar; TMAStackedBar.push_back(StackedBarItem(getFrontendBound(sstate1, sstate2), "", 'F')); TMAStackedBar.push_back(StackedBarItem(getBadSpeculation(sstate1, sstate2), "", 'S')); @@ -332,6 +345,7 @@ void print_output(PCM * m, drawStackedBar(" Pipeline stall distribution ", TMAStackedBar, 80); cout << "\n"; } + cout << " SMI count: " << getSMICount(sstate1, sstate2) << "\n"; } @@ -555,6 +569,11 @@ void print_basic_metrics_csv_header(const PCM * m) cout << "L2MPI,"; if (m->isHWTMAL1Supported()) cout << "Frontend_bound(%),Bad_Speculation(%),Backend_Bound(%),Retiring(%),"; + if (m->isHWTMAL2Supported()) + { + cout << "Fetch_latency_bound(%),Fetch_bandwidth_bound(%),Branch_misprediction_bound(%),Machine_clears_bound(%)," + << "Buffer_Cache_Memory_bound(%),Core_bound(%),Heavy_operations_bound(%),Light_operations_bound(%),"; + } } void print_csv_header_helper(const string & header, int count=1){ @@ -582,6 +601,8 @@ void print_basic_metrics_csv_semicolons(const PCM * m, const string & header) print_csv_header_helper(header); // L2MPI; if (m->isHWTMAL1Supported()) print_csv_header_helper(header, 4); // Frontend_bound(%),Bad_Speculation(%),Backend_Bound(%),Retiring(%) + if (m->isHWTMAL2Supported()) + print_csv_header_helper(header, 8); } void print_csv_header(PCM * m, @@ -956,6 +977,17 @@ void print_basic_metrics_csv(const PCM * m, const State & state1, const State & cout << ',' << int(100. * getBackendBound(state1, state2)); cout << ',' << int(100. * getRetiring(state1, state2)); } + if (m->isHWTMAL2Supported()) + { + cout << ',' << int(100. * getFetchLatencyBound(state1, state2)); + cout << ',' << int(100. * getFetchBandwidthBound(state1, state2)); + cout << ',' << int(100. * getBranchMispredictionBound(state1, state2)); + cout << ',' << int(100. * getMachineClearsBound(state1, state2)); + cout << ',' << int(100. * getMemoryBound(state1, state2)); + cout << ',' << int(100. * getCoreBound(state1, state2)); + cout << ',' << int(100. * getHeavyOperationsBound(state1, state2)); + cout << ',' << int(100. * getLightOperationsBound(state1, state2)); + } if (print_last_semicolon) cout << ","; } diff --git a/src/types.h b/src/types.h index 884c219c..3f911d2e 100644 --- a/src/types.h +++ b/src/types.h @@ -62,7 +62,8 @@ constexpr auto IA32_PEBS_ENABLE_ADDR = 0x3F1; #define PERF_MAX_FIXED_COUNTERS (3) #define PERF_MAX_CUSTOM_COUNTERS (8) -#define PERF_TOPDOWN_COUNTERS (5) +#define PERF_TOPDOWN_COUNTERS_L1 (5) +#define PERF_TOPDOWN_COUNTERS (PERF_TOPDOWN_COUNTERS_L1 + 4) #define PERF_MAX_COUNTERS (PERF_MAX_FIXED_COUNTERS + PERF_MAX_CUSTOM_COUNTERS + PERF_TOPDOWN_COUNTERS) #define IA32_DEBUGCTL (0x1D9)