diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 107136e6..050c5398 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,8 +16,8 @@ endif()
 if(UNIX)  # LINUX, FREE_BSD, APPLE
     if (NOT APPLE)
       set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} -s")  # --strip-unneeded for packaging
-      list(APPEND PROJECT_NAMES pcm-sensor-server)
     endif()
+    list(APPEND PROJECT_NAMES pcm-sensor-server)
     list(APPEND PROJECT_NAMES pcm-sensor)
 
     # libpcm.a
diff --git a/src/cpucounters.cpp b/src/cpucounters.cpp
index e2b4609e..0e7e36fa 100644
--- a/src/cpucounters.cpp
+++ b/src/cpucounters.cpp
@@ -889,8 +889,13 @@ constexpr auto perfBadSpecPath = "/sys/bus/event_source/devices/cpu/events/topdo
 constexpr auto perfBackEndPath = "/sys/bus/event_source/devices/cpu/events/topdown-be-bound";
 constexpr auto perfFrontEndPath = "/sys/bus/event_source/devices/cpu/events/topdown-fe-bound";
 constexpr auto perfRetiringPath = "/sys/bus/event_source/devices/cpu/events/topdown-retiring";
+// TMA level 2 (L2) extensions:
+constexpr auto perfBrMispred = "/sys/bus/event_source/devices/cpu/events/topdown-br-mispredict";
+constexpr auto perfFetchLat = "/sys/bus/event_source/devices/cpu/events/topdown-fetch-lat";
+constexpr auto perfHeavyOps = "/sys/bus/event_source/devices/cpu/events/topdown-heavy-ops";
+constexpr auto perfMemBound = "/sys/bus/event_source/devices/cpu/events/topdown-mem-bound";
 
-bool perfSupportsTopDown()
+bool PCM::perfSupportsTopDown()
 {
     static int yes = -1;
     if (-1 == yes)
@@ -900,7 +905,16 @@ bool perfSupportsTopDown()
         const auto be = readSysFS(perfBackEndPath, true);
         const auto fe = readSysFS(perfFrontEndPath, true);
         const auto ret = readSysFS(perfRetiringPath, true);
-        yes = (slots.size() && bad.size() && be.size() && fe.size() && ret.size()) ? 1 : 0;
+        bool supported = slots.size() && bad.size() && be.size() && fe.size() && ret.size(); // all level-1 events present
+        if (isHWTMAL2Supported()) // level-2 metrics additionally require their own sysfs event files
+        {
+            supported = supported &&
+                readSysFS(perfBrMispred, true).size() &&
+                readSysFS(perfFetchLat, true).size() &&
+                readSysFS(perfHeavyOps, true).size() &&
+                readSysFS(perfMemBound, true).size();
+        }
+        yes = supported ? 1 : 0; // cache the answer in the function-local static
     }
     return 1 == yes;
 }
@@ -1535,6 +1549,10 @@ bool PCM::discoverSystemTopology()
     BackendBoundSlots.resize(num_cores, 0);
     RetiringSlots.resize(num_cores, 0);
     AllSlotsRaw.resize(num_cores, 0);
+    MemBoundSlots.resize(num_cores, 0);
+    FetchLatSlots.resize(num_cores, 0);
+    BrMispredSlots.resize(num_cores, 0);
+    HeavyOpsSlots.resize(num_cores, 0);
 
 #if 0
     std::cerr << "Socket reference cores:\n";
@@ -1602,7 +1620,12 @@ void PCM::printSystemTopology() const
 
 bool PCM::initMSR()
 {
-#ifndef __APPLE__
+#ifdef __APPLE__
+    for (size_t i=0; i < MSR.size(); ++i)
+    {
+        systemTopology->addMSRHandleToOSThread(MSR[i], (uint32)i);
+    }
+#else
     try
     {
         for (int i = 0; i < (int)num_cores; ++i)
@@ -3918,11 +3941,18 @@ PCM::ErrorCode PCM::programCoreCounters(const int i /* core */,
     {
 	    if (isFixedCounterSupported(3) && isHWTMAL1Supported() && perfSupportsTopDown())
         {
-            const auto topDownEvents = {  std::make_pair(perfSlotsPath, PERF_TOPDOWN_SLOTS_POS),
+            std::vector<std::pair<const char*, int> > topDownEvents = {  std::make_pair(perfSlotsPath, PERF_TOPDOWN_SLOTS_POS),
                                           std::make_pair(perfBadSpecPath, PERF_TOPDOWN_BADSPEC_POS),
                                           std::make_pair(perfBackEndPath, PERF_TOPDOWN_BACKEND_POS),
                                           std::make_pair(perfFrontEndPath, PERF_TOPDOWN_FRONTEND_POS),
                                           std::make_pair(perfRetiringPath, PERF_TOPDOWN_RETIRING_POS)};
+            if (isHWTMAL2Supported())
+            {
+                topDownEvents.push_back(std::make_pair(perfMemBound, PERF_TOPDOWN_MEM_BOUND_POS));
+                topDownEvents.push_back(std::make_pair(perfFetchLat, PERF_TOPDOWN_FETCH_LAT_POS));
+                topDownEvents.push_back(std::make_pair(perfBrMispred, PERF_TOPDOWN_BR_MISPRED_POS));
+                topDownEvents.push_back(std::make_pair(perfHeavyOps, PERF_TOPDOWN_HEAVY_OPS_POS));
+            }
             int readPos = core_fixed_counter_num_used + core_gen_counter_num_used;
             leader_counter = -1;
             for (const auto & event : topDownEvents)
@@ -5058,8 +5088,9 @@ void PCM::readPerfData(uint32 core, std::vector<uint64> & outData)
     if (isHWTMAL1Supported() && perfSupportsTopDown())
     {
         std::vector<uint64> outTopDownData(outData.size(), 0);
-        readPerfDataHelper(core, outTopDownData, PERF_TOPDOWN_GROUP_LEADER_COUNTER, PERF_TOPDOWN_COUNTERS);
-        std::copy(outTopDownData.begin(), outTopDownData.begin() + PERF_TOPDOWN_COUNTERS, outData.begin() + core_fixed_counter_num_used + core_gen_counter_num_used);
+        const auto topdownCtrNum = isHWTMAL2Supported() ? PERF_TOPDOWN_COUNTERS : PERF_TOPDOWN_COUNTERS_L1;
+        readPerfDataHelper(core, outTopDownData, PERF_TOPDOWN_GROUP_LEADER_COUNTER, topdownCtrNum);
+        std::copy(outTopDownData.begin(), outTopDownData.begin() + topdownCtrNum, outData.begin() + core_fixed_counter_num_used + core_gen_counter_num_used);
     }
 }
 #endif
@@ -5089,6 +5120,7 @@ void BasicCounterState::readAndAggregateTSC(std::shared_ptr<SafeMsrHandle> msr)
 
 void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
 {
+    assert(msr.get());
     uint64 cInstRetiredAny = 0, cCpuClkUnhaltedThread = 0, cCpuClkUnhaltedRef = 0;
     uint64 cL3Occupancy = 0;
     uint64 cCustomEvents[PERF_MAX_CUSTOM_COUNTERS] = {0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL };
@@ -5101,6 +5133,10 @@ void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
     uint64 cBackendBoundSlots = 0;
     uint64 cRetiringSlots = 0;
     uint64 cAllSlotsRaw = 0;
+    uint64 cMemBoundSlots = 0;
+    uint64 cFetchLatSlots = 0;
+    uint64 cBrMispredSlots = 0;
+    uint64 cHeavyOpsSlots = 0;
     const int32 core_id = msr->getCoreId();
     TemporalThreadAffinity tempThreadAffinity(core_id); // speedup trick for Linux
 
@@ -5130,7 +5166,7 @@ void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
         {
             cCustomEvents[i] = perfData[PCM::PERF_GEN_EVENT_0_POS + i];
         }
-        if (m->isHWTMAL1Supported() && perfSupportsTopDown())
+        if (m->isHWTMAL1Supported() && m->perfSupportsTopDown())
         {
             cFrontendBoundSlots =   perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_FRONTEND_POS]];
             cBadSpeculationSlots =  perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_BADSPEC_POS]];
@@ -5138,6 +5174,13 @@ void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
             cRetiringSlots =        perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_RETIRING_POS]];
             cAllSlotsRaw =          perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_SLOTS_POS]];
 //          if (core_id == 0) std::cout << "DEBUG: All: "<< cAllSlotsRaw << " FE: " << cFrontendBoundSlots << " BAD-SP: " << cBadSpeculationSlots << " BE: " << cBackendBoundSlots << " RET: " << cRetiringSlots << std::endl;
+            if (m->isHWTMAL2Supported()) // level-2 counters are only part of the perf group when L2 is supported
+            {
+                cMemBoundSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_MEM_BOUND_POS]];
+                cFetchLatSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_FETCH_LAT_POS]];
+                cBrMispredSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_BR_MISPRED_POS]];
+                cHeavyOpsSlots = perfData[m->perfTopDownPos[PCM::PERF_TOPDOWN_HEAVY_OPS_POS]];
+            }
         }
     }
     else
@@ -5170,6 +5213,13 @@ void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
             cBadSpeculationSlots = extract_bits(perfMetrics, 8, 15);
             cBackendBoundSlots = extract_bits(perfMetrics, 24, 31);
             cRetiringSlots = extract_bits(perfMetrics, 0, 7);
+            if (m->isHWTMAL2Supported())
+            {
+                cMemBoundSlots = extract_bits(perfMetrics,  32 + 3*8, 32 + 3*8 + 7);
+                cFetchLatSlots = extract_bits(perfMetrics,  32 + 2*8, 32 + 2*8 + 7);
+                cBrMispredSlots = extract_bits(perfMetrics, 32 + 1*8, 32 + 1*8 + 7);
+                cHeavyOpsSlots = extract_bits(perfMetrics,    32 + 0*8, 32 + 0*8 + 7);
+            }
             const double total = double(cFrontendBoundSlots + cBadSpeculationSlots + cBackendBoundSlots + cRetiringSlots);
             if (total != 0)
             {
@@ -5177,6 +5227,13 @@ void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
                 cBadSpeculationSlots = m->BadSpeculationSlots[core_id] += uint64((double(cBadSpeculationSlots) / total) * double(slots));
                 cBackendBoundSlots = m->BackendBoundSlots[core_id] += uint64((double(cBackendBoundSlots) / total) * double(slots));
                 cRetiringSlots = m->RetiringSlots[core_id] += uint64((double(cRetiringSlots) / total) * double(slots));
+                if (m->isHWTMAL2Supported())
+                {
+                    cMemBoundSlots = m->MemBoundSlots[core_id] += uint64((double(cMemBoundSlots) / total) * double(slots));
+                    cFetchLatSlots = m->FetchLatSlots[core_id] += uint64((double(cFetchLatSlots) / total) * double(slots));
+                    cBrMispredSlots = m->BrMispredSlots[core_id] += uint64((double(cBrMispredSlots) / total) * double(slots));
+                    cHeavyOpsSlots = m->HeavyOpsSlots[core_id] += uint64((double(cHeavyOpsSlots) / total) * double(slots));
+                }
             }
             cAllSlotsRaw = m->AllSlotsRaw[core_id] += slots;
             // std::cout << "DEBUG: "<< slots << " " << cFrontendBoundSlots << " " << cBadSpeculationSlots << " " << cBackendBoundSlots << " " << cRetiringSlots << std::endl;
@@ -5250,6 +5307,10 @@ void BasicCounterState::readAndAggregate(std::shared_ptr<SafeMsrHandle> msr)
     BackendBoundSlots   += cBackendBoundSlots;
     RetiringSlots       += cRetiringSlots;
     AllSlotsRaw         += cAllSlotsRaw;
+    MemBoundSlots       += cMemBoundSlots;
+    FetchLatSlots       += cFetchLatSlots;
+    BrMispredSlots      += cBrMispredSlots;
+    HeavyOpsSlots       += cHeavyOpsSlots;
 
     if (freezeUnfreeze)
     {
diff --git a/src/cpucounters.h b/src/cpucounters.h
index 367d6939..8dffcad4 100644
--- a/src/cpucounters.h
+++ b/src/cpucounters.h
@@ -687,6 +687,7 @@ class PCM_API PCM
     bool forceRTMAbortMode;
 
     std::vector<uint64> FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
+    std::vector<uint64> MemBoundSlots, FetchLatSlots, BrMispredSlots, HeavyOpsSlots;
     bool isFixedCounterSupported(unsigned c);
     bool vm = false;
     bool linux_arch_perfmon = false;
@@ -941,10 +942,14 @@ class PCM_API PCM
         PERF_TOPDOWN_FRONTEND_POS = PERF_TOPDOWN_SLOTS_POS + 1,
         PERF_TOPDOWN_BADSPEC_POS = PERF_TOPDOWN_SLOTS_POS + 2,
         PERF_TOPDOWN_BACKEND_POS = PERF_TOPDOWN_SLOTS_POS + 3,
-        PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4
+        PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4,
+        PERF_TOPDOWN_MEM_BOUND_POS = PERF_TOPDOWN_SLOTS_POS + 5,
+        PERF_TOPDOWN_FETCH_LAT_POS = PERF_TOPDOWN_SLOTS_POS + 6,
+        PERF_TOPDOWN_BR_MISPRED_POS = PERF_TOPDOWN_SLOTS_POS + 7,
+        PERF_TOPDOWN_HEAVY_OPS_POS = PERF_TOPDOWN_SLOTS_POS + 8
     };
 
-    std::array<int, (PERF_TOPDOWN_RETIRING_POS + 1)> perfTopDownPos;
+    std::array<int, (PERF_TOPDOWN_HEAVY_OPS_POS + 1)> perfTopDownPos;
 
     enum {
         PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_POS,
@@ -1091,12 +1096,25 @@ class PCM_API PCM
     void initUncorePMUsPerf();
     bool isRDTDisabled() const;
 
+#ifdef __linux__
+    bool perfSupportsTopDown();
+#endif
+
 public:
     static bool isInitialized() { return instance != nullptr; }
 
     //! check if TMA level 1 metrics are supported
     bool isHWTMAL1Supported() const;
 
+    //! check if TMA level 2 metrics are supported
+    bool isHWTMAL2Supported() const
+    {
+        return isHWTMAL1Supported() &&
+                (
+                    SPR == cpu_model
+                );
+    }
+
     enum EventPosition
     {
         TOR_OCCUPANCY = 0,
@@ -2596,6 +2614,22 @@ class BasicCounterState
     template <class CounterStateType>
     friend double getRetiring(const CounterStateType & before, const CounterStateType & after);
     template <class CounterStateType>
+    friend double getFetchLatencyBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
+    friend double getFetchBandwidthBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
+    friend double getBranchMispredictionBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
+    friend double getMachineClearsBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
+    friend double getMemoryBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
+    friend double getCoreBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
+    friend double getHeavyOperationsBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
+    friend double getLightOperationsBound(const CounterStateType & before, const CounterStateType & after);
+    template <class CounterStateType>
     friend uint64 getMSREvent(const uint64 & index, const PCM::MSRType & type, const CounterStateType& before, const CounterStateType& after);
 protected:
     checked_uint64 InstRetiredAny{};
@@ -2623,6 +2657,7 @@ class BasicCounterState
     uint64 MemoryBWTotal;
     uint64 SMICount;
     uint64 FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
+    uint64 MemBoundSlots, FetchLatSlots, BrMispredSlots, HeavyOpsSlots;
     std::unordered_map<uint64, uint64> MSRValues;
 
 public:
@@ -2637,7 +2672,11 @@ class BasicCounterState
     BadSpeculationSlots(0),
     BackendBoundSlots(0),
     RetiringSlots(0),
-    AllSlotsRaw(0)
+    AllSlotsRaw(0),
+    MemBoundSlots(0),
+    FetchLatSlots(0),
+    BrMispredSlots(0),
+    HeavyOpsSlots(0)
     {
         std::fill(CStateResidency, CStateResidency + PCM::MAX_C_STATE + 1, 0);
     }
@@ -2671,11 +2710,19 @@ class BasicCounterState
         BackendBoundSlots += o.BackendBoundSlots;
         RetiringSlots += o.RetiringSlots;
         AllSlotsRaw += o.AllSlotsRaw;
+        MemBoundSlots += o.MemBoundSlots;
+        FetchLatSlots += o.FetchLatSlots;
+        BrMispredSlots += o.BrMispredSlots;
+        HeavyOpsSlots += o.HeavyOpsSlots;
         //std::cout << "after PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <<RetiringSlots << std::endl;
         assert(FrontendBoundSlots >= old.FrontendBoundSlots);
         assert(BadSpeculationSlots >= old.BadSpeculationSlots);
         assert(BackendBoundSlots >= old.BackendBoundSlots);
         assert(RetiringSlots >= old.RetiringSlots);
+        assert(MemBoundSlots >= old.MemBoundSlots);
+        assert(FetchLatSlots >= old.FetchLatSlots);
+        assert(BrMispredSlots >= old.BrMispredSlots);
+        assert(HeavyOpsSlots >= old.HeavyOpsSlots);
         return *this;
     }
 
@@ -4536,6 +4583,24 @@ inline double getBackendBound(const CounterStateType & before, const CounterStat
     return 0.;
 }
 
+//! \brief Returns unutilized pipeline slots where no uop was delivered due to stalls on buffer, cache or memory resources as range 0..1
+template <class CounterStateType>
+inline double getMemoryBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return double(after.MemBoundSlots - before.MemBoundSlots)/double(getAllSlots(before, after));
+    return 0.;
+}
+
+//! \brief Returns unutilized pipeline slots where no uop was delivered due to lack of core resources as range 0..1
+template <class CounterStateType>
+inline double getCoreBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return getBackendBound(before, after) - getMemoryBound(before, after);
+    return 0.;
+}
+
 //! \brief Returns unutilized pipeline slots where Front-end did not deliver a uop while back-end is ready as range 0..1
 template <class CounterStateType>
 inline double getFrontendBound(const CounterStateType & before, const CounterStateType & after)
@@ -4546,6 +4611,24 @@ inline double getFrontendBound(const CounterStateType & before, const CounterSta
     return 0.;
 }
 
+//! \brief Returns unutilized pipeline slots where Front-end due to fetch latency constraints did not deliver a uop while back-end is ready as range 0..1
+template <class CounterStateType>
+inline double getFetchLatencyBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return double(after.FetchLatSlots - before.FetchLatSlots)/double(getAllSlots(before, after));
+    return 0.;
+}
+
+//! \brief Returns unutilized pipeline slots where Front-end due to fetch bandwidth constraints did not deliver a uop while back-end is ready as range 0..1
+template <class CounterStateType>
+inline double getFetchBandwidthBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return getFrontendBound(before, after) - getFetchLatencyBound(before, after);
+    return 0.;
+}
+
 //! \brief Returns wasted pipeline slots due to incorrect speculation, covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1
 template <class CounterStateType>
 inline double getBadSpeculation(const CounterStateType & before, const CounterStateType & after)
@@ -4556,6 +4639,24 @@ inline double getBadSpeculation(const CounterStateType & before, const CounterSt
     return 0.;
 }
 
+//! \brief Returns wasted pipeline slots due to incorrect speculation (branch misprediction), covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1
+template <class CounterStateType>
+inline double getBranchMispredictionBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return double(after.BrMispredSlots - before.BrMispredSlots)/double(getAllSlots(before, after));
+    return 0.;
+}
+
+//! \brief Returns wasted pipeline slots due to incorrect speculation (machine clears), covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1
+template <class CounterStateType>
+inline double getMachineClearsBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return getBadSpeculation(before, after) - getBranchMispredictionBound(before, after);
+    return 0.;
+}
+
 //! \brief Returns pipeline slots utilized by uops that eventually retire (commit)
 template <class CounterStateType>
 inline double getRetiring(const CounterStateType & before, const CounterStateType & after)
@@ -4566,6 +4667,24 @@ inline double getRetiring(const CounterStateType & before, const CounterStateTyp
     return 0.;
 }
 
+//! \brief Returns pipeline slots utilized by uops that eventually retire (commit) - heavy operations
+template <class CounterStateType>
+inline double getHeavyOperationsBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return double(after.HeavyOpsSlots - before.HeavyOpsSlots)/double(getAllSlots(before, after));
+    return 0.;
+}
+
+//! \brief Returns pipeline slots utilized by uops that eventually retire (commit) - light operations
+template <class CounterStateType>
+inline double getLightOperationsBound(const CounterStateType & before, const CounterStateType & after)
+{
+    if (PCM::getInstance()->isHWTMAL2Supported())
+        return getRetiring(before, after) - getHeavyOperationsBound(before, after);
+    return 0.;
+}
+
 template <class ValuesType>
 inline std::vector<uint64> getRegisterEvent(const PCM::RawEventEncoding& eventEnc, const ValuesType& beforeValues, const ValuesType& afterValues)
 {
diff --git a/src/pcm-raw.cpp b/src/pcm-raw.cpp
index 888592dc..7c61d27c 100644
--- a/src/pcm-raw.cpp
+++ b/src/pcm-raw.cpp
@@ -1179,7 +1179,8 @@ uint64 nullFixedMetricFunc(const uint32, const ServerUncoreCounterState&, const
 }
 
 const char* fixedCoreEventNames[] = { "InstructionsRetired" , "Cycles", "RefCycles", "TopDownSlots" };
-const char* topdownEventNames[] = { "PERF_METRICS.FRONTEND_BOUND" , "PERF_METRICS.BAD_SPECULATION", "PERF_METRICS.BACKEND_BOUND", "PERF_METRICS.RETIRING" };
+const char* topdownEventNames[] = { "PERF_METRICS.FRONTEND_BOUND" , "PERF_METRICS.BAD_SPECULATION", "PERF_METRICS.BACKEND_BOUND", "PERF_METRICS.RETIRING",
+                                    "PERF_METRICS.HEAVY_OPERATIONS", "PERF_METRICS.BRANCH_MISPREDICTS", "PERF_METRICS.FETCH_LATENCY", "PERF_METRICS.MEMORY_BOUND"};
 constexpr uint32 PerfMetricsConfig = 2;
 constexpr uint64 PerfMetricsMask = 1ULL;
 constexpr uint64 maxPerfMetricsValue = 255ULL;
@@ -1233,6 +1234,11 @@ enum MSRScope
     Package
 };
 
+uint32 numTMAEvents(PCM* m)
+{
+    return (m->isHWTMAL2Supported() ? 8 : 4);
+}
+
 void printTransposed(const PCM::RawPMUConfigs& curPMUConfigs,
     PCM* m,
     SystemCounterState& SysBeforeState, SystemCounterState& SysAfterState,
@@ -1470,7 +1476,15 @@ void printTransposed(const PCM::RawPMUConfigs& curPMUConfigs,
                 static FuncType funcTopDown[] = { [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getFrontendBound(before, after) * maxPerfMetricsValue); },
                               [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getBadSpeculation(before, after) * maxPerfMetricsValue); },
                               [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getBackendBound(before, after) * maxPerfMetricsValue); },
-                              [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getRetiring(before, after) * maxPerfMetricsValue); }
+                              [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getRetiring(before, after) * maxPerfMetricsValue); },
+                              // "PERF_METRICS.HEAVY_OPERATIONS" :
+                              [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getHeavyOperationsBound(before, after) * maxPerfMetricsValue); },
+                              // "PERF_METRICS.BRANCH_MISPREDICTS" :
+                              [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getBranchMispredictionBound(before, after) * maxPerfMetricsValue); },
+                              // "PERF_METRICS.FETCH_LATENCY" :
+                              [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getFetchLatencyBound(before, after) * maxPerfMetricsValue); },
+                              // "PERF_METRICS.MEMORY_BOUND" :
+                              [](const CoreCounterState& before, const CoreCounterState& after) { return uint64(getMemoryBound(before, after) * maxPerfMetricsValue); }
                 };
                 for (const auto& event : fixedEvents)
                 {
@@ -1488,7 +1502,7 @@ void printTransposed(const PCM::RawPMUConfigs& curPMUConfigs,
 
                             if (cnt == 3 && (event.first[PerfMetricsConfig] & PerfMetricsMask))
                             {
-                                for (uint32 t = 0; t < 4; ++t)
+                                for (uint32 t = 0; t < numTMAEvents(m); ++t)
                                 {
                                     printRow(topdownEventNames[t], funcTopDown[t], BeforeState, AfterState, m, outputType, printOffset, coreType, type);
                                 }
@@ -1766,7 +1780,11 @@ void print(const PCM::RawPMUConfigs& curPMUConfigs,
                     uint64(getFrontendBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue),
                     uint64(getBadSpeculation(BeforeState[core], AfterState[core]) * maxPerfMetricsValue),
                     uint64(getBackendBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue),
-                    uint64(getRetiring(BeforeState[core], AfterState[core]) * maxPerfMetricsValue)
+                    uint64(getRetiring(BeforeState[core], AfterState[core]) * maxPerfMetricsValue),
+                    uint64(getHeavyOperationsBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue),
+                    uint64(getBranchMispredictionBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue),
+                    uint64(getFetchLatencyBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue),
+                    uint64(getMemoryBound(BeforeState[core], AfterState[core]) * maxPerfMetricsValue)
                 };
                 for (const auto& event : fixedEvents)
                 {
@@ -1784,7 +1802,7 @@ void print(const PCM::RawPMUConfigs& curPMUConfigs,
                             print(event.second.empty() ? fixedCoreEventNames[cnt] : event.second, fixedCtrValues[cnt]);
                             if (cnt == 3 && (event.first[PerfMetricsConfig] & PerfMetricsMask))
                             {
-                                for (uint32 t = 0; t < 4; ++t)
+                                for (uint32 t = 0; t < numTMAEvents(m); ++t)
                                 {
                                     print(topdownEventNames[t], topdownCtrValues[t]);
                                 }
diff --git a/src/pcm-sensor-server.cpp b/src/pcm-sensor-server.cpp
index 6e7aa692..866c9724 100644
--- a/src/pcm-sensor-server.cpp
+++ b/src/pcm-sensor-server.cpp
@@ -3181,7 +3181,9 @@ void printHelpText( std::string const & programName ) {
     std::cout << "    -p portnumber        : Run on port <portnumber> (default port is " << DEFAULT_HTTP_PORT << ")\n";
     std::cout << "    -r|--reset           : Reset programming of the performance counters.\n";
     std::cout << "    -D|--debug level     : level = 0: no debug info, > 0 increase verbosity.\n";
+#ifndef __APPLE__
     std::cout << "    -R|--real-time       : If possible the daemon will run with real time\n";
     std::cout << "                           priority, could be useful under heavy load to \n";
     std::cout << "                           stabilize the async counter fetching.\n";
+#endif // !__APPLE__: the guard spans the whole three-line -R entry so no orphaned continuation text is printed
 #if defined (USE_SSL)
@@ -3209,7 +3211,9 @@ int mainThrows(int argc, char * argv[]) {
     bool useSSL = false;
 #endif
     bool forcedProgramming = false;
+#ifndef __APPLE__
     bool useRealtimePriority = false;
+#endif
     bool forceRTMAbortMode = false;
     unsigned short port = 0;
     unsigned short debug_level = 0;
@@ -3270,10 +3274,12 @@ int mainThrows(int argc, char * argv[]) {
                     throw std::runtime_error( "main: Error no debug level argument given" );
                 }
             }
+#ifndef __APPLE__
             else if ( check_argument_equals( argv[i], {"-R", "--real-time"} ) )
             {
                 useRealtimePriority = true;
             }
+#endif
             else if ( check_argument_equals( argv[i], {"--help", "-h", "/h"} ) )
             {
                 printHelpText( argv[0] );
@@ -3397,6 +3403,7 @@ int mainThrows(int argc, char * argv[]) {
     }
 #endif
 
+#ifndef __APPLE__
     if ( useRealtimePriority ) {
         int priority = sched_get_priority_min( SCHED_RR );
         if ( priority == -1 ) {
@@ -3414,6 +3421,7 @@ int mainThrows(int argc, char * argv[]) {
             }
         }
     }
+#endif
 
     pid_t pid;
     if ( daemonMode )
diff --git a/src/pcm.cpp b/src/pcm.cpp
index b8880e8f..77a60cfd 100644
--- a/src/pcm.cpp
+++ b/src/pcm.cpp
@@ -317,13 +317,26 @@ void print_output(PCM * m,
             cout << "\n PHYSICAL CORE IPC                 : " << getCoreIPC(sstate1, sstate2) << " => corresponds to " << 100. * (getCoreIPC(sstate1, sstate2) / double(m->getMaxIPC())) << " % utilization for cores in active state";
             cout << "\n Instructions per nominal CPU cycle: " << getTotalExecUsage(sstate1, sstate2) << " => corresponds to " << 100. * (getTotalExecUsage(sstate1, sstate2) / double(m->getMaxIPC())) << " % core utilization over time interval\n";
         }
-        if (m->isHWTMAL1Supported())
+        if (m->isHWTMAL2Supported())
+        {
+            cout << " Pipeline stalls: Frontend (fetch latency: " << int(100. * getFetchLatencyBound(sstate1, sstate2)) <<" %, fetch bandwidth: " << int(100. * getFetchBandwidthBound(sstate1, sstate2)) <<
+                " %)\n                  bad Speculation (branch misprediction: " << int(100. * getBranchMispredictionBound(sstate1, sstate2)) <<
+                " %, machine clears: " << int(100. * getMachineClearsBound(sstate1, sstate2)) <<
+                " %)\n                  Backend (buffer/cache/memory: " << int(100. * getMemoryBound(sstate1, sstate2)) <<
+                " %, core: " << int(100. * getCoreBound(sstate1, sstate2)) <<
+                " %)\n                  Retiring (heavy operations: " << int(100. * getHeavyOperationsBound(sstate1, sstate2)) <<
+                " %, light operations: " << int(100. * getLightOperationsBound(sstate1, sstate2)) << " %)\n";
+        }
+        else if (m->isHWTMAL1Supported())
         {
             cout << " Pipeline stalls: Frontend bound: " << int(100. * getFrontendBound(sstate1, sstate2)) <<
                 " %, bad Speculation: " << int(100. * getBadSpeculation(sstate1, sstate2)) <<
                 " %, Backend bound: " << int(100. * getBackendBound(sstate1, sstate2)) <<
                 " %, Retiring: " << int(100. * getRetiring(sstate1, sstate2)) << " %\n";
+        }
 
+        if (m->isHWTMAL1Supported())
+        {
             std::vector<StackedBarItem> TMAStackedBar;
             TMAStackedBar.push_back(StackedBarItem(getFrontendBound(sstate1, sstate2), "", 'F'));
             TMAStackedBar.push_back(StackedBarItem(getBadSpeculation(sstate1, sstate2), "", 'S'));
@@ -332,6 +345,7 @@ void print_output(PCM * m,
             drawStackedBar(" Pipeline stall distribution ", TMAStackedBar, 80);
             cout << "\n";
         }
+
         cout << " SMI count: " << getSMICount(sstate1, sstate2) << "\n";
     }
 
@@ -555,6 +569,11 @@ void print_basic_metrics_csv_header(const PCM * m)
         cout << "L2MPI,";
     if (m->isHWTMAL1Supported())
         cout << "Frontend_bound(%),Bad_Speculation(%),Backend_Bound(%),Retiring(%),";
+    if (m->isHWTMAL2Supported())
+    {
+        cout << "Fetch_latency_bound(%),Fetch_bandwidth_bound(%),Branch_misprediction_bound(%),Machine_clears_bound(%),"
+             << "Buffer_Cache_Memory_bound(%),Core_bound(%),Heavy_operations_bound(%),Light_operations_bound(%),";
+    }
 }
 
 void print_csv_header_helper(const string & header, int count=1){
@@ -582,6 +601,8 @@ void print_basic_metrics_csv_semicolons(const PCM * m, const string & header)
         print_csv_header_helper(header);  // L2MPI;
     if (m->isHWTMAL1Supported())
         print_csv_header_helper(header, 4); // Frontend_bound(%),Bad_Speculation(%),Backend_Bound(%),Retiring(%)
+    if (m->isHWTMAL2Supported())
+        print_csv_header_helper(header, 8); // Fetch_latency_bound(%),Fetch_bandwidth_bound(%),Branch_misprediction_bound(%),Machine_clears_bound(%),Buffer_Cache_Memory_bound(%),Core_bound(%),Heavy_operations_bound(%),Light_operations_bound(%)
 }
 
 void print_csv_header(PCM * m,
@@ -956,6 +977,17 @@ void print_basic_metrics_csv(const PCM * m, const State & state1, const State &
         cout << ',' << int(100. * getBackendBound(state1, state2));
         cout << ',' << int(100. * getRetiring(state1, state2));
     }
+    if (m->isHWTMAL2Supported())
+    {
+        cout << ',' << int(100. * getFetchLatencyBound(state1, state2));
+        cout << ',' << int(100. * getFetchBandwidthBound(state1, state2));
+        cout << ',' << int(100. * getBranchMispredictionBound(state1, state2));
+        cout << ',' << int(100. * getMachineClearsBound(state1, state2));
+        cout << ',' << int(100. * getMemoryBound(state1, state2));
+        cout << ',' << int(100. * getCoreBound(state1, state2));
+        cout << ',' << int(100. * getHeavyOperationsBound(state1, state2));
+        cout << ',' << int(100. * getLightOperationsBound(state1, state2));
+    }
     if (print_last_semicolon)
         cout << ",";
 }
diff --git a/src/types.h b/src/types.h
index 884c219c..3f911d2e 100644
--- a/src/types.h
+++ b/src/types.h
@@ -62,7 +62,8 @@ constexpr auto IA32_PEBS_ENABLE_ADDR = 0x3F1;
 
 #define PERF_MAX_FIXED_COUNTERS          (3)
 #define PERF_MAX_CUSTOM_COUNTERS         (8)
-#define PERF_TOPDOWN_COUNTERS           (5)
+#define PERF_TOPDOWN_COUNTERS_L1        (5)
+#define PERF_TOPDOWN_COUNTERS           (PERF_TOPDOWN_COUNTERS_L1 + 4)
 #define PERF_MAX_COUNTERS               (PERF_MAX_FIXED_COUNTERS + PERF_MAX_CUSTOM_COUNTERS + PERF_TOPDOWN_COUNTERS)
 
 #define IA32_DEBUGCTL                   (0x1D9)