diff --git a/src/rocprof_compute_soc/analysis_configs/gfx906/1400_constant-cache.yaml b/src/rocprof_compute_soc/analysis_configs/gfx906/1400_constant-cache.yaml index b8a29a027..d4cac1c48 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx906/1400_constant-cache.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx906/1400_constant-cache.yaml @@ -2,6 +2,68 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Speed-of-Light: + Bandwidth: &SoL_Bandwidth_tip >- + The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the total sL1D cycles. + Cache Hit Rate: &SoL_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as + the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D-L2 BW: &SoL_sL1D-L2_Bandwidth_tip >- + The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak + theoretical sL1D → L2 cache bandwidth. Calculated as the ratio of the total number of + requests from the sL1D to the L2 cache over the total sL1D-L2 interface cycles. + Scalar L1D Cache Accesses: + Requests: &Accesses_Requests_tip >- + The total number of requests, of any size or type, made to the sL1D per normalization unit. + Hits: &Accesses_Hits_tip >- + The total number of sL1D requests that hit on a previously loaded cache line, per + normalization unit. + Misses - Non-Duplicated: &Accesses_Misses_Non-Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was not already pending due + to another request, per normalization unit. + Misses - Duplicated: &Accesses_Misses_Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was already pending due to + another request, per normalization unit. 
+ Cache Hit Rate: &Accesses_Cache_Hit_Rate_tip >- + Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + Read Requests (Total): &Accesses_Read_Requests_(Total)_tip >- + The total number of sL1D read requests of any size, per normalization unit. + Atomic Requests: &Accesses_Atomic_Requests_tip >- + The total number of sL1D atomic requests of any size, per normalization unit. Typically unused + on CDNA accelerators. + Read Requests (1 DWord): &Accesses_Read_Requests_(1_DWord)_tip >- + The total number of sL1D read requests made for a single dword of data (4B), per normalization + unit. + Read Requests (2 DWord): &Accesses_Read_Requests_(2_DWord)_tip >- + The total number of sL1D read requests made for two dwords of data + (8B), per normalization unit. + Read Requests (4 DWord): &Accesses_Read_Requests_(4_DWord)_tip >- + The total number of sL1D read requests made for four dwords of data + (16B), per normalization unit. + Read Requests (8 DWord): &Accesses_Read_Requests_(8_DWord)_tip >- + The total number of sL1D read requests made for eight dwords of data + (32B), per normalization unit. + Read Requests (16 DWord): &Accesses_Read_Requests_(16_DWord)_tip >- + The total number of sL1D read requests made for sixteen dwords of data + (64B), per normalization unit. + Scalar L1D Cache - L2 Interface: + sL1D-L2 Bandwidth: &sL1D-L2_Bandwidth_tip >- + The total number of bytes read from, written to, or atomically updated across the sL1D↔L2 + interface, per normalization unit. Note that sL1D writes and atomics are typically unused on + current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 + read bandwidth. + Read Requests: &sL1D-L2_Read_Requests_tip >- + The total number of read requests from sL1D to the L2, per normalization unit. 
+ Write Requests: &sL1D-L2_Write_Requests_tip >- + The total number of write requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Atomic Requests: &sL1D-L2_Atomic_Requests_tip >- + The total number of atomic requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Stall Cycles: &sL1D-L2_Stall_Cycles_tip >- + The total number of cycles the sL1D↔L2 interface was stalled, per normalization unit. # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -21,17 +83,17 @@ Panel Config: value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))) unit: Pct of Peak - tips: + tips: *SoL_Bandwidth_tip Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak - tips: + tips: *SoL_Cache_Hit_Rate_tip sL1D-L2 BW: value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) unit: Pct of Peak - tips: + tips: *SoL_sL1D-L2_Bandwidth_tip comparable: false # for now cli_style: simple_bar @@ -51,25 +113,25 @@ Panel Config: min: MIN((SQC_DCACHE_REQ / $denom)) max: MAX((SQC_DCACHE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Requests_tip Hits: avg: AVG((SQC_DCACHE_HITS / $denom)) min: MIN((SQC_DCACHE_HITS / $denom)) max: MAX((SQC_DCACHE_HITS / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Hits_tip Misses - Non Duplicated: avg: AVG((SQC_DCACHE_MISSES / $denom)) min: MIN((SQC_DCACHE_MISSES / $denom)) max: MAX((SQC_DCACHE_MISSES / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Non-Duplicated_tip Misses- Duplicated: avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) min: 
MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Duplicated_tip Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) @@ -81,7 +143,7 @@ Panel Config: + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: pct - tips: + tips: *Accesses_Cache_Hit_Rate_tip Read Req (Total): avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) @@ -90,43 +152,43 @@ Panel Config: max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(Total)_tip Atomic Req: avg: AVG((SQC_DCACHE_ATOMIC / $denom)) min: MIN((SQC_DCACHE_ATOMIC / $denom)) max: MAX((SQC_DCACHE_ATOMIC / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Atomic_Requests_tip Read Req (1 DWord): avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(1_DWord)_tip Read Req (2 DWord): avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(2_DWord)_tip Read Req (4 DWord): avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(4_DWord)_tip Read Req (8 DWord): avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) 
unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(8_DWord)_tip Read Req (16 DWord): avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(16_DWord)_tip - metric_table: id: 1403 @@ -144,28 +206,28 @@ Panel Config: min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) unit: (Bytes + $normUnit) - tips: + tips: *sL1D-L2_Bandwidth_tip Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) max: MAX((SQC_TC_DATA_READ_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Read_Requests_tip Write Req: avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Write_Requests_tip Atomic Req: avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Atomic_Requests_tip Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *sL1D-L2_Stall_Cycles_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml b/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml index b8a29a027..d4cac1c48 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml @@ -2,6 +2,68 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. 
Metric Description: + Speed-of-Light: + Bandwidth: &SoL_Bandwidth_tip >- + The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the total sL1D cycles. + Cache Hit Rate: &SoL_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as + the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D-L2 BW: &SoL_sL1D-L2_Bandwidth_tip >- + The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak + theoretical sL1D → L2 cache bandwidth. Calculated as the ratio of the total number of + requests from the sL1D to the L2 cache over the total sL1D-L2 interface cycles. + Scalar L1D Cache Accesses: + Requests: &Accesses_Requests_tip >- + The total number of requests, of any size or type, made to the sL1D per normalization unit. + Hits: &Accesses_Hits_tip >- + The total number of sL1D requests that hit on a previously loaded cache line, per + normalization unit. + Misses - Non-Duplicated: &Accesses_Misses_Non-Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was not already pending due + to another request, per normalization unit. + Misses - Duplicated: &Accesses_Misses_Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was already pending due to + another request, per normalization unit. + Cache Hit Rate: &Accesses_Cache_Hit_Rate_tip >- + Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + Read Requests (Total): &Accesses_Read_Requests_(Total)_tip >- + The total number of sL1D read requests of any size, per normalization unit. + Atomic Requests: &Accesses_Atomic_Requests_tip >- + The total number of sL1D atomic requests of any size, per normalization unit. 
Typically unused + on CDNA accelerators. + Read Requests (1 DWord): &Accesses_Read_Requests_(1_DWord)_tip >- + The total number of sL1D read requests made for a single dword of data (4B), per normalization + unit. + Read Requests (2 DWord): &Accesses_Read_Requests_(2_DWord)_tip >- + The total number of sL1D read requests made for two dwords of data + (8B), per normalization unit. + Read Requests (4 DWord): &Accesses_Read_Requests_(4_DWord)_tip >- + The total number of sL1D read requests made for four dwords of data + (16B), per normalization unit. + Read Requests (8 DWord): &Accesses_Read_Requests_(8_DWord)_tip >- + The total number of sL1D read requests made for eight dwords of data + (32B), per normalization unit. + Read Requests (16 DWord): &Accesses_Read_Requests_(16_DWord)_tip >- + The total number of sL1D read requests made for sixteen dwords of data + (64B), per normalization unit. + Scalar L1D Cache - L2 Interface: + sL1D-L2 Bandwidth: &sL1D-L2_Bandwidth_tip >- + The total number of bytes read from, written to, or atomically updated across the sL1D↔L2 + interface, per normalization unit. Note that sL1D writes and atomics are typically unused on + current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 + read bandwidth. + Read Requests: &sL1D-L2_Read_Requests_tip >- + The total number of read requests from sL1D to the L2, per normalization unit. + Write Requests: &sL1D-L2_Write_Requests_tip >- + The total number of write requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Atomic Requests: &sL1D-L2_Atomic_Requests_tip >- + The total number of atomic requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Stall Cycles: &sL1D-L2_Stall_Cycles_tip >- + The total number of cycles the sL1D↔L2 interface was stalled, per normalization unit. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -21,17 +83,17 @@ Panel Config: value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))) unit: Pct of Peak - tips: + tips: *SoL_Bandwidth_tip Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak - tips: + tips: *SoL_Cache_Hit_Rate_tip sL1D-L2 BW: value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) unit: Pct of Peak - tips: + tips: *SoL_sL1D-L2_Bandwidth_tip comparable: false # for now cli_style: simple_bar @@ -51,25 +113,25 @@ Panel Config: min: MIN((SQC_DCACHE_REQ / $denom)) max: MAX((SQC_DCACHE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Requests_tip Hits: avg: AVG((SQC_DCACHE_HITS / $denom)) min: MIN((SQC_DCACHE_HITS / $denom)) max: MAX((SQC_DCACHE_HITS / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Hits_tip Misses - Non Duplicated: avg: AVG((SQC_DCACHE_MISSES / $denom)) min: MIN((SQC_DCACHE_MISSES / $denom)) max: MAX((SQC_DCACHE_MISSES / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Non-Duplicated_tip Misses- Duplicated: avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Duplicated_tip Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) @@ -81,7 +143,7 @@ Panel Config: + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: pct - tips: + tips: *Accesses_Cache_Hit_Rate_tip Read Req (Total): avg: 
AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) @@ -90,43 +152,43 @@ Panel Config: max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(Total)_tip Atomic Req: avg: AVG((SQC_DCACHE_ATOMIC / $denom)) min: MIN((SQC_DCACHE_ATOMIC / $denom)) max: MAX((SQC_DCACHE_ATOMIC / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Atomic_Requests_tip Read Req (1 DWord): avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(1_DWord)_tip Read Req (2 DWord): avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(2_DWord)_tip Read Req (4 DWord): avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(4_DWord)_tip Read Req (8 DWord): avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(8_DWord)_tip Read Req (16 DWord): avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(16_DWord)_tip - metric_table: id: 1403 @@ -144,28 +206,28 @@ Panel Config: min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / 
$denom)) unit: (Bytes + $normUnit) - tips: + tips: *sL1D-L2_Bandwidth_tip Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) max: MAX((SQC_TC_DATA_READ_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Read_Requests_tip Write Req: avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Write_Requests_tip Atomic Req: avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Atomic_Requests_tip Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *sL1D-L2_Stall_Cycles_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml b/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml index b8a29a027..d4cac1c48 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml @@ -2,6 +2,68 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Speed-of-Light: + Bandwidth: &SoL_Bandwidth_tip >- + The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the total sL1D cycles. + Cache Hit Rate: &SoL_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as + the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D-L2 BW: &SoL_sL1D-L2_Bandwidth_tip >- + The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak + theoretical sL1D → L2 cache bandwidth. 
Calculated as the ratio of the total number of + requests from the sL1D to the L2 cache over the total sL1D-L2 interface cycles. + Scalar L1D Cache Accesses: + Requests: &Accesses_Requests_tip >- + The total number of requests, of any size or type, made to the sL1D per normalization unit. + Hits: &Accesses_Hits_tip >- + The total number of sL1D requests that hit on a previously loaded cache line, per + normalization unit. + Misses - Non-Duplicated: &Accesses_Misses_Non-Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was not already pending due + to another request, per normalization unit. + Misses - Duplicated: &Accesses_Misses_Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was already pending due to + another request, per normalization unit. + Cache Hit Rate: &Accesses_Cache_Hit_Rate_tip >- + Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + Read Requests (Total): &Accesses_Read_Requests_(Total)_tip >- + The total number of sL1D read requests of any size, per normalization unit. + Atomic Requests: &Accesses_Atomic_Requests_tip >- + The total number of sL1D atomic requests of any size, per normalization unit. Typically unused + on CDNA accelerators. + Read Requests (1 DWord): &Accesses_Read_Requests_(1_DWord)_tip >- + The total number of sL1D read requests made for a single dword of data (4B), per normalization + unit. + Read Requests (2 DWord): &Accesses_Read_Requests_(2_DWord)_tip >- + The total number of sL1D read requests made for two dwords of data + (8B), per normalization unit. + Read Requests (4 DWord): &Accesses_Read_Requests_(4_DWord)_tip >- + The total number of sL1D read requests made for four dwords of data + (16B), per normalization unit. 
+ Read Requests (8 DWord): &Accesses_Read_Requests_(8_DWord)_tip >- + The total number of sL1D read requests made for eight dwords of data + (32B), per normalization unit. + Read Requests (16 DWord): &Accesses_Read_Requests_(16_DWord)_tip >- + The total number of sL1D read requests made for sixteen dwords of data + (64B), per normalization unit. + Scalar L1D Cache - L2 Interface: + sL1D-L2 Bandwidth: &sL1D-L2_Bandwidth_tip >- + The total number of bytes read from, written to, or atomically updated across the sL1D↔L2 + interface, per normalization unit. Note that sL1D writes and atomics are typically unused on + current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 + read bandwidth. + Read Requests: &sL1D-L2_Read_Requests_tip >- + The total number of read requests from sL1D to the L2, per normalization unit. + Write Requests: &sL1D-L2_Write_Requests_tip >- + The total number of write requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Atomic Requests: &sL1D-L2_Atomic_Requests_tip >- + The total number of atomic requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Stall Cycles: &sL1D-L2_Stall_Cycles_tip >- + The total number of cycles the sL1D↔L2 interface was stalled, per normalization unit. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -21,17 +83,17 @@ Panel Config: value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))) unit: Pct of Peak - tips: + tips: *SoL_Bandwidth_tip Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak - tips: + tips: *SoL_Cache_Hit_Rate_tip sL1D-L2 BW: value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) unit: Pct of Peak - tips: + tips: *SoL_sL1D-L2_Bandwidth_tip comparable: false # for now cli_style: simple_bar @@ -51,25 +113,25 @@ Panel Config: min: MIN((SQC_DCACHE_REQ / $denom)) max: MAX((SQC_DCACHE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Requests_tip Hits: avg: AVG((SQC_DCACHE_HITS / $denom)) min: MIN((SQC_DCACHE_HITS / $denom)) max: MAX((SQC_DCACHE_HITS / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Hits_tip Misses - Non Duplicated: avg: AVG((SQC_DCACHE_MISSES / $denom)) min: MIN((SQC_DCACHE_MISSES / $denom)) max: MAX((SQC_DCACHE_MISSES / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Non-Duplicated_tip Misses- Duplicated: avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Duplicated_tip Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) @@ -81,7 +143,7 @@ Panel Config: + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: pct - tips: + tips: *Accesses_Cache_Hit_Rate_tip Read Req (Total): avg: 
AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) @@ -90,43 +152,43 @@ Panel Config: max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(Total)_tip Atomic Req: avg: AVG((SQC_DCACHE_ATOMIC / $denom)) min: MIN((SQC_DCACHE_ATOMIC / $denom)) max: MAX((SQC_DCACHE_ATOMIC / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Atomic_Requests_tip Read Req (1 DWord): avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(1_DWord)_tip Read Req (2 DWord): avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(2_DWord)_tip Read Req (4 DWord): avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(4_DWord)_tip Read Req (8 DWord): avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(8_DWord)_tip Read Req (16 DWord): avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(16_DWord)_tip - metric_table: id: 1403 @@ -144,28 +206,28 @@ Panel Config: min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / 
$denom)) unit: (Bytes + $normUnit) - tips: + tips: *sL1D-L2_Bandwidth_tip Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) max: MAX((SQC_TC_DATA_READ_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Read_Requests_tip Write Req: avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Write_Requests_tip Atomic Req: avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Atomic_Requests_tip Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *sL1D-L2_Stall_Cycles_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml b/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml index 669a5834b..d4cac1c48 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml @@ -2,6 +2,68 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Speed-of-Light: + Bandwidth: &SoL_Bandwidth_tip >- + The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the total sL1D cycles. + Cache Hit Rate: &SoL_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as + the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D-L2 BW: &SoL_sL1D-L2_Bandwidth_tip >- + The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak + theoretical sL1D → L2 cache bandwidth. 
Calculated as the ratio of the total number of + requests from the sL1D to the L2 cache over the total sL1D-L2 interface cycles. + Scalar L1D Cache Accesses: + Requests: &Accesses_Requests_tip >- + The total number of requests, of any size or type, made to the sL1D per normalization unit. + Hits: &Accesses_Hits_tip >- + The total number of sL1D requests that hit on a previously loaded cache line, per + normalization unit. + Misses - Non-Duplicated: &Accesses_Misses_Non-Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was not already pending due + to another request, per normalization unit. + Misses - Duplicated: &Accesses_Misses_Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was already pending due to + another request, per normalization unit. + Cache Hit Rate: &Accesses_Cache_Hit_Rate_tip >- + Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + Read Requests (Total): &Accesses_Read_Requests_(Total)_tip >- + The total number of sL1D read requests of any size, per normalization unit. + Atomic Requests: &Accesses_Atomic_Requests_tip >- + The total number of sL1D atomic requests of any size, per normalization unit. Typically unused + on CDNA accelerators. + Read Requests (1 DWord): &Accesses_Read_Requests_(1_DWord)_tip >- + The total number of sL1D read requests made for a single dword of data (4B), per normalization + unit. + Read Requests (2 DWord): &Accesses_Read_Requests_(2_DWord)_tip >- + The total number of sL1D read requests made for two dwords of data + (8B), per normalization unit. + Read Requests (4 DWord): &Accesses_Read_Requests_(4_DWord)_tip >- + The total number of sL1D read requests made for four dwords of data + (16B), per normalization unit. 
+ Read Requests (8 DWord): &Accesses_Read_Requests_(8_DWord)_tip >- + The total number of sL1D read requests made for eight dwords of data + (32B), per normalization unit. + Read Requests (16 DWord): &Accesses_Read_Requests_(16_DWord)_tip >- + The total number of sL1D read requests made for sixteen dwords of data + (64B), per normalization unit. + Scalar L1D Cache - L2 Interface: + sL1D-L2 Bandwidth: &sL1D-L2_Bandwidth_tip >- + The total number of bytes read from, written to, or atomically updated across the sL1D↔L2 + interface, per normalization unit. Note that sL1D writes and atomics are typically unused on + current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 + read bandwidth. + Read Requests: &sL1D-L2_Read_Requests_tip >- + The total number of read requests from sL1D to the L2, per normalization unit. + Write Requests: &sL1D-L2_Write_Requests_tip >- + The total number of write requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Atomic Requests: &sL1D-L2_Atomic_Requests_tip >- + The total number of atomic requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Stall Cycles: &sL1D-L2_Stall_Cycles_tip >- + The total number of cycles the sL1D↔L2 interface was stalled, per normalization unit. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -21,17 +83,17 @@ Panel Config: value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))) unit: Pct of Peak - tips: + tips: *SoL_Bandwidth_tip Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak - tips: + tips: *SoL_Cache_Hit_Rate_tip sL1D-L2 BW: value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) unit: Pct of Peak - tips: + tips: *SoL_sL1D-L2_Bandwidth_tip comparable: false # for now cli_style: simple_bar @@ -51,25 +113,25 @@ Panel Config: min: MIN((SQC_DCACHE_REQ / $denom)) max: MAX((SQC_DCACHE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Requests_tip Hits: avg: AVG((SQC_DCACHE_HITS / $denom)) min: MIN((SQC_DCACHE_HITS / $denom)) max: MAX((SQC_DCACHE_HITS / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Hits_tip Misses - Non Duplicated: avg: AVG((SQC_DCACHE_MISSES / $denom)) min: MIN((SQC_DCACHE_MISSES / $denom)) max: MAX((SQC_DCACHE_MISSES / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Non-Duplicated_tip Misses- Duplicated: avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Duplicated_tip Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) @@ -81,7 +143,7 @@ Panel Config: + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: pct - tips: + tips: *Accesses_Cache_Hit_Rate_tip Read Req (Total): avg: 
AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) @@ -90,43 +152,43 @@ Panel Config: max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(Total)_tip Atomic Req: avg: AVG((SQC_DCACHE_ATOMIC / $denom)) min: MIN((SQC_DCACHE_ATOMIC / $denom)) max: MAX((SQC_DCACHE_ATOMIC / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Atomic_Requests_tip Read Req (1 DWord): avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(1_DWord)_tip Read Req (2 DWord): avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(2_DWord)_tip Read Req (4 DWord): avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(4_DWord)_tip Read Req (8 DWord): avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(8_DWord)_tip Read Req (16 DWord): avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(16_DWord)_tip - metric_table: id: 1403 @@ -144,28 +206,28 @@ Panel Config: min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / 
$denom)) unit: (Bytes + $normUnit) - tips: + tips: *sL1D-L2_Bandwidth_tip Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) max: MAX((SQC_TC_DATA_READ_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Read_Requests_tip Write Req: avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Write_Requests_tip Atomic Req: avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Atomic_Requests_tip Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *sL1D-L2_Stall_Cycles_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml b/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml index 669a5834b..d4cac1c48 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml @@ -2,6 +2,68 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Speed-of-Light: + Bandwidth: &SoL_Bandwidth_tip >- + The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the total sL1D cycles. + Cache Hit Rate: &SoL_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as + the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D-L2 BW: &SoL_sL1D-L2_Bandwidth_tip >- + The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak + theoretical sL1D → L2 cache bandwidth. 
Calculated as the ratio of the total number of + requests from the sL1D to the L2 cache over the total sL1D-L2 interface cycles. + Scalar L1D Cache Accesses: + Requests: &Accesses_Requests_tip >- + The total number of requests, of any size or type, made to the sL1D per normalization unit. + Hits: &Accesses_Hits_tip >- + The total number of sL1D requests that hit on a previously loaded cache line, per + normalization unit. + Misses - Non-Duplicated: &Accesses_Misses_Non-Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was not already pending due + to another request, per normalization unit. + Misses - Duplicated: &Accesses_Misses_Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was already pending due to + another request, per normalization unit. + Cache Hit Rate: &Accesses_Cache_Hit_Rate_tip >- + Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + Read Requests (Total): &Accesses_Read_Requests_(Total)_tip >- + The total number of sL1D read requests of any size, per normalization unit. + Atomic Requests: &Accesses_Atomic_Requests_tip >- + The total number of sL1D atomic requests of any size, per normalization unit. Typically unused + on CDNA accelerators. + Read Requests (1 DWord): &Accesses_Read_Requests_(1_DWord)_tip >- + The total number of sL1D read requests made for a single dword of data (4B), per normalization + unit. + Read Requests (2 DWord): &Accesses_Read_Requests_(2_DWord)_tip >- + The total number of sL1D read requests made for two dwords of data + (8B), per normalization unit. + Read Requests (4 DWord): &Accesses_Read_Requests_(4_DWord)_tip >- + The total number of sL1D read requests made for four dwords of data + (16B), per normalization unit.
+ Read Requests (8 DWord): &Accesses_Read_Requests_(8_DWord)_tip >- + The total number of sL1D read requests made for eight dwords of data + (32B), per normalization unit. + Read Requests (16 DWord): &Accesses_Read_Requests_(16_DWord)_tip >- + The total number of sL1D read requests made for sixteen dwords of data + (64B), per normalization unit. + Scalar L1D Cache - L2 Interface: + sL1D-L2 Bandwidth: &sL1D-L2_Bandwidth_tip >- + The total number of bytes read from, written to, or atomically updated across the sL1D↔L2 + interface, per normalization unit. Note that sL1D writes and atomics are typically unused on + current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 + read bandwidth. + Read Requests: &sL1D-L2_Read_Requests_tip >- + The total number of read requests from sL1D to the L2, per normalization unit. + Write Requests: &sL1D-L2_Write_Requests_tip >- + The total number of write requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Atomic Requests: &sL1D-L2_Atomic_Requests_tip >- + The total number of atomic requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Stall Cycles: &sL1D-L2_Stall_Cycles_tip >- + The total number of cycles the sL1D↔L2 interface was stalled, per normalization unit. # Define the panel properties and properties of each metric in the panel.
Panel Config: @@ -21,17 +83,17 @@ Panel Config: value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))) unit: Pct of Peak - tips: + tips: *SoL_Bandwidth_tip Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak - tips: + tips: *SoL_Cache_Hit_Rate_tip sL1D-L2 BW: value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) unit: Pct of Peak - tips: + tips: *SoL_sL1D-L2_Bandwidth_tip comparable: false # for now cli_style: simple_bar @@ -51,25 +113,25 @@ Panel Config: min: MIN((SQC_DCACHE_REQ / $denom)) max: MAX((SQC_DCACHE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Requests_tip Hits: avg: AVG((SQC_DCACHE_HITS / $denom)) min: MIN((SQC_DCACHE_HITS / $denom)) max: MAX((SQC_DCACHE_HITS / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Hits_tip Misses - Non Duplicated: avg: AVG((SQC_DCACHE_MISSES / $denom)) min: MIN((SQC_DCACHE_MISSES / $denom)) max: MAX((SQC_DCACHE_MISSES / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Non-Duplicated_tip Misses- Duplicated: avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Duplicated_tip Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) @@ -81,7 +143,7 @@ Panel Config: + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: pct - tips: + tips: *Accesses_Cache_Hit_Rate_tip Read Req (Total): avg: 
AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) @@ -90,43 +152,43 @@ Panel Config: max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(Total)_tip Atomic Req: avg: AVG((SQC_DCACHE_ATOMIC / $denom)) min: MIN((SQC_DCACHE_ATOMIC / $denom)) max: MAX((SQC_DCACHE_ATOMIC / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Atomic_Requests_tip Read Req (1 DWord): avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(1_DWord)_tip Read Req (2 DWord): avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(2_DWord)_tip Read Req (4 DWord): avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(4_DWord)_tip Read Req (8 DWord): avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(8_DWord)_tip Read Req (16 DWord): avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(16_DWord)_tip - metric_table: id: 1403 @@ -144,28 +206,28 @@ Panel Config: min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / 
$denom)) unit: (Bytes + $normUnit) - tips: + tips: *sL1D-L2_Bandwidth_tip Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) max: MAX((SQC_TC_DATA_READ_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Read_Requests_tip Write Req: avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Write_Requests_tip Atomic Req: avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Atomic_Requests_tip Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *sL1D-L2_Stall_Cycles_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml b/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml index 669a5834b..d4cac1c48 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml @@ -2,6 +2,68 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Speed-of-Light: + Bandwidth: &SoL_Bandwidth_tip >- + The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the total sL1D cycles. + Cache Hit Rate: &SoL_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as + the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D-L2 BW: &SoL_sL1D-L2_Bandwidth_tip >- + The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak + theoretical sL1D → L2 cache bandwidth. 
Calculated as the ratio of the total number of + requests from the sL1D to the L2 cache over the total sL1D-L2 interface cycles. + Scalar L1D Cache Accesses: + Requests: &Accesses_Requests_tip >- + The total number of requests, of any size or type, made to the sL1D per normalization unit. + Hits: &Accesses_Hits_tip >- + The total number of sL1D requests that hit on a previously loaded cache line, per + normalization unit. + Misses - Non-Duplicated: &Accesses_Misses_Non-Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was not already pending due + to another request, per normalization unit. + Misses - Duplicated: &Accesses_Misses_Duplicated_tip >- + The total number of sL1D requests that missed on a cache line that was already pending due to + another request, per normalization unit. + Cache Hit Rate: &Accesses_Cache_Hit_Rate_tip >- + Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + Read Requests (Total): &Accesses_Read_Requests_(Total)_tip >- + The total number of sL1D read requests of any size, per normalization unit. + Atomic Requests: &Accesses_Atomic_Requests_tip >- + The total number of sL1D atomic requests of any size, per normalization unit. Typically unused + on CDNA accelerators. + Read Requests (1 DWord): &Accesses_Read_Requests_(1_DWord)_tip >- + The total number of sL1D read requests made for a single dword of data (4B), per normalization + unit. + Read Requests (2 DWord): &Accesses_Read_Requests_(2_DWord)_tip >- + The total number of sL1D read requests made for two dwords of data + (8B), per normalization unit. + Read Requests (4 DWord): &Accesses_Read_Requests_(4_DWord)_tip >- + The total number of sL1D read requests made for four dwords of data + (16B), per normalization unit.
+ Read Requests (8 DWord): &Accesses_Read_Requests_(8_DWord)_tip >- + The total number of sL1D read requests made for eight dwords of data + (32B), per normalization unit. + Read Requests (16 DWord): &Accesses_Read_Requests_(16_DWord)_tip >- + The total number of sL1D read requests made for sixteen dwords of data + (64B), per normalization unit. + Scalar L1D Cache - L2 Interface: + sL1D-L2 Bandwidth: &sL1D-L2_Bandwidth_tip >- + The total number of bytes read from, written to, or atomically updated across the sL1D↔L2 + interface, per normalization unit. Note that sL1D writes and atomics are typically unused on + current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 + read bandwidth. + Read Requests: &sL1D-L2_Read_Requests_tip >- + The total number of read requests from sL1D to the L2, per normalization unit. + Write Requests: &sL1D-L2_Write_Requests_tip >- + The total number of write requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Atomic Requests: &sL1D-L2_Atomic_Requests_tip >- + The total number of atomic requests from sL1D to the L2, per normalization unit. Typically + unused on current CDNA accelerators. + Stall Cycles: &sL1D-L2_Stall_Cycles_tip >- + The total number of cycles the sL1D↔L2 interface was stalled, per normalization unit. # Define the panel properties and properties of each metric in the panel.
Panel Config: @@ -21,17 +83,17 @@ Panel Config: value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))) unit: Pct of Peak - tips: + tips: *SoL_Bandwidth_tip Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak - tips: + tips: *SoL_Cache_Hit_Rate_tip sL1D-L2 BW: value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) unit: Pct of Peak - tips: + tips: *SoL_sL1D-L2_Bandwidth_tip comparable: false # for now cli_style: simple_bar @@ -51,25 +113,25 @@ Panel Config: min: MIN((SQC_DCACHE_REQ / $denom)) max: MAX((SQC_DCACHE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Requests_tip Hits: avg: AVG((SQC_DCACHE_HITS / $denom)) min: MIN((SQC_DCACHE_HITS / $denom)) max: MAX((SQC_DCACHE_HITS / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Hits_tip Misses - Non Duplicated: avg: AVG((SQC_DCACHE_MISSES / $denom)) min: MIN((SQC_DCACHE_MISSES / $denom)) max: MAX((SQC_DCACHE_MISSES / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Non-Duplicated_tip Misses- Duplicated: avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Misses_Duplicated_tip Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) @@ -81,7 +143,7 @@ Panel Config: + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: pct - tips: + tips: *Accesses_Cache_Hit_Rate_tip Read Req (Total): avg: 
AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) @@ -90,43 +152,43 @@ Panel Config: max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(Total)_tip Atomic Req: avg: AVG((SQC_DCACHE_ATOMIC / $denom)) min: MIN((SQC_DCACHE_ATOMIC / $denom)) max: MAX((SQC_DCACHE_ATOMIC / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Atomic_Requests_tip Read Req (1 DWord): avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(1_DWord)_tip Read Req (2 DWord): avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(2_DWord)_tip Read Req (4 DWord): avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(4_DWord)_tip Read Req (8 DWord): avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(8_DWord)_tip Read Req (16 DWord): avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) unit: (Req + $normUnit) - tips: + tips: *Accesses_Read_Requests_(16_DWord)_tip - metric_table: id: 1403 @@ -144,28 +206,28 @@ Panel Config: min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / 
$denom)) unit: (Bytes + $normUnit) - tips: + tips: *sL1D-L2_Bandwidth_tip Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) max: MAX((SQC_TC_DATA_READ_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Read_Requests_tip Write Req: avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Write_Requests_tip Atomic Req: avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) - tips: + tips: *sL1D-L2_Atomic_Requests_tip Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *sL1D-L2_Stall_Cycles_tip