diff --git a/src/rocprof_compute_soc/analysis_configs/gfx906/1500_TA_and_TD.yaml b/src/rocprof_compute_soc/analysis_configs/gfx906/1500_TA_and_TD.yaml index f14fe2f69..9df2694da 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx906/1500_TA_and_TD.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx906/1500_TA_and_TD.yaml @@ -2,6 +2,78 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Address Processing Unit: + Busy: &TA_Busy_tip >- + Percent of the total CU cycles the address processor was busy. + Address Stall: &TA_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending address requests + further into the vL1D pipeline. + Data Stall: &TA_Data_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending write/atomic + data further into the vL1D pipeline. + Data-Processor → Address Stall: &TA_Data-Processor_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled waiting to send command data to + the data processor. + Total Instructions: &TA_Total_Instructions_tip The total number of vL1D memory instructions executed on all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: &TA_Global_Generic_Instructions_tip >- + The total number of global and generic memory instructions executed on all compute units on + the accelerator, per normalization unit. + Global/Generic Read Instructions: &TA_Global_Generic_Read_tip >- + The total number of global and generic memory read instructions executed on all compute units + on the accelerator, per normalization unit. + Global/Generic Write Instructions: &TA_Global_Generic_Write_tip >- + The total number of global and generic memory write instructions executed on all compute units + on the accelerator, per normalization unit. 
+ Global/Generic Atomic Instructions: &TA_Global_Generic_Atomic_tip >- + The total number of global and generic memory atomic (with and without return) instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Instructions: &TA_Spill_Stack_Instructions_tip >- + The total number of spill/stack memory instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Read Instructions: &TA_Spill_Stack_Read_tip >- + The total number of spill/stack memory read instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Write Instructions: &TA_Spill_Stack_Write_tip >- + The total number of spill/stack memory write instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Atomic Instructions: &TA_Spill_Stack_Atomic_tip >- + The total number of spill/stack memory atomic (with and without return) instructions executed + on all compute units on the accelerator, per normalization unit. Typically unused, as these + memory operations are generally used to implement thread-local storage. + Spill/Stack Total Cycles: &TA_Spill_Stack_Total_Cycles_tip >- + The number of cycles the address processing unit spent working on spill/stack instructions, + per normalization unit. + Spill/Stack Coalesced Read: &TA_Spill_Stack_Coalesced_Read_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack read + instructions, per normalization unit. + Spill/Stack Coalesced Write: &TA_Spill_Stack_Coalesced_Write_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack write + instructions, per normalization unit. + Data-Return Path: + Data-Return Busy: &TD_Busy_tip >- + Percent of the total CU cycles the data-return unit was busy processing or waiting on data to + return to the CU. 
+ Cache RAM → Data-Return Stall: &TD_Cache_RAM_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled on data to be returned from + the vL1D Cache RAM. + Workgroup Manager → Data-Return Stall: &TD_Workgroup_Manager_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due + to initialization of registers as a part of launching new workgroups. + Coalescable Instructions: &TD_Coalescable_Instructions_tip >- + The number of instructions submitted to the data-return unit by the address processor that + were found to be coalescable, per normalization unit. + Read Instructions: &TD_Read_tip >- + The number of read instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack reads in the address processor. + Write Instructions: &TD_Write_tip >- + The number of store instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack stores counted by the vL1D cache-front-end. + Atomic Instructions: &TD_Atomic_tip >- + The number of atomic instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack atomics in the address processor. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -24,97 +96,97 @@ Panel Config: min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Busy_tip Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Address_Stall_tip Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data_Stall_tip Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data-Processor_Address_Stall_tip Total Instructions: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Total_Instructions_tip Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Instructions_tip Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions 
+ $normUnit) - tips: + tips: *TA_Global_Generic_Read_tip Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Write_tip Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Atomic_tip Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Instructions_tip Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Read_tip Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Write_tip Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Atomic_tip Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Total_Cycles_tip Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum 
/ $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Read_tip Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Write_tip - metric_table: id: 1502 @@ -132,13 +204,13 @@ Panel Config: min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Busy_tip Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Cache_RAM_Data-Return_Stall_tip Workgroup manager → Data-Return Stall: avg: # No perf counter min: # No perf counter @@ -150,7 +222,7 @@ Panel Config: min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Coalescable_Instructions_tip Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) @@ -159,16 +231,16 @@ Panel Config: max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Read_tip Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Write_tip Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) 
unit: (Instructions + $normUnit) - tips: + tips: *TD_Atomic_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml b/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml index f14fe2f69..9df2694da 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml @@ -2,6 +2,78 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Address Processing Unit: + Busy: &TA_Busy_tip >- + Percent of the total CU cycles the address processor was busy. + Address Stall: &TA_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending address requests + further into the vL1D pipeline. + Data Stall: &TA_Data_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending write/atomic + data further into the vL1D pipeline. + Data-Processor → Address Stall: &TA_Data-Processor_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled waiting to send command data to + the data processor. + Total Instructions: &TA_Total_Instructions_tip The total number of vL1D memory instructions executed on all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: &TA_Global_Generic_Instructions_tip >- + The total number of global and generic memory instructions executed on all compute units on + the accelerator, per normalization unit. + Global/Generic Read Instructions: &TA_Global_Generic_Read_tip >- + The total number of global and generic memory read instructions executed on all compute units + on the accelerator, per normalization unit. + Global/Generic Write Instructions: &TA_Global_Generic_Write_tip >- + The total number of global and generic memory write instructions executed on all compute units + on the accelerator, per normalization unit. 
+ Global/Generic Atomic Instructions: &TA_Global_Generic_Atomic_tip >- + The total number of global and generic memory atomic (with and without return) instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Instructions: &TA_Spill_Stack_Instructions_tip >- + The total number of spill/stack memory instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Read Instructions: &TA_Spill_Stack_Read_tip >- + The total number of spill/stack memory read instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Write Instructions: &TA_Spill_Stack_Write_tip >- + The total number of spill/stack memory write instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Atomic Instructions: &TA_Spill_Stack_Atomic_tip >- + The total number of spill/stack memory atomic (with and without return) instructions executed + on all compute units on the accelerator, per normalization unit. Typically unused, as these + memory operations are generally used to implement thread-local storage. + Spill/Stack Total Cycles: &TA_Spill_Stack_Total_Cycles_tip >- + The number of cycles the address processing unit spent working on spill/stack instructions, + per normalization unit. + Spill/Stack Coalesced Read: &TA_Spill_Stack_Coalesced_Read_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack read + instructions, per normalization unit. + Spill/Stack Coalesced Write: &TA_Spill_Stack_Coalesced_Write_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack write + instructions, per normalization unit. + Data-Return Path: + Data-Return Busy: &TD_Busy_tip >- + Percent of the total CU cycles the data-return unit was busy processing or waiting on data to + return to the CU. 
+ Cache RAM → Data-Return Stall: &TD_Cache_RAM_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled on data to be returned from + the vL1D Cache RAM. + Workgroup Manager → Data-Return Stall: &TD_Workgroup_Manager_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due + to initialization of registers as a part of launching new workgroups. + Coalescable Instructions: &TD_Coalescable_Instructions_tip >- + The number of instructions submitted to the data-return unit by the address processor that + were found to be coalescable, per normalization unit. + Read Instructions: &TD_Read_tip >- + The number of read instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack reads in the address processor. + Write Instructions: &TD_Write_tip >- + The number of store instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack stores counted by the vL1D cache-front-end. + Atomic Instructions: &TD_Atomic_tip >- + The number of atomic instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack atomics in the address processor. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -24,97 +96,97 @@ Panel Config: min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Busy_tip Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Address_Stall_tip Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data_Stall_tip Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data-Processor_Address_Stall_tip Total Instructions: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Total_Instructions_tip Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Instructions_tip Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions 
+ $normUnit) - tips: + tips: *TA_Global_Generic_Read_tip Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Write_tip Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Atomic_tip Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Instructions_tip Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Read_tip Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Write_tip Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Atomic_tip Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Total_Cycles_tip Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum 
/ $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Read_tip Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Write_tip - metric_table: id: 1502 @@ -132,13 +204,13 @@ Panel Config: min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Busy_tip Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Cache_RAM_Data-Return_Stall_tip Workgroup manager → Data-Return Stall: avg: # No perf counter min: # No perf counter @@ -150,7 +222,7 @@ Panel Config: min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Coalescable_Instructions_tip Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) @@ -159,16 +231,16 @@ Panel Config: max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Read_tip Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Write_tip Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) 
unit: (Instructions + $normUnit) - tips: + tips: *TD_Atomic_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml b/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml index b14fccbcb..1e4247205 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml @@ -2,6 +2,78 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Address Processing Unit: + Busy: &TA_Busy_tip >- + Percent of the total CU cycles the address processor was busy. + Address Stall: &TA_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending address requests + further into the vL1D pipeline. + Data Stall: &TA_Data_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending write/atomic + data further into the vL1D pipeline. + Data-Processor → Address Stall: &TA_Data-Processor_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled waiting to send command data to + the data processor. + Total Instructions: &TA_Total_Instructions_tip The total number of vL1D memory instructions executed on all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: &TA_Global_Generic_Instructions_tip >- + The total number of global and generic memory instructions executed on all compute units on + the accelerator, per normalization unit. + Global/Generic Read Instructions: &TA_Global_Generic_Read_tip >- + The total number of global and generic memory read instructions executed on all compute units + on the accelerator, per normalization unit. + Global/Generic Write Instructions: &TA_Global_Generic_Write_tip >- + The total number of global and generic memory write instructions executed on all compute units + on the accelerator, per normalization unit. 
+ Global/Generic Atomic Instructions: &TA_Global_Generic_Atomic_tip >- + The total number of global and generic memory atomic (with and without return) instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Instructions: &TA_Spill_Stack_Instructions_tip >- + The total number of spill/stack memory instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Read Instructions: &TA_Spill_Stack_Read_tip >- + The total number of spill/stack memory read instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Write Instructions: &TA_Spill_Stack_Write_tip >- + The total number of spill/stack memory write instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Atomic Instructions: &TA_Spill_Stack_Atomic_tip >- + The total number of spill/stack memory atomic (with and without return) instructions executed + on all compute units on the accelerator, per normalization unit. Typically unused, as these + memory operations are generally used to implement thread-local storage. + Spill/Stack Total Cycles: &TA_Spill_Stack_Total_Cycles_tip >- + The number of cycles the address processing unit spent working on spill/stack instructions, + per normalization unit. + Spill/Stack Coalesced Read: &TA_Spill_Stack_Coalesced_Read_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack read + instructions, per normalization unit. + Spill/Stack Coalesced Write: &TA_Spill_Stack_Coalesced_Write_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack write + instructions, per normalization unit. + Data-Return Path: + Data-Return Busy: &TD_Busy_tip >- + Percent of the total CU cycles the data-return unit was busy processing or waiting on data to + return to the CU. 
+ Cache RAM → Data-Return Stall: &TD_Cache_RAM_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled on data to be returned from + the vL1D Cache RAM. + Workgroup Manager → Data-Return Stall: &TD_Workgroup_Manager_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due + to initialization of registers as a part of launching new workgroups. + Coalescable Instructions: &TD_Coalescable_Instructions_tip >- + The number of instructions submitted to the data-return unit by the address processor that + were found to be coalescable, per normalization unit. + Read Instructions: &TD_Read_tip >- + The number of read instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack reads in the address processor. + Write Instructions: &TD_Write_tip >- + The number of store instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack stores counted by the vL1D cache-front-end. + Atomic Instructions: &TD_Atomic_tip >- + The number of atomic instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack atomics in the address processor. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -24,97 +96,97 @@ Panel Config: min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Busy_tip Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Address_Stall_tip Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data_Stall_tip Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data-Processor_Address_Stall_tip Total Instructions: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Total_Instructions_tip Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Instructions_tip Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions 
+ $normUnit) - tips: + tips: *TA_Global_Generic_Read_tip Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Write_tip Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Atomic_tip Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Instructions_tip Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Read_tip Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Write_tip Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Atomic_tip Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Total_Cycles_tip Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum 
/ $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Read_tip Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Write_tip - metric_table: id: 1502 @@ -132,25 +204,25 @@ Panel Config: min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Busy_tip Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Cache_RAM_Data-Return_Stall_tip Workgroup manager → Data-Return Stall: avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Workgroup_Manager_Data-Return_Stall_tip Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Coalescable_Instructions_tip Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) @@ -159,16 +231,16 @@ Panel Config: max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Read_tip Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) 
min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Write_tip Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Atomic_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml b/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml index 5283eca3a..1e4247205 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml @@ -2,6 +2,78 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Address Processing Unit: + Busy: &TA_Busy_tip >- + Percent of the total CU cycles the address processor was busy. + Address Stall: &TA_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending address requests + further into the vL1D pipeline. + Data Stall: &TA_Data_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending write/atomic + data further into the vL1D pipeline. + Data-Processor → Address Stall: &TA_Data-Processor_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled waiting to send command data to + the data processor. + Total Instructions: &TA_Total_Instructions_tip The total number of vL1D memory instructions executed on all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: &TA_Global_Generic_Instructions_tip >- + The total number of global and generic memory instructions executed on all compute units on + the accelerator, per normalization unit. + Global/Generic Read Instructions: &TA_Global_Generic_Read_tip >- + The total number of global and generic memory read instructions executed on all compute units + on the accelerator, per normalization unit.
+ Global/Generic Write Instructions: &TA_Global_Generic_Write_tip >- + The total number of global and generic memory write instructions executed on all compute units + on the accelerator, per normalization unit. + Global/Generic Atomic Instructions: &TA_Global_Generic_Atomic_tip >- + The total number of global and generic memory atomic (with and without return) instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Instructions: &TA_Spill_Stack_Instructions_tip >- + The total number of spill/stack memory instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Read Instructions: &TA_Spill_Stack_Read_tip >- + The total number of spill/stack memory read instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Write Instructions: &TA_Spill_Stack_Write_tip >- + The total number of spill/stack memory write instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Atomic Instructions: &TA_Spill_Stack_Atomic_tip >- + The total number of spill/stack memory atomic (with and without return) instructions executed + on all compute units on the accelerator, per normalization unit. Typically unused, as these + memory operations are mainly used to implement thread-local storage. + Spill/Stack Total Cycles: &TA_Spill_Stack_Total_Cycles_tip >- + The number of cycles the address processing unit spent working on spill/stack instructions, + per normalization unit. + Spill/Stack Coalesced Read: &TA_Spill_Stack_Coalesced_Read_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack read + instructions, per normalization unit. + Spill/Stack Coalesced Write: &TA_Spill_Stack_Coalesced_Write_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack write + instructions, per normalization unit.
+ Data-Return Path: + Data-Return Busy: &TD_Busy_tip >- + Percent of the total CU cycles the data-return unit was busy processing or waiting on data to + return to the CU. + Cache RAM → Data-Return Stall: &TD_Cache_RAM_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled on data to be returned from + the vL1D Cache RAM. + Workgroup Manager → Data-Return Stall: &TD_Workgroup_Manager_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due + to initialization of registers as a part of launching new workgroups. + Coalescable Instructions: &TD_Coalescable_Instructions_tip >- + The number of instructions submitted to the data-return unit by the address processor that + were found to be coalescable, per normalization unit. + Read Instructions: &TD_Read_tip >- + The number of read instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack reads in the address processor. + Write Instructions: &TD_Write_tip >- + The number of store instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack stores counted by the vL1D cache-front-end. + Atomic Instructions: &TD_Atomic_tip >- + The number of atomic instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack atomics in the address processor. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -24,97 +96,97 @@ Panel Config: min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Busy_tip Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Address_Stall_tip Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data_Stall_tip Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data-Processor_Address_Stall_tip Total Instructions: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Total_Instructions_tip Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Instructions_tip Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions 
+ $normUnit) - tips: + tips: *TA_Global_Generic_Read_tip Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Write_tip Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Atomic_tip Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Instructions_tip Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Read_tip Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Write_tip Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Atomic_tip Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Total_Cycles_tip Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum 
/ $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Read_tip Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Write_tip - metric_table: id: 1502 @@ -132,25 +204,25 @@ Panel Config: min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Busy_tip Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Cache_RAM_Data-Return_Stall_tip Workgroup manager → Data-Return Stall: avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Workgroup_Manager_Data-Return_Stall_tip Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Coalescable_Instructions_tip Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) @@ -159,16 +231,16 @@ Panel Config: max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Read_tip Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) 
min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Write_tip Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: \ No newline at end of file + tips: *TD_Atomic_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml b/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml index 5283eca3a..1e4247205 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml @@ -2,6 +2,78 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Address Processing Unit: + Busy: &TA_Busy_tip >- + Percent of the total CU cycles the address processor was busy. + Address Stall: &TA_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending address requests + further into the vL1D pipeline. + Data Stall: &TA_Data_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending write/atomic + data further into the vL1D pipeline. + Data-Processor → Address Stall: &TA_Data-Processor_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled waiting to send command data to + the data processor. + Total Instructions: &TA_Total_Instructions_tip The total number of vL1D memory instructions executed on all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: &TA_Global_Generic_Instructions_tip >- + The total number of global and generic memory instructions executed on all compute units on + the accelerator, per normalization unit. + Global/Generic Read Instructions: &TA_Global_Generic_Read_tip >- + The total number of global and generic memory read instructions executed on all compute units + on the accelerator, per normalization unit.
+ Global/Generic Write Instructions: &TA_Global_Generic_Write_tip >- + The total number of global and generic memory write instructions executed on all compute units + on the accelerator, per normalization unit. + Global/Generic Atomic Instructions: &TA_Global_Generic_Atomic_tip >- + The total number of global and generic memory atomic (with and without return) instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Instructions: &TA_Spill_Stack_Instructions_tip >- + The total number of spill/stack memory instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Read Instructions: &TA_Spill_Stack_Read_tip >- + The total number of spill/stack memory read instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Write Instructions: &TA_Spill_Stack_Write_tip >- + The total number of spill/stack memory write instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Atomic Instructions: &TA_Spill_Stack_Atomic_tip >- + The total number of spill/stack memory atomic (with and without return) instructions executed + on all compute units on the accelerator, per normalization unit. Typically unused, as these + memory operations are mainly used to implement thread-local storage. + Spill/Stack Total Cycles: &TA_Spill_Stack_Total_Cycles_tip >- + The number of cycles the address processing unit spent working on spill/stack instructions, + per normalization unit. + Spill/Stack Coalesced Read: &TA_Spill_Stack_Coalesced_Read_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack read + instructions, per normalization unit. + Spill/Stack Coalesced Write: &TA_Spill_Stack_Coalesced_Write_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack write + instructions, per normalization unit.
+ Data-Return Path: + Data-Return Busy: &TD_Busy_tip >- + Percent of the total CU cycles the data-return unit was busy processing or waiting on data to + return to the CU. + Cache RAM → Data-Return Stall: &TD_Cache_RAM_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled on data to be returned from + the vL1D Cache RAM. + Workgroup Manager → Data-Return Stall: &TD_Workgroup_Manager_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due + to initialization of registers as a part of launching new workgroups. + Coalescable Instructions: &TD_Coalescable_Instructions_tip >- + The number of instructions submitted to the data-return unit by the address processor that + were found to be coalescable, per normalization unit. + Read Instructions: &TD_Read_tip >- + The number of read instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack reads in the address processor. + Write Instructions: &TD_Write_tip >- + The number of store instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack stores counted by the vL1D cache-front-end. + Atomic Instructions: &TD_Atomic_tip >- + The number of atomic instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack atomics in the address processor. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -24,97 +96,97 @@ Panel Config: min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Busy_tip Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Address_Stall_tip Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data_Stall_tip Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data-Processor_Address_Stall_tip Total Instructions: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Total_Instructions_tip Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Instructions_tip Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions 
+ $normUnit) - tips: + tips: *TA_Global_Generic_Read_tip Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Write_tip Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Atomic_tip Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Instructions_tip Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Read_tip Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Write_tip Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Atomic_tip Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Total_Cycles_tip Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum 
/ $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Read_tip Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Write_tip - metric_table: id: 1502 @@ -132,25 +204,25 @@ Panel Config: min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Busy_tip Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Cache_RAM_Data-Return_Stall_tip Workgroup manager → Data-Return Stall: avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Workgroup_Manager_Data-Return_Stall_tip Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Coalescable_Instructions_tip Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) @@ -159,16 +231,16 @@ Panel Config: max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Read_tip Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) 
min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Write_tip Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: \ No newline at end of file + tips: *TD_Atomic_tip diff --git a/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml b/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml index 5283eca3a..1e4247205 100644 --- a/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml +++ b/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml @@ -2,6 +2,78 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Address Processing Unit: + Busy: &TA_Busy_tip >- + Percent of the total CU cycles the address processor was busy. + Address Stall: &TA_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending address requests + further into the vL1D pipeline. + Data Stall: &TA_Data_Stall_tip >- + Percent of the total CU cycles the address processor was stalled from sending write/atomic + data further into the vL1D pipeline. + Data-Processor → Address Stall: &TA_Data-Processor_Address_Stall_tip >- + Percent of the total CU cycles the address processor was stalled waiting to send command data to + the data processor. + Total Instructions: &TA_Total_Instructions_tip The total number of vL1D memory instructions executed on all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: &TA_Global_Generic_Instructions_tip >- + The total number of global and generic memory instructions executed on all compute units on + the accelerator, per normalization unit. + Global/Generic Read Instructions: &TA_Global_Generic_Read_tip >- + The total number of global and generic memory read instructions executed on all compute units + on the accelerator, per normalization unit.
+ Global/Generic Write Instructions: &TA_Global_Generic_Write_tip >- + The total number of global and generic memory write instructions executed on all compute units + on the accelerator, per normalization unit. + Global/Generic Atomic Instructions: &TA_Global_Generic_Atomic_tip >- + The total number of global and generic memory atomic (with and without return) instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Instructions: &TA_Spill_Stack_Instructions_tip >- + The total number of spill/stack memory instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Read Instructions: &TA_Spill_Stack_Read_tip >- + The total number of spill/stack memory read instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Write Instructions: &TA_Spill_Stack_Write_tip >- + The total number of spill/stack memory write instructions executed on all compute units on the + accelerator, per normalization unit. + Spill/Stack Atomic Instructions: &TA_Spill_Stack_Atomic_tip >- + The total number of spill/stack memory atomic (with and without return) instructions executed + on all compute units on the accelerator, per normalization unit. Typically unused, as these + memory operations are mainly used to implement thread-local storage. + Spill/Stack Total Cycles: &TA_Spill_Stack_Total_Cycles_tip >- + The number of cycles the address processing unit spent working on spill/stack instructions, + per normalization unit. + Spill/Stack Coalesced Read: &TA_Spill_Stack_Coalesced_Read_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack read + instructions, per normalization unit. + Spill/Stack Coalesced Write: &TA_Spill_Stack_Coalesced_Write_tip >- + The number of cycles the address processing unit spent working on coalesced spill/stack write + instructions, per normalization unit.
+ Data-Return Path: + Data-Return Busy: &TD_Busy_tip >- + Percent of the total CU cycles the data-return unit was busy processing or waiting on data to + return to the CU. + Cache RAM → Data-Return Stall: &TD_Cache_RAM_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled on data to be returned from + the vL1D Cache RAM. + Workgroup Manager → Data-Return Stall: &TD_Workgroup_Manager_Data-Return_Stall_tip >- + Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due + to initialization of registers as a part of launching new workgroups. + Coalescable Instructions: &TD_Coalescable_Instructions_tip >- + The number of instructions submitted to the data-return unit by the address processor that + were found to be coalescable, per normalization unit. + Read Instructions: &TD_Read_tip >- + The number of read instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack reads in the address processor. + Write Instructions: &TD_Write_tip >- + The number of store instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack stores counted by the vL1D cache-front-end. + Atomic Instructions: &TD_Atomic_tip >- + The number of atomic instructions submitted to the data-return unit by the address processor + summed over all compute units on the accelerator, per normalization unit. This is expected to + be the sum of global/generic and spill/stack atomics in the address processor. # Define the panel properties and properties of each metric in the panel. 
Panel Config: @@ -24,97 +96,97 @@ Panel Config: min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Busy_tip Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Address_Stall_tip Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data_Stall_tip Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TA_Data-Processor_Address_Stall_tip Total Instructions: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Total_Instructions_tip Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Instructions_tip Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions 
+ $normUnit) - tips: + tips: *TA_Global_Generic_Read_tip Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Write_tip Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Global_Generic_Atomic_tip Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Instructions_tip Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Read_tip Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Write_tip Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TA_Spill_Stack_Atomic_tip Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Total_Cycles_tip Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum 
/ $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Read_tip Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *TA_Spill_Stack_Coalesced_Write_tip - metric_table: id: 1502 @@ -132,25 +204,25 @@ Panel Config: min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Busy_tip Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Cache_RAM_Data-Return_Stall_tip Workgroup manager → Data-Return Stall: avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct - tips: + tips: *TD_Workgroup_Manager_Data-Return_Stall_tip Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Coalescable_Instructions_tip Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) @@ -159,16 +231,16 @@ Panel Config: max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Read_tip Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) 
min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: + tips: *TD_Write_tip Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) - tips: \ No newline at end of file + tips: *TD_Atomic_tip