Skip to content

Commit 592713a

Browse files
Yang Wang and alexdeucher
authored and committed
drm/amd/pm: correct mem_busy_percent display due to calculation errors
PMFW may return invalid values due to internal calculation errors, so the KMD driver must validate and sanitize the returned values to prevent issues caused by firmware calculation errors. For example, values 0xfffe (-2) and 0xffff (-1) are treated as invalid and clamped to 0. This applies to devices with CAB (Cache As Buffer) functionality. Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4905 Signed-off-by: Yang Wang <kevinyang.wang@amd.com> Reviewed-by: Kenneth Feng <kenneth.feng@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 95a599c commit 592713a

4 files changed

Lines changed: 32 additions & 15 deletions

File tree

drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2164,4 +2164,21 @@ static inline void smu_feature_init(struct smu_context *smu, int feature_num)
21642164
smu_feature_list_clear_all(smu, SMU_FEATURE_LIST_ALLOWED);
21652165
}
21662166

2167+
/*
2168+
* smu_safe_u16_nn - Make u16 safe by filtering negative overflow errors
2169+
* @val: Input u16 value, may contain invalid negative overflows
2170+
*
2171+
* Convert u16 to non-negative value. Cast to s16 to detect negative values
2172+
* caused by calculation errors. Return 0 for negative errors, return
2173+
* original value if valid.
2174+
*
2175+
* Return: Valid u16 value or 0
2176+
*/
2177+
static inline u16 smu_safe_u16_nn(u16 val)
2178+
{
2179+
s16 tmp = (s16)val;
2180+
2181+
return tmp < 0 ? 0 : val;
2182+
}
2183+
21672184
#endif

drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -773,13 +773,13 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu,
773773
*value = metrics->AverageGfxclkFrequencyPreDs;
774774
break;
775775
case METRICS_AVERAGE_FCLK:
776-
if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
776+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
777777
*value = metrics->AverageFclkFrequencyPostDs;
778778
else
779779
*value = metrics->AverageFclkFrequencyPreDs;
780780
break;
781781
case METRICS_AVERAGE_UCLK:
782-
if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
782+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
783783
*value = metrics->AverageMemclkFrequencyPostDs;
784784
else
785785
*value = metrics->AverageMemclkFrequencyPreDs;
@@ -800,7 +800,7 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu,
800800
*value = metrics->AverageGfxActivity;
801801
break;
802802
case METRICS_AVERAGE_MEMACTIVITY:
803-
*value = metrics->AverageUclkActivity;
803+
*value = smu_safe_u16_nn(metrics->AverageUclkActivity);
804804
break;
805805
case METRICS_AVERAGE_VCNACTIVITY:
806806
*value = max(metrics->Vcn0ActivityPercentage,
@@ -2085,7 +2085,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu,
20852085
metrics->AvgTemperature[TEMP_VR_MEM1]);
20862086

20872087
gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
2088-
gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
2088+
gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
20892089
gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage,
20902090
metrics->Vcn1ActivityPercentage);
20912091

@@ -2102,7 +2102,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu,
21022102
else
21032103
gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
21042104

2105-
if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
2105+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
21062106
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
21072107
else
21082108
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;

drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -783,13 +783,13 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu,
783783
*value = metrics->AverageGfxclkFrequencyPreDs;
784784
break;
785785
case METRICS_AVERAGE_FCLK:
786-
if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
786+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
787787
*value = metrics->AverageFclkFrequencyPostDs;
788788
else
789789
*value = metrics->AverageFclkFrequencyPreDs;
790790
break;
791791
case METRICS_AVERAGE_UCLK:
792-
if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
792+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
793793
*value = metrics->AverageMemclkFrequencyPostDs;
794794
else
795795
*value = metrics->AverageMemclkFrequencyPreDs;
@@ -814,7 +814,7 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu,
814814
*value = metrics->AverageGfxActivity;
815815
break;
816816
case METRICS_AVERAGE_MEMACTIVITY:
817-
*value = metrics->AverageUclkActivity;
817+
*value = smu_safe_u16_nn(metrics->AverageUclkActivity);
818818
break;
819819
case METRICS_AVERAGE_SOCKETPOWER:
820820
*value = metrics->AverageSocketPower << 8;
@@ -2091,7 +2091,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu,
20912091
metrics->AvgTemperature[TEMP_VR_MEM1]);
20922092

20932093
gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
2094-
gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
2094+
gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
20952095
gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage,
20962096
metrics->Vcn1ActivityPercentage);
20972097

@@ -2104,7 +2104,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu,
21042104
else
21052105
gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
21062106

2107-
if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
2107+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
21082108
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
21092109
else
21102110
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;

drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -661,13 +661,13 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu,
661661
*value = metrics->AverageGfxclkFrequencyPreDs;
662662
break;
663663
case METRICS_AVERAGE_FCLK:
664-
if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
664+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
665665
*value = metrics->AverageFclkFrequencyPostDs;
666666
else
667667
*value = metrics->AverageFclkFrequencyPreDs;
668668
break;
669669
case METRICS_AVERAGE_UCLK:
670-
if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
670+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
671671
*value = metrics->AverageMemclkFrequencyPostDs;
672672
else
673673
*value = metrics->AverageMemclkFrequencyPreDs;
@@ -688,7 +688,7 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu,
688688
*value = metrics->AverageGfxActivity;
689689
break;
690690
case METRICS_AVERAGE_MEMACTIVITY:
691-
*value = metrics->AverageUclkActivity;
691+
*value = smu_safe_u16_nn(metrics->AverageUclkActivity);
692692
break;
693693
case METRICS_AVERAGE_VCNACTIVITY:
694694
*value = max(metrics->AverageVcn0ActivityPercentage,
@@ -2147,7 +2147,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu,
21472147
metrics->AvgTemperature[TEMP_VR_MEM1]);
21482148

21492149
gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
2150-
gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
2150+
gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
21512151
gpu_metrics->average_mm_activity = max(metrics->AverageVcn0ActivityPercentage,
21522152
metrics->Vcn1ActivityPercentage);
21532153

@@ -2159,7 +2159,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu,
21592159
else
21602160
gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
21612161

2162-
if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
2162+
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
21632163
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
21642164
else
21652165
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;

0 commit comments

Comments
 (0)