Skip to content

Commit c332fba

Browse files
kpoosa authored and rodrigovivi committed
drm/xe/hwmon: Expose temperature limits
Read temperature limits using pcode mailbox and expose shutdown temperature limit as tempX_emergency, critical temperature limit as tempX_crit and GPU max temperature limit as temp2_max.
Update Xe hwmon documentation with above entries.

v2:
 - Resolve a documentation warning.
 - Address below review comments from Raag.
 - Update date and kernel version in Xe hwmon documentation.
 - Remove explicit disable of has_mbx_thermal_info for unsupported platforms.
 - Remove unnecessary default case in switches.
 - Remove obvious comments.
 - Use TEMP_LIMIT_MAX to compute number of dwords needed in xe_hwmon_thermal_info.
 - Remove THERMAL_LIMITS_DWORDS macro.
 - Use has_mbx_thermal_info for checking thermal mailbox support.

v3:
 - Address below minor comments. (Raag)
 - Group new temperature attributes with existing temperature attributes as per channel index in Xe hwmon documentation.
 - Rename enums of xe_temp_limit to improve clarity.
 - Use DIV_ROUND_UP to calculate dwords needed for temperature limits.
 - Use return instead of breaks in xe_hwmon_temp_read.
 - Minor aesthetic refinements.

v4:
 - Remove a redundant break. (Raag)
 - Update drm_dbg to drm_warn to inform user of unavailability for thermal mailbox on expected platforms.

Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20260112203521.1014388-2-karthik.poosa@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
1 parent b1dcec9 commit c332fba

6 files changed

Lines changed: 148 additions & 3 deletions

File tree

Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,22 @@ Description: RO. Package current voltage in millivolt.
109109

110110
Only supported for particular Intel Xe graphics platforms.
111111

112+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp2_crit
113+
Date: January 2026
114+
KernelVersion: 7.0
115+
Contact: intel-xe@lists.freedesktop.org
116+
Description: RO. Package critical temperature in millidegree Celsius.
117+
118+
Only supported for particular Intel Xe graphics platforms.
119+
120+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp2_emergency
121+
Date: January 2026
122+
KernelVersion: 7.0
123+
Contact: intel-xe@lists.freedesktop.org
124+
Description: RO. Package shutdown temperature in millidegree Celsius.
125+
126+
Only supported for particular Intel Xe graphics platforms.
127+
112128
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp2_input
113129
Date: March 2025
114130
KernelVersion: 6.15
@@ -117,6 +133,30 @@ Description: RO. Package temperature in millidegree Celsius.
117133

118134
Only supported for particular Intel Xe graphics platforms.
119135

136+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp2_max
137+
Date: January 2026
138+
KernelVersion: 7.0
139+
Contact: intel-xe@lists.freedesktop.org
140+
Description: RO. Package maximum temperature limit in millidegree Celsius.
141+
142+
Only supported for particular Intel Xe graphics platforms.
143+
144+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp3_crit
145+
Date: January 2026
146+
KernelVersion: 7.0
147+
Contact: intel-xe@lists.freedesktop.org
148+
Description: RO. VRAM critical temperature in millidegree Celsius.
149+
150+
Only supported for particular Intel Xe graphics platforms.
151+
152+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp3_emergency
153+
Date: January 2026
154+
KernelVersion: 7.0
155+
Contact: intel-xe@lists.freedesktop.org
156+
Description: RO. VRAM shutdown temperature in millidegree Celsius.
157+
158+
Only supported for particular Intel Xe graphics platforms.
159+
120160
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp3_input
121161
Date: March 2025
122162
KernelVersion: 6.15

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,8 @@ struct xe_device {
341341
* pcode mailbox commands.
342342
*/
343343
u8 has_mbx_power_limits:1;
344+
/** @info.has_mbx_thermal_info: Device supports thermal mailbox commands */
345+
u8 has_mbx_thermal_info:1;
344346
/** @info.has_mem_copy_instr: Device supports MEM_COPY instruction */
345347
u8 has_mem_copy_instr:1;
346348
/** @info.has_mert: Device has standalone MERT */

drivers/gpu/drm/xe/xe_hwmon.c

Lines changed: 99 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,15 @@ enum xe_fan_channel {
5353
FAN_MAX,
5454
};
5555

56+
/*
 * Index of each temperature limit inside xe_hwmon_thermal_info.limit[].
 *
 * NOTE(review): the ordering presumably mirrors the byte layout returned by
 * the thermal-limits mailbox read - confirm before reordering or inserting.
 */
enum xe_temp_limit {
	/* Package shutdown limit, reported by the package channel's emergency attribute */
	TEMP_LIMIT_PKG_SHUTDOWN = 0,
	/* Package critical limit, reported by the package channel's crit attribute */
	TEMP_LIMIT_PKG_CRIT = 1,
	/* VRAM shutdown limit, reported by the VRAM channel's emergency attribute */
	TEMP_LIMIT_MEM_SHUTDOWN = 2,
	/* Package maximum limit, reported by the package channel's max attribute */
	TEMP_LIMIT_PKG_MAX = 3,
	/* VRAM critical limit, reported by the VRAM channel's crit attribute */
	TEMP_LIMIT_MEM_CRIT = 4,
	/* Number of limits; sizes xe_hwmon_thermal_info.limit[] */
	TEMP_LIMIT_MAX
};
64+
5665
/* Attribute index for powerX_xxx_interval sysfs entries */
5766
enum sensor_attr_power {
5867
SENSOR_INDEX_PSYS_PL1,
@@ -111,6 +120,18 @@ struct xe_hwmon_fan_info {
111120
u64 time_prev;
112121
};
113122

123+
/**
124+
* struct xe_hwmon_thermal_info - to store temperature data
125+
*/
126+
struct xe_hwmon_thermal_info {
127+
union {
128+
/** @limit: temperatures limits */
129+
u8 limit[TEMP_LIMIT_MAX];
130+
/** @data: temperature limits in dwords */
131+
u32 data[DIV_ROUND_UP(TEMP_LIMIT_MAX, sizeof(u32))];
132+
};
133+
};
134+
114135
/**
115136
* struct xe_hwmon - xe hwmon data structure
116137
*/
@@ -137,7 +158,8 @@ struct xe_hwmon {
137158
u32 pl1_on_boot[CHANNEL_MAX];
138159
/** @pl2_on_boot: power limit PL2 on boot */
139160
u32 pl2_on_boot[CHANNEL_MAX];
140-
161+
/** @temp: Temperature info */
162+
struct xe_hwmon_thermal_info temp;
141163
};
142164

143165
static int xe_hwmon_pcode_read_power_limit(const struct xe_hwmon *hwmon, u32 attr, int channel,
@@ -677,8 +699,11 @@ static const struct attribute_group *hwmon_groups[] = {
677699
};
678700

679701
static const struct hwmon_channel_info * const hwmon_info[] = {
680-
HWMON_CHANNEL_INFO(temp, HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL,
681-
HWMON_T_INPUT | HWMON_T_LABEL),
702+
HWMON_CHANNEL_INFO(temp,
703+
HWMON_T_LABEL,
704+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL |
705+
HWMON_T_MAX,
706+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL),
682707
HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
683708
HWMON_P_CAP,
684709
HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CAP),
@@ -689,6 +714,19 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
689714
NULL
690715
};
691716

717+
static int xe_hwmon_pcode_read_thermal_info(struct xe_hwmon *hwmon)
718+
{
719+
struct xe_tile *root_tile = xe_device_get_root_tile(hwmon->xe);
720+
int ret;
721+
722+
ret = xe_pcode_read(root_tile, PCODE_MBOX(PCODE_THERMAL_INFO, READ_THERMAL_LIMITS, 0),
723+
&hwmon->temp.data[0], &hwmon->temp.data[1]);
724+
drm_dbg(&hwmon->xe->drm, "thermal info read val 0x%x val1 0x%x\n",
725+
hwmon->temp.data[0], hwmon->temp.data[1]);
726+
727+
return ret;
728+
}
729+
692730
/* I1 is exposed as power_crit or as curr_crit depending on bit 31 */
693731
static int xe_hwmon_pcode_read_i1(const struct xe_hwmon *hwmon, u32 *uval)
694732
{
@@ -787,6 +825,31 @@ static umode_t
787825
xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
788826
{
789827
switch (attr) {
828+
case hwmon_temp_emergency:
829+
switch (channel) {
830+
case CHANNEL_PKG:
831+
return hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] ? 0444 : 0;
832+
case CHANNEL_VRAM:
833+
return hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] ? 0444 : 0;
834+
default:
835+
return 0;
836+
}
837+
case hwmon_temp_crit:
838+
switch (channel) {
839+
case CHANNEL_PKG:
840+
return hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] ? 0444 : 0;
841+
case CHANNEL_VRAM:
842+
return hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT] ? 0444 : 0;
843+
default:
844+
return 0;
845+
}
846+
case hwmon_temp_max:
847+
switch (channel) {
848+
case CHANNEL_PKG:
849+
return hwmon->temp.limit[TEMP_LIMIT_PKG_MAX] ? 0444 : 0;
850+
default:
851+
return 0;
852+
}
790853
case hwmon_temp_input:
791854
case hwmon_temp_label:
792855
return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP, channel)) ? 0444 : 0;
@@ -808,6 +871,36 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
808871
/* HW register value is in degrees Celsius, convert to millidegrees. */
809872
*val = REG_FIELD_GET(TEMP_MASK, reg_val) * MILLIDEGREE_PER_DEGREE;
810873
return 0;
874+
case hwmon_temp_emergency:
875+
switch (channel) {
876+
case CHANNEL_PKG:
877+
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
878+
return 0;
879+
case CHANNEL_VRAM:
880+
*val = hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
881+
return 0;
882+
default:
883+
return -EOPNOTSUPP;
884+
}
885+
case hwmon_temp_crit:
886+
switch (channel) {
887+
case CHANNEL_PKG:
888+
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] * MILLIDEGREE_PER_DEGREE;
889+
return 0;
890+
case CHANNEL_VRAM:
891+
*val = hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT] * MILLIDEGREE_PER_DEGREE;
892+
return 0;
893+
default:
894+
return -EOPNOTSUPP;
895+
}
896+
case hwmon_temp_max:
897+
switch (channel) {
898+
case CHANNEL_PKG:
899+
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_MAX] * MILLIDEGREE_PER_DEGREE;
900+
return 0;
901+
default:
902+
return -EOPNOTSUPP;
903+
}
811904
default:
812905
return -EOPNOTSUPP;
813906
}
@@ -1263,6 +1356,9 @@ xe_hwmon_get_preregistration_info(struct xe_hwmon *hwmon)
12631356
for (channel = 0; channel < FAN_MAX; channel++)
12641357
if (xe_hwmon_is_visible(hwmon, hwmon_fan, hwmon_fan_input, channel))
12651358
xe_hwmon_fan_input_read(hwmon, channel, &fan_speed);
1359+
1360+
if (hwmon->xe->info.has_mbx_thermal_info && xe_hwmon_pcode_read_thermal_info(hwmon))
1361+
drm_warn(&hwmon->xe->drm, "Thermal mailbox not supported by card firmware\n");
12661362
}
12671363

12681364
int xe_hwmon_register(struct xe_device *xe)

drivers/gpu/drm/xe/xe_pci.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ static const struct xe_device_desc bmg_desc = {
366366
.has_fan_control = true,
367367
.has_flat_ccs = 1,
368368
.has_mbx_power_limits = true,
369+
.has_mbx_thermal_info = true,
369370
.has_gsc_nvm = 1,
370371
.has_heci_cscfi = 1,
371372
.has_i2c = true,
@@ -422,6 +423,7 @@ static const struct xe_device_desc cri_desc = {
422423
.has_gsc_nvm = 1,
423424
.has_i2c = true,
424425
.has_mbx_power_limits = true,
426+
.has_mbx_thermal_info = true,
425427
.has_mert = true,
426428
.has_pre_prod_wa = 1,
427429
.has_soc_remapper_sysctrl = true,
@@ -687,6 +689,7 @@ static int xe_info_init_early(struct xe_device *xe,
687689
/* runtime fusing may force flat_ccs to disabled later */
688690
xe->info.has_flat_ccs = desc->has_flat_ccs;
689691
xe->info.has_mbx_power_limits = desc->has_mbx_power_limits;
692+
xe->info.has_mbx_thermal_info = desc->has_mbx_thermal_info;
690693
xe->info.has_gsc_nvm = desc->has_gsc_nvm;
691694
xe->info.has_heci_gscfi = desc->has_heci_gscfi;
692695
xe->info.has_heci_cscfi = desc->has_heci_cscfi;

drivers/gpu/drm/xe/xe_pci_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ struct xe_device_desc {
4848
u8 has_late_bind:1;
4949
u8 has_llc:1;
5050
u8 has_mbx_power_limits:1;
51+
u8 has_mbx_thermal_info:1;
5152
u8 has_mem_copy_instr:1;
5253
u8 has_mert:1;
5354
u8 has_pre_prod_wa:1;

drivers/gpu/drm/xe/xe_pcode_api.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@
5050
#define READ_PL_FROM_FW 0x1
5151
#define READ_PL_FROM_PCODE 0x0
5252

53+
#define PCODE_THERMAL_INFO 0x25
54+
#define READ_THERMAL_LIMITS 0x0
55+
5356
#define PCODE_LATE_BINDING 0x5C
5457
#define GET_CAPABILITY_STATUS 0x0
5558
#define V1_FAN_SUPPORTED REG_BIT(0)

0 commit comments

Comments
 (0)