Skip to content

Commit 3a0cb88

Browse files
kpoosa authored and rodrigovivi committed
drm/xe/hwmon: Expose memory controller temperature
Expose GPU memory controller average temperature and its limits under temp4_xxx. Update Xe hwmon documentation for this.

v2:
- Rephrase commit message. (Badal)
- Update kernel version in Xe hwmon documentation. (Raag)

v3:
- Update kernel version in Xe hwmon documentation.
- Address review comments from Raag.
- Remove obvious comments.
- Remove redundant debug logs.
- Remove unnecessary checks.
- Avoid magic numbers.
- Add new comments.
- Use temperature sensors count to make memory controller visible.
- Use temperature limits of package for memory controller.

v4:
- Address review comments from Raag.
- Group new temperature attributes with existing temperature attributes as per channel index in Xe hwmon documentation.
- Use DIV_ROUND_UP to calculate dwords needed for temperature limits.
- Minor aesthetic refinements.
- Remove unused TEMP_MASK_MAILBOX.

v5:
- Use REG_FIELD_GET to get count from READ_THERMAL_DATA output. (Raag)
- Change count print from decimal to hexadecimal.
- Cosmetic changes.

Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20260112203521.1014388-3-karthik.poosa@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
1 parent c332fba commit 3a0cb88

3 files changed

Lines changed: 100 additions & 5 deletions

File tree

Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,30 @@ Description: RO. VRAM temperature in millidegree Celsius.
165165

166166
Only supported for particular Intel Xe graphics platforms.
167167

168+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp4_crit
169+
Date: January 2026
170+
KernelVersion: 7.0
171+
Contact: intel-xe@lists.freedesktop.org
172+
Description: RO. Memory controller critical temperature in millidegree Celsius.
173+
174+
Only supported for particular Intel Xe graphics platforms.
175+
176+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp4_emergency
177+
Date: January 2026
178+
KernelVersion: 7.0
179+
Contact: intel-xe@lists.freedesktop.org
180+
Description: RO. Memory controller shutdown temperature in millidegree Celsius.
181+
182+
Only supported for particular Intel Xe graphics platforms.
183+
184+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp4_input
185+
Date: January 2026
186+
KernelVersion: 7.0
187+
Contact: intel-xe@lists.freedesktop.org
188+
Description: RO. Memory controller average temperature in millidegree Celsius.
189+
190+
Only supported for particular Intel Xe graphics platforms.
191+
168192
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/fan1_input
169193
Date: March 2025
170194
KernelVersion: 6.16

drivers/gpu/drm/xe/xe_hwmon.c

Lines changed: 74 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ enum xe_hwmon_channel {
4343
CHANNEL_CARD,
4444
CHANNEL_PKG,
4545
CHANNEL_VRAM,
46+
CHANNEL_MCTRL,
4647
CHANNEL_MAX,
4748
};
4849

@@ -100,6 +101,9 @@ enum sensor_attr_power {
100101
*/
101102
#define PL_WRITE_MBX_TIMEOUT_MS (1)
102103

104+
/* Index of memory controller in READ_THERMAL_DATA output */
105+
#define TEMP_INDEX_MCTRL 2
106+
103107
/**
104108
* struct xe_hwmon_energy_info - to accumulate energy
105109
*/
@@ -130,6 +134,10 @@ struct xe_hwmon_thermal_info {
130134
/** @data: temperature limits in dwords */
131135
u32 data[DIV_ROUND_UP(TEMP_LIMIT_MAX, sizeof(u32))];
132136
};
137+
/** @count: no of temperature sensors available for the platform */
138+
u8 count;
139+
/** @value: signed value from each sensor */
140+
s8 value[U8_MAX];
133141
};
134142

135143
/**
@@ -703,6 +711,7 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
703711
HWMON_T_LABEL,
704712
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL |
705713
HWMON_T_MAX,
714+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
706715
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL),
707716
HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
708717
HWMON_P_CAP,
@@ -717,16 +726,51 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
717726
static int xe_hwmon_pcode_read_thermal_info(struct xe_hwmon *hwmon)
718727
{
719728
struct xe_tile *root_tile = xe_device_get_root_tile(hwmon->xe);
729+
u32 config = 0;
720730
int ret;
721731

722732
ret = xe_pcode_read(root_tile, PCODE_MBOX(PCODE_THERMAL_INFO, READ_THERMAL_LIMITS, 0),
723733
&hwmon->temp.data[0], &hwmon->temp.data[1]);
734+
if (ret)
735+
return ret;
736+
724737
drm_dbg(&hwmon->xe->drm, "thermal info read val 0x%x val1 0x%x\n",
725738
hwmon->temp.data[0], hwmon->temp.data[1]);
726739

740+
ret = xe_pcode_read(root_tile, PCODE_MBOX(PCODE_THERMAL_INFO, READ_THERMAL_CONFIG, 0),
741+
&config, NULL);
742+
if (ret)
743+
return ret;
744+
745+
drm_dbg(&hwmon->xe->drm, "thermal config count 0x%x\n", config);
746+
hwmon->temp.count = REG_FIELD_GET(TEMP_MASK, config);
747+
727748
return ret;
728749
}
729750

751+
static int get_mc_temp(struct xe_hwmon *hwmon, long *val)
752+
{
753+
struct xe_tile *root_tile = xe_device_get_root_tile(hwmon->xe);
754+
u32 *dword = (u32 *)hwmon->temp.value;
755+
s32 average = 0;
756+
int ret, i;
757+
758+
for (i = 0; i < DIV_ROUND_UP(TEMP_LIMIT_MAX, sizeof(u32)); i++) {
759+
ret = xe_pcode_read(root_tile, PCODE_MBOX(PCODE_THERMAL_INFO, READ_THERMAL_DATA, i),
760+
(dword + i), NULL);
761+
if (ret)
762+
return ret;
763+
drm_dbg(&hwmon->xe->drm, "thermal data for group %d val 0x%x\n", i, dword[i]);
764+
}
765+
766+
for (i = TEMP_INDEX_MCTRL; i < hwmon->temp.count - 1; i++)
767+
average += hwmon->temp.value[i];
768+
769+
average /= (hwmon->temp.count - TEMP_INDEX_MCTRL - 1);
770+
*val = average * MILLIDEGREE_PER_DEGREE;
771+
return 0;
772+
}
773+
730774
/* I1 is exposed as power_crit or as curr_crit depending on bit 31 */
731775
static int xe_hwmon_pcode_read_i1(const struct xe_hwmon *hwmon, u32 *uval)
732776
{
@@ -831,6 +875,8 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
831875
return hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] ? 0444 : 0;
832876
case CHANNEL_VRAM:
833877
return hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] ? 0444 : 0;
878+
case CHANNEL_MCTRL:
879+
return hwmon->temp.count ? 0444 : 0;
834880
default:
835881
return 0;
836882
}
@@ -840,6 +886,8 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
840886
return hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] ? 0444 : 0;
841887
case CHANNEL_VRAM:
842888
return hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT] ? 0444 : 0;
889+
case CHANNEL_MCTRL:
890+
return hwmon->temp.count ? 0444 : 0;
843891
default:
844892
return 0;
845893
}
@@ -852,7 +900,16 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
852900
}
853901
case hwmon_temp_input:
854902
case hwmon_temp_label:
855-
return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP, channel)) ? 0444 : 0;
903+
switch (channel) {
904+
case CHANNEL_PKG:
905+
case CHANNEL_VRAM:
906+
return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP,
907+
channel)) ? 0444 : 0;
908+
case CHANNEL_MCTRL:
909+
return hwmon->temp.count ? 0444 : 0;
910+
default:
911+
return 0;
912+
}
856913
default:
857914
return 0;
858915
}
@@ -866,14 +923,23 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
866923

867924
switch (attr) {
868925
case hwmon_temp_input:
869-
reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_TEMP, channel));
926+
switch (channel) {
927+
case CHANNEL_PKG:
928+
case CHANNEL_VRAM:
929+
reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_TEMP, channel));
870930

871-
/* HW register value is in degrees Celsius, convert to millidegrees. */
872-
*val = REG_FIELD_GET(TEMP_MASK, reg_val) * MILLIDEGREE_PER_DEGREE;
873-
return 0;
931+
/* HW register value is in degrees Celsius, convert to millidegrees. */
932+
*val = REG_FIELD_GET(TEMP_MASK, reg_val) * MILLIDEGREE_PER_DEGREE;
933+
return 0;
934+
case CHANNEL_MCTRL:
935+
return get_mc_temp(hwmon, val);
936+
default:
937+
return -EOPNOTSUPP;
938+
}
874939
case hwmon_temp_emergency:
875940
switch (channel) {
876941
case CHANNEL_PKG:
942+
case CHANNEL_MCTRL:
877943
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
878944
return 0;
879945
case CHANNEL_VRAM:
@@ -885,6 +951,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
885951
case hwmon_temp_crit:
886952
switch (channel) {
887953
case CHANNEL_PKG:
954+
case CHANNEL_MCTRL:
888955
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] * MILLIDEGREE_PER_DEGREE;
889956
return 0;
890957
case CHANNEL_VRAM:
@@ -1262,6 +1329,8 @@ static int xe_hwmon_read_label(struct device *dev,
12621329
*str = "pkg";
12631330
else if (channel == CHANNEL_VRAM)
12641331
*str = "vram";
1332+
else if (channel == CHANNEL_MCTRL)
1333+
*str = "mctrl";
12651334
return 0;
12661335
case hwmon_power:
12671336
case hwmon_energy:

drivers/gpu/drm/xe/xe_pcode_api.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,8 @@
5252

5353
#define PCODE_THERMAL_INFO 0x25
5454
#define READ_THERMAL_LIMITS 0x0
55+
#define READ_THERMAL_CONFIG 0x1
56+
#define READ_THERMAL_DATA 0x2
5557

5658
#define PCODE_LATE_BINDING 0x5C
5759
#define GET_CAPABILITY_STATUS 0x0

0 commit comments

Comments
 (0)