@@ -19,11 +19,22 @@ import (
1919 "path/filepath"
2020 "strings"
2121
22+ "github.com/alecthomas/kingpin/v2"
2223 "github.com/go-kit/log"
2324 "github.com/go-kit/log/level"
2425 "github.com/prometheus/client_golang/prometheus"
26+ "gopkg.in/yaml.v2"
2527)
2628
// vendorModels is the YAML shape of a single vendor entry in the accelerators
// mapping file: the vendor's display name, the vendor ID used as lookup key,
// and the list of device models known for that vendor.
type vendorModels struct {
	// VendorName is the human-readable vendor name ("vendorName" key).
	VendorName string `yaml:"vendorName"`
	// VendorID is the vendor identifier the entry is keyed by ("vendorID" key).
	VendorID string `yaml:"vendorID"`
	// Models pairs each device ID of this vendor with a model name ("models" key).
	Models []struct {
		// PciID is the device identifier of the model ("pciID" key).
		PciID string `yaml:"pciID"`
		// ModelName is the human-readable model name ("modelName" key).
		ModelName string `yaml:"modelName"`
	} `yaml:"models"`
}
37+
2738type cardData struct {
2839 vendor string
2940 model string
@@ -36,107 +47,37 @@ type vendorData struct {
3647}
3748
3849type acceleratorsCollector struct {
39- pciDevicesPath string
40- logger log.Logger
41- }
42-
43- func init () {
44- registerCollector ("accelerator" , defaultEnabled , NewAcceleratorCollector )
45- }
46-
47- // NewAcceleratorCollector returns a new Collector exposing accelerator cards count.
48- func NewAcceleratorCollector (logger log.Logger ) (Collector , error ) {
49- return & acceleratorsCollector {
50- pciDevicesPath : filepath .Join (* sysPath , "bus/pci/devices" ),
51- logger : logger ,
52- }, nil
50+ pciDevicesPath string
51+ logger log.Logger
52+ vendorToDeviceMap map [string ]vendorData
5353}
5454
5555var (
56+ mappingFile = kingpin .Flag ("collector.accelerators.mapping-file" , "Path to the mapped accelerators data config." ).Default (
57+ "/var/node_exporter/accelerators_collector_config/config.yaml" ).String ()
5658 acceleratorCardsDesc = prometheus .NewDesc (
5759 prometheus .BuildFQName (namespace , "accelerator" , "card_info" ),
5860 "Accelerator card info including vendor, model and pci id (address)" ,
5961 []string {"vendor" , "model" , "id" }, nil ,
6062 )
63+ )
6164
62- nvidiaDeviceIDsMap = map [string ]string {
63- "0x20f5" : "NVIDIA A800 PCIe 80GB" ,
64- "0x20f6" : "NVIDIA A800 40GB PCIe active cooled" ,
65- "0x20fd" : "NVIDIA AX800" ,
66- "0x20f1" : "NVIDIA A100 PCIe 40GB" ,
67- "0x20b5" : "NVIDIA A100 PCIe 80GB" ,
68- "0x2235" : "NVIDIA A40" ,
69- "0x20b7" : "NVIDIA A30" ,
70- "0x2236" : "NVIDIA A10" ,
71- "0x25b6" : "NVIDIA A16" ,
72- "0x2322" : "H800 NVL" ,
73- "0x2321" : "NVIDIA H100 NVL" ,
74- "0x2331" : "NVIDIA H100 PCIe 80GB" ,
75- "0x26b5" : "NVIDIA L40" ,
76- "0x26b9" : "NVIDIA L40S" ,
77- "0x26bA" : "NVIDIA L20 liquid cooled" ,
78- "0x27b8" : "NVIDIA L4" ,
79- "0x27b6" : "NVIDIA L2" ,
80- "0x26b1" : "NVIDIA RTX 6000 Ada" ,
81- "0x26b3" : "NVIDIA RTX 5880 Ada" ,
82- "0x2231" : "NVIDIA RTX 5000 Ada" ,
83- "0x2230" : "NVIDIA RTX A6000" ,
84- "0x2233" : "NVIDIA RTX A5500" ,
85- "0x1e30" : "NVIDIA RTX 8000 passive" ,
86- "0x2531" : "NVIDIA RTX A2000" ,
87- "0x20b0" : "NVIDIA A100 SXM4 40G" ,
88- "0x233a" : "NVIDIA H800 NVL" ,
89- "0x233b" : "NVIDIA H200 NVL" ,
90- "0x20b2" : "NVIDIA A100SXM4 80GB" ,
91- "0x20b3" : "NVIDIA A100 SXM 64GB" ,
92- "0x20bd" : "NVIDIA A800 SXM4 40GB" ,
93- "0x20f3" : "NVIDIA A800 SXM4 80GB" ,
94- "0x25b0" : "NVIDIA RTX A1000" ,
95- }
96-
97- amdDeviceIDsMap = map [string ]string {
98- "0x740f" : "AMD MI210" ,
99- "0x740c" : "AMD MI250" ,
100- "0x7408" : "AMD MI250X" ,
101- "0x74a0" : "AMD MI300" ,
102- "0x74a1" : "AMD MI300X" ,
103- "0x74a5" : "AMD MI325X" ,
104- "0x7aa2" : "AMD MI308X" ,
105- "0x74b5" : "AMD MI300X VF" ,
106- "0x7410" : "AMD MI210 VF" ,
107- }
108-
109- gaudiDeviceIDsMap = map [string ]string {
110- "0x1000" : "Gaudi 1" ,
111- "0x1020" : "Gaudi 2" ,
112- }
113-
114- intelDeviceIDsMap = map [string ]string {
115- "0x0bd5" : "Intel Data Center GPU Max 1550" ,
116- "0x0bda" : "Intel Data Center GPU Max 1100" ,
117- "0x56c0" : "Intel Data Center GPU Flex 170" ,
118- "0x56c1" : "Intel Data Center GPU Flex 140" ,
119- }
120-
121- qualcommDeviceIDsMap = map [string ]string {
122- "0xa100" : "Qualcomm AI 100" ,
123- "0xa080" : "Qualcomm AI 80" ,
124- }
65+ func init () {
66+ registerCollector ("accelerator" , defaultEnabled , NewAcceleratorCollector )
67+ }
12568
126- // vendor map, add any new vendor to this map
127- vendorToDeviceMap = map [string ]vendorData {
128- // nvidia devices
129- "0x10de" : vendorData {"NVIDIA" , nvidiaDeviceIDsMap },
130- // amd devices
131- "0x1002" : vendorData {"AMD" , amdDeviceIDsMap },
132- // gaudi devices
133- "0x1da3" : vendorData {"GAUDI" , gaudiDeviceIDsMap },
134- // intel devices
135- "0x8086" : vendorData {"INTEL" , intelDeviceIDsMap },
136- // qualcomm devices
137- "0x17cb" : vendorData {"QUALCOMM" , qualcommDeviceIDsMap },
69+ // NewAcceleratorCollector returns a new Collector exposing accelerator cards count.
70+ func NewAcceleratorCollector (logger log.Logger ) (Collector , error ) {
71+ vendorToDeviceMap , err := prepareVendorModelData (* mappingFile )
72+ if err != nil {
73+ return nil , fmt .Errorf ("failed to get the accelerator configuration: %v" , err )
13874 }
139- )
75+ return & acceleratorsCollector {
76+ pciDevicesPath : filepath .Join (* sysPath , "bus/pci/devices" ),
77+ logger : logger ,
78+ vendorToDeviceMap : vendorToDeviceMap ,
79+ }, nil
80+ }
14081
14182func (a * acceleratorsCollector ) Update (ch chan <- prometheus.Metric ) error {
14283 pciDevices , err := os .ReadDir (a .pciDevicesPath )
@@ -159,7 +100,7 @@ func (a *acceleratorsCollector) Update(ch chan<- prometheus.Metric) error {
159100
160101 level .Debug (a .logger ).Log ("msg" , "checking pci device" , "vendor" , vendorID , "device" , deviceID )
161102
162- cardData , isMonitored := isMonitoredAccelerator (vendorID , deviceID , pciID )
103+ cardData , isMonitored := a . isMonitoredAccelerator (vendorID , deviceID , pciID )
163104 if ! isMonitored {
164105 continue
165106 }
@@ -187,8 +128,8 @@ func (a *acceleratorsCollector) getPCIFileData(pciID, fileName string) (string,
187128 return strings .TrimSpace (string (data )), nil
188129}
189130
190- func isMonitoredAccelerator (vendor , device , pciID string ) (cardData , bool ) {
191- vendorData , ok := vendorToDeviceMap [vendor ]
131+ func ( a * acceleratorsCollector ) isMonitoredAccelerator (vendor , device , pciID string ) (cardData , bool ) {
132+ vendorData , ok := a . vendorToDeviceMap [vendor ]
192133 if ! ok {
193134 return cardData {}, false
194135 }
@@ -199,3 +140,36 @@ func isMonitoredAccelerator(vendor, device, pciID string) (cardData, bool) {
199140 }
200141 return cardData {vendorData .vendorName , deviceDesc , pciID }, true
201142}
143+
144+ func prepareVendorModelData (mappingFilePath string ) (map [string ]vendorData , error ) {
145+ yamlStr , err := os .ReadFile (mappingFilePath )
146+ if err != nil {
147+ return nil , fmt .Errorf ("failed to open accelerators config file %s: %v" , mappingFilePath , err )
148+ }
149+ var vendorsModelsConfig []vendorModels
150+ err = yaml .UnmarshalStrict (yamlStr , & vendorsModelsConfig )
151+ if err != nil {
152+ return nil , fmt .Errorf ("failed to unmarshal accelerators config data: %v" , err )
153+ }
154+ vendorToDeviceMap := make (map [string ]vendorData , len (vendorsModelsConfig ))
155+
156+ for _ , vendorModelsConfig := range vendorsModelsConfig {
157+ if _ , ok := vendorToDeviceMap [vendorModelsConfig .VendorID ]; ok {
158+ return nil , fmt .Errorf ("mapping file contains duplicate of vendor id %s" , vendorModelsConfig .VendorID )
159+ }
160+ devicesIDs := make (map [string ]string , len (vendorModelsConfig .Models ))
161+ for _ , model := range vendorModelsConfig .Models {
162+ if _ , ok := devicesIDs [model .PciID ]; ok {
163+ return nil , fmt .Errorf ("mapping file contains duplicate of device id %s for vendor id %s" , model .PciID , vendorModelsConfig .VendorID )
164+ }
165+ devicesIDs [model .PciID ] = model .ModelName
166+ }
167+ vendorToDeviceMap [vendorModelsConfig .VendorID ] = vendorData {
168+ vendorName : vendorModelsConfig .VendorName ,
169+ devicesIDs : devicesIDs ,
170+ }
171+
172+ }
173+
174+ return vendorToDeviceMap , nil
175+ }
0 commit comments