Skip to content

Commit 57a4e99

Browse files
Merge pull request openshift#155 from yevgeny-shnaidman/yevgeny/add-configmap-support
MGMT-19498: Add configmap support to the accelerator's collector
2 parents 1844bcf + 0e595e2 commit 57a4e99

6 files changed

Lines changed: 162 additions & 100 deletions

collector/accelerators.go

Lines changed: 67 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,22 @@ import (
1919
"path/filepath"
2020
"strings"
2121

22+
"github.com/alecthomas/kingpin/v2"
2223
"github.com/go-kit/log"
2324
"github.com/go-kit/log/level"
2425
"github.com/prometheus/client_golang/prometheus"
26+
"gopkg.in/yaml.v2"
2527
)
2628

29+
// vendorModels is one entry of the accelerators mapping file: a single vendor
// together with the device models listed for it. It is the YAML decoding target
// used by prepareVendorModelData.
type vendorModels struct {
30+
VendorName string `yaml:"vendorName"` // display name, stored as vendorData.vendorName
31+
VendorID string `yaml:"vendorID"` // vendor ID; becomes the key of the vendor lookup table
32+
Models []struct {
33+
PciID string `yaml:"pciID"` // device ID; becomes the key of the per-vendor device table
34+
ModelName string `yaml:"modelName"` // model description stored under PciID
35+
} `yaml:"models"`
36+
}
37+
2738
type cardData struct {
2839
vendor string
2940
model string
@@ -36,107 +47,37 @@ type vendorData struct {
3647
}
3748

3849
// acceleratorsCollector holds the state of the accelerator collector: where the
// PCI devices are listed, the logger, and the vendor/device lookup table.
type acceleratorsCollector struct {
39-
pciDevicesPath string
40-
logger log.Logger
41-
}
42-
43-
func init() {
44-
registerCollector("accelerator", defaultEnabled, NewAcceleratorCollector)
45-
}
46-
47-
// NewAcceleratorCollector returns a new Collector exposing accelerator cards count.
48-
func NewAcceleratorCollector(logger log.Logger) (Collector, error) {
49-
return &acceleratorsCollector{
50-
pciDevicesPath: filepath.Join(*sysPath, "bus/pci/devices"),
51-
logger: logger,
52-
}, nil
50+
pciDevicesPath string // directory of PCI devices; NewAcceleratorCollector sets it to <sysPath>/bus/pci/devices
51+
logger log.Logger // logger passed to NewAcceleratorCollector
52+
vendorToDeviceMap map[string]vendorData // keyed by vendor ID; built by prepareVendorModelData
5353
}
5454

5555
var (
56+
// mappingFile is the --collector.accelerators.mapping-file flag: the path of
// the YAML file that NewAcceleratorCollector passes to prepareVendorModelData.
mappingFile = kingpin.Flag("collector.accelerators.mapping-file", "Path to the mapped accelerators data config.").Default(
57+
"/var/node_exporter/accelerators_collector_config/config.yaml").String()
5658
// acceleratorCardsDesc describes the <namespace>_accelerator_card_info metric,
// labelled with the card's vendor, model and id.
acceleratorCardsDesc = prometheus.NewDesc(
5759
prometheus.BuildFQName(namespace, "accelerator", "card_info"),
5860
"Accelerator card info including vendor, model and pci id (address)",
5961
[]string{"vendor", "model", "id"}, nil,
6062
)
63+
)
6164

62-
nvidiaDeviceIDsMap = map[string]string{
63-
"0x20f5": "NVIDIA A800 PCIe 80GB",
64-
"0x20f6": "NVIDIA A800 40GB PCIe active cooled",
65-
"0x20fd": "NVIDIA AX800",
66-
"0x20f1": "NVIDIA A100 PCIe 40GB",
67-
"0x20b5": "NVIDIA A100 PCIe 80GB",
68-
"0x2235": "NVIDIA A40",
69-
"0x20b7": "NVIDIA A30",
70-
"0x2236": "NVIDIA A10",
71-
"0x25b6": "NVIDIA A16",
72-
"0x2322": "H800 NVL",
73-
"0x2321": "NVIDIA H100 NVL",
74-
"0x2331": "NVIDIA H100 PCIe 80GB",
75-
"0x26b5": "NVIDIA L40",
76-
"0x26b9": "NVIDIA L40S",
77-
"0x26bA": "NVIDIA L20 liquid cooled",
78-
"0x27b8": "NVIDIA L4",
79-
"0x27b6": "NVIDIA L2",
80-
"0x26b1": "NVIDIA RTX 6000 Ada",
81-
"0x26b3": "NVIDIA RTX 5880 Ada",
82-
"0x2231": "NVIDIA RTX 5000 Ada",
83-
"0x2230": "NVIDIA RTX A6000",
84-
"0x2233": "NVIDIA RTX A5500",
85-
"0x1e30": "NVIDIA RTX 8000 passive",
86-
"0x2531": "NVIDIA RTX A2000",
87-
"0x20b0": "NVIDIA A100 SXM4 40G",
88-
"0x233a": "NVIDIA H800 NVL",
89-
"0x233b": "NVIDIA H200 NVL",
90-
"0x20b2": "NVIDIA A100SXM4 80GB",
91-
"0x20b3": "NVIDIA A100 SXM 64GB",
92-
"0x20bd": "NVIDIA A800 SXM4 40GB",
93-
"0x20f3": "NVIDIA A800 SXM4 80GB",
94-
"0x25b0": "NVIDIA RTX A1000",
95-
}
96-
97-
amdDeviceIDsMap = map[string]string{
98-
"0x740f": "AMD MI210",
99-
"0x740c": "AMD MI250",
100-
"0x7408": "AMD MI250X",
101-
"0x74a0": "AMD MI300",
102-
"0x74a1": "AMD MI300X",
103-
"0x74a5": "AMD MI325X",
104-
"0x7aa2": "AMD MI308X",
105-
"0x74b5": "AMD MI300X VF",
106-
"0x7410": "AMD MI210 VF",
107-
}
108-
109-
gaudiDeviceIDsMap = map[string]string{
110-
"0x1000": "Gaudi 1",
111-
"0x1020": "Gaudi 2",
112-
}
113-
114-
intelDeviceIDsMap = map[string]string{
115-
"0x0bd5": "Intel Data Center GPU Max 1550",
116-
"0x0bda": "Intel Data Center GPU Max 1100",
117-
"0x56c0": "Intel Data Center GPU Flex 170",
118-
"0x56c1": "Intel Data Center GPU Flex 140",
119-
}
120-
121-
qualcommDeviceIDsMap = map[string]string{
122-
"0xa100": "Qualcomm AI 100",
123-
"0xa080": "Qualcomm AI 80",
124-
}
65+
// init registers NewAcceleratorCollector as the factory of the "accelerator"
// collector, passing defaultEnabled as its default state.
func init() {
66+
registerCollector("accelerator", defaultEnabled, NewAcceleratorCollector)
67+
}
12568

126-
// vendor map, add any new vendor to this map
127-
vendorToDeviceMap = map[string]vendorData{
128-
// nvidia devices
129-
"0x10de": vendorData{"NVIDIA", nvidiaDeviceIDsMap},
130-
// amd devices
131-
"0x1002": vendorData{"AMD", amdDeviceIDsMap},
132-
// gaudi devices
133-
"0x1da3": vendorData{"GAUDI", gaudiDeviceIDsMap},
134-
// intel devices
135-
"0x8086": vendorData{"INTEL", intelDeviceIDsMap},
136-
// qualcomm devices
137-
"0x17cb": vendorData{"QUALCOMM", qualcommDeviceIDsMap},
69+
// NewAcceleratorCollector returns a new Collector exposing accelerator cards count.
// The vendor/device lookup table is loaded from the YAML file named by the
// --collector.accelerators.mapping-file flag; if that file cannot be read or
// parsed, no collector is created and the wrapped error is returned.
70+
func NewAcceleratorCollector(logger log.Logger) (Collector, error) {
71+
vendorToDeviceMap, err := prepareVendorModelData(*mappingFile)
72+
if err != nil {
73+
return nil, fmt.Errorf("failed to get the accelerator configuration: %v", err)
13874
}
139-
)
75+
return &acceleratorsCollector{
76+
pciDevicesPath: filepath.Join(*sysPath, "bus/pci/devices"),
77+
logger: logger,
78+
vendorToDeviceMap: vendorToDeviceMap,
79+
}, nil
80+
}
14081

14182
func (a *acceleratorsCollector) Update(ch chan<- prometheus.Metric) error {
14283
pciDevices, err := os.ReadDir(a.pciDevicesPath)
@@ -159,7 +100,7 @@ func (a *acceleratorsCollector) Update(ch chan<- prometheus.Metric) error {
159100

160101
level.Debug(a.logger).Log("msg", "checking pci device", "vendor", vendorID, "device", deviceID)
161102

162-
cardData, isMonitored := isMonitoredAccelerator(vendorID, deviceID, pciID)
103+
cardData, isMonitored := a.isMonitoredAccelerator(vendorID, deviceID, pciID)
163104
if !isMonitored {
164105
continue
165106
}
@@ -187,8 +128,8 @@ func (a *acceleratorsCollector) getPCIFileData(pciID, fileName string) (string,
187128
return strings.TrimSpace(string(data)), nil
188129
}
189130

190-
func isMonitoredAccelerator(vendor, device, pciID string) (cardData, bool) {
191-
vendorData, ok := vendorToDeviceMap[vendor]
131+
func (a *acceleratorsCollector) isMonitoredAccelerator(vendor, device, pciID string) (cardData, bool) {
132+
vendorData, ok := a.vendorToDeviceMap[vendor]
192133
if !ok {
193134
return cardData{}, false
194135
}
@@ -199,3 +140,36 @@ func isMonitoredAccelerator(vendor, device, pciID string) (cardData, bool) {
199140
}
200141
return cardData{vendorData.vendorName, deviceDesc, pciID}, true
201142
}
143+
144+
func prepareVendorModelData(mappingFilePath string) (map[string]vendorData, error) {
145+
yamlStr, err := os.ReadFile(mappingFilePath)
146+
if err != nil {
147+
return nil, fmt.Errorf("failed to open accelerators config file %s: %v", mappingFilePath, err)
148+
}
149+
var vendorsModelsConfig []vendorModels
150+
err = yaml.UnmarshalStrict(yamlStr, &vendorsModelsConfig)
151+
if err != nil {
152+
return nil, fmt.Errorf("failed to unmarshal accelerators config data: %v", err)
153+
}
154+
vendorToDeviceMap := make(map[string]vendorData, len(vendorsModelsConfig))
155+
156+
for _, vendorModelsConfig := range vendorsModelsConfig {
157+
if _, ok := vendorToDeviceMap[vendorModelsConfig.VendorID]; ok {
158+
return nil, fmt.Errorf("mapping file contains duplicate of vendor id %s", vendorModelsConfig.VendorID)
159+
}
160+
devicesIDs := make(map[string]string, len(vendorModelsConfig.Models))
161+
for _, model := range vendorModelsConfig.Models {
162+
if _, ok := devicesIDs[model.PciID]; ok {
163+
return nil, fmt.Errorf("mapping file contains duplicate of device id %s for vendor id %s", model.PciID, vendorModelsConfig.VendorID)
164+
}
165+
devicesIDs[model.PciID] = model.ModelName
166+
}
167+
vendorToDeviceMap[vendorModelsConfig.VendorID] = vendorData{
168+
vendorName: vendorModelsConfig.VendorName,
169+
devicesIDs: devicesIDs,
170+
}
171+
172+
}
173+
174+
return vendorToDeviceMap, nil
175+
}

collector/accelerators_test.go

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package collector
1616
import (
1717
"fmt"
1818
"os"
19+
"path/filepath"
1920
"strings"
2021
"testing"
2122

@@ -39,17 +40,23 @@ func (c testAcceleratorCollector) Describe(ch chan<- *prometheus.Desc) {
3940
func TestAccelerator(t *testing.T) {
4041
testcase := `# HELP node_accelerator_card_info Accelerator card info including vendor, model and pci id (address)
4142
# TYPE node_accelerator_card_info counter
42-
node_accelerator_card_info{id="0000:00:02.0",model="NVIDIA A100 PCIe 80GB",vendor="NVIDIA"} 1
43-
node_accelerator_card_info{id="0000:00:09.0",model="NVIDIA A100 PCIe 80GB",vendor="NVIDIA"} 1
43+
node_accelerator_card_info{id="0000:00:02.0",model="A100",vendor="NVIDIA"} 1
44+
node_accelerator_card_info{id="0000:00:09.0",model="A100",vendor="NVIDIA"} 1
45+
node_accelerator_card_info{id="0000:00:1f.5",model="RTX_4090",vendor="NVIDIA"} 1
4446
`
47+
vendorToDeviceMap, err := prepareVendorModelData("testdata/accelerators_test_data.yaml")
48+
if err != nil {
49+
t.Fatal(err)
50+
}
4551

4652
*sysPath = "fixtures/sys"
47-
4853
logger := log.NewLogfmtLogger(os.Stderr)
49-
c, err := NewAcceleratorCollector(logger)
50-
if err != nil {
51-
t.Fatal(err)
54+
c := &acceleratorsCollector{
55+
pciDevicesPath: filepath.Join(*sysPath, "bus/pci/devices"),
56+
logger: logger,
57+
vendorToDeviceMap: vendorToDeviceMap,
5258
}
59+
5360
reg := prometheus.NewRegistry()
5461
reg.MustRegister(&testAcceleratorCollector{xc: c})
5562

@@ -67,3 +74,15 @@ func TestAccelerator(t *testing.T) {
6774
t.Fatal(err)
6875
}
6976
}
77+
78+
func Test_prepareVendorModelData_badMapping(t *testing.T) {
79+
_, err := prepareVendorModelData("testdata/accelerators_test_data_duplicated_vendors.bad.yaml")
80+
if err == nil {
81+
t.Fatal(err)
82+
}
83+
84+
_, err = prepareVendorModelData("testdata/accelerators_test_data_duplicated_device_ids.bad.yaml")
85+
if err == nil {
86+
t.Fatal(err)
87+
}
88+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
- vendorName: NVIDIA
2+
vendorID: 0x10de
3+
models:
4+
- pciID: 0x20b5
5+
modelName: A100
6+
- pciID: 0x2230
7+
modelName: RTX_A6000
8+
- pciID: 0x2717
9+
modelName: RTX_4090
10+
- pciID: 0x2235
11+
modelName: A40
12+
- pciID: 0x1df5
13+
modelName: V100
14+
- pciID: 0x20f1
15+
modelName: A100 40G
16+
- vendorName: AMD
17+
vendorID: 0x1002
18+
models:
19+
- pciID: 0x740f
20+
modelName: MI210
21+
- pciID: 0x740c
22+
modelName: MI250
23+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
- vendorName: NVIDIA
2+
vendorID: 0x10de
3+
models:
4+
- pciID: 0x20b5
5+
modelName: A100
6+
- pciID: 0x2230
7+
modelName: RTX_A6000
8+
- pciID: 0x2717
9+
modelName: RTX_4090
10+
- pciID: 0x2235
11+
modelName: A40
12+
- pciID: 0x2235
13+
modelName: V100
14+
- pciID: 0x20f1
15+
modelName: A100 40G
16+
- vendorName: AMD
17+
vendorID: 0x1002
18+
models:
19+
- pciID: 0x740f
20+
modelName: MI210
21+
- pciID: 0x740c
22+
modelName: MI250
23+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
- vendorName: NVIDIA
2+
vendorID: 0x10de
3+
models:
4+
- pciID: 0x20b5
5+
modelName: A100
6+
- pciID: 0x2230
7+
modelName: RTX_A6000
8+
- pciID: 0x2717
9+
modelName: RTX_4090
10+
- pciID: 0x2235
11+
modelName: A40
12+
- pciID: 0x1df5
13+
modelName: V100
14+
- pciID: 0x20f1
15+
modelName: A100 40G
16+
- vendorName: AMD
17+
vendorID: 0x10de
18+
models:
19+
- pciID: 0x740f
20+
modelName: MI210
21+
- pciID: 0x740c
22+
modelName: MI250
23+

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ require (
3030
github.com/safchain/ethtool v0.3.0
3131
golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f
3232
golang.org/x/sys v0.19.0
33+
gopkg.in/yaml.v2 v2.4.0
3334
howett.net/plist v1.0.1
3435
)
3536

@@ -57,5 +58,4 @@ require (
5758
golang.org/x/text v0.14.0 // indirect
5859
google.golang.org/appengine v1.6.7 // indirect
5960
google.golang.org/protobuf v1.33.0 // indirect
60-
gopkg.in/yaml.v2 v2.4.0 // indirect
6161
)

0 commit comments

Comments
 (0)