Skip to content

Commit 2d869c1

Browse files
AlexW-FAlex Williams-FerreiraCopilot
authored
Fix AMD GPU driver installation failure on Ubuntu 22.04 (jammy) (#652)
* Added check to see if the bashrc path exists * Add OS version-aware AMD GPU driver installation Detect Ubuntu codename from /etc/os-release and resolve the correct ROCm installation URL from a built-in mapping (focal->5.5, jammy->6.3.3). This fixes BabelStream failures on Ubuntu 22.04 where ROCm 5.5's amdgpu-dkms module fails to build on kernel 6.8.x, leaving dpkg broken and crashing VirtualClient repeatedly. Changes: - Add SupportedInstallationFiles codename-to-URL mapping - Add DetectOsVersionCodenameAsync and ResolveLinuxInstallationFile - Add dpkg cleanup commands for broken amdgpu-dkms package state - Fix bashrc path for root users via GetUserHomePath - Add unit tests for codename resolution, fallback, and error cases Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Fix bash quoting in dpkg cleanup commands .NET ProcessStartInfo on Linux does not handle single quotes for argument grouping - only double quotes work. The previous bash -c commands with single quotes caused argv to be split incorrectly, resulting in 'unexpected EOF' errors. Also removed 2>/dev/null redirects since stderr is captured by the process proxy anyway. Verified on Ubuntu 22.04 VM: dpkg cleanup runs successfully, codename detection resolves to jammy, and ROCm 6.3.3 driver installation begins correctly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Alex Williams-Ferreira <alexwill@microsoft.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent e7a52a0 commit 2d869c1

2 files changed

Lines changed: 139 additions & 5 deletions

File tree

src/VirtualClient/VirtualClient.Dependencies.UnitTests/AMDGPUDriverInstallationTests.cs

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,21 +47,57 @@ public void TearDown()
4747
}
4848

4949
[Test]
50-
public void AMDGPUDriverInstallationDependencyThrowsIfLinuxInstallationFileIsEmpty()
50+
public void AMDGPUDriverInstallationDependencyThrowsIfNoInstallFileForUnsupportedCodename()
5151
{
52-
this.SetupDefaultMockBehavior(PlatformID.Unix, string.Empty, string.Empty);
52+
this.SetupDefaultMockBehavior(PlatformID.Unix, string.Empty, string.Empty, "unsupported_codename");
5353

5454
DependencyException exc = Assert.ThrowsAsync<DependencyException>(() => this.component.ExecuteAsync(CancellationToken.None));
5555
Assert.AreEqual(ErrorReason.DependencyNotFound, exc.Reason);
5656
}
5757

58+
[Test]
public async Task AMDGPUDriverInstallationResolvesInstallFileFromMappingForJammy()
{
    // The download command expected when the built-in jammy mapping is used.
    string expectedDownloadCommand =
        "wget https://repo.radeon.com/amdgpu-install/6.3.3/ubuntu/jammy/amdgpu-install_6.3.60303-1_all.deb";

    // An empty LinuxInstallationFile means the component has to pick the URL itself
    // from the codename reported by the mocked /etc/os-release.
    this.SetupDefaultMockBehavior(PlatformID.Unix, linuxInstallationFile: string.Empty, osVersionCodename: "jammy");

    await this.component.ExecuteAsync(CancellationToken.None);

    bool downloadIssued = this.fixture.ProcessManager.CommandsExecuted(expectedDownloadCommand);
    Assert.IsTrue(downloadIssued);
}
68+
69+
[Test]
public async Task AMDGPUDriverInstallationResolvesInstallFileFromMappingForFocal()
{
    // The download command expected when the built-in focal mapping is used.
    string expectedDownloadCommand =
        "wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/focal/amdgpu-install_5.5.50500-1_all.deb";

    // An empty LinuxInstallationFile means the component has to pick the URL itself
    // from the codename reported by the mocked /etc/os-release.
    this.SetupDefaultMockBehavior(PlatformID.Unix, linuxInstallationFile: string.Empty, osVersionCodename: "focal");

    await this.component.ExecuteAsync(CancellationToken.None);

    bool downloadIssued = this.fixture.ProcessManager.CommandsExecuted(expectedDownloadCommand);
    Assert.IsTrue(downloadIssued);
}
79+
80+
[Test]
public async Task AMDGPUDriverInstallationUsesProfileInstallFileWhenProvided()
{
    // The profile supplies a focal URL while the mocked OS reports jammy; the
    // explicit profile value is expected to win over the codename mapping.
    string customUrl = "https://repo.radeon.com/amdgpu-install/5.5/ubuntu/focal/amdgpu-install_5.5.50500-1_all.deb";
    string expectedDownloadCommand = $"wget {customUrl}";

    this.SetupDefaultMockBehavior(PlatformID.Unix, linuxInstallationFile: customUrl, osVersionCodename: "jammy");

    await this.component.ExecuteAsync(CancellationToken.None);

    bool downloadIssued = this.fixture.ProcessManager.CommandsExecuted(expectedDownloadCommand);
    Assert.IsTrue(downloadIssued);
}
91+
5892
[Test]
5993
public async Task AMDGPUDriverInstallationDependencyStartsCorrectProcessesOnExecuteForLinux()
6094
{
6195
this.SetupDefaultMockBehavior(PlatformID.Unix);
6296

6397
List<string> commands = new List<string>
6498
{
99+
"sudo bash -c \"dpkg --remove --force-remove-reinstreq amdgpu-dkms || true\"",
100+
"sudo bash -c \"dpkg --configure -a || true\"",
65101
"apt-get -yq update",
66102
"sudo apt-get install -yq libpci3 libpci-dev doxygen unzip cmake git",
67103
"sudo apt-get install -yq libnuma-dev libncurses5",
@@ -111,7 +147,7 @@ public async Task AMDGPUDriverInstallationDependencyDoesNotInstallAMDGPUDriverIf
111147
Assert.IsFalse(this.fixture.ProcessManager.CommandsExecuted($"{installScriptPath} -INSTALL -OUTPUT screen"));
112148
}
113149

114-
private void SetupDefaultMockBehavior(PlatformID platformID, string gpuModel = "v620", string linuxInstallationFile = "https://repo.radeon.com/amdgpu-install/5.5/ubuntu/focal/amdgpu-install_5.5.50500-1_all.deb")
150+
private void SetupDefaultMockBehavior(PlatformID platformID, string gpuModel = "v620", string linuxInstallationFile = "https://repo.radeon.com/amdgpu-install/5.5/ubuntu/focal/amdgpu-install_5.5.50500-1_all.deb", string osVersionCodename = "focal")
115151
{
116152
this.fixture.Setup(platformID);
117153
this.mockPackage = new DependencyPath("amddriverpackage", this.fixture.GetPackagePath("amddriverpackage"));
@@ -141,6 +177,17 @@ private void SetupDefaultMockBehavior(PlatformID platformID, string gpuModel = "
141177

142178
this.fixture.File.Setup(file => file.Exists(It.IsAny<string>())).Returns(true);
143179

180+
// Mock /etc/os-release so the codename can be detected during InitializeAsync
181+
if (platformID == PlatformID.Unix)
182+
{
183+
string osReleaseContent = string.IsNullOrEmpty(osVersionCodename)
184+
? "NAME=\"Ubuntu\"\nVERSION=\"22.04 LTS\"\nID=ubuntu\nPRETTY_NAME=\"Ubuntu 22.04 LTS\""
185+
: $"NAME=\"Ubuntu\"\nVERSION_CODENAME={osVersionCodename}\nID=ubuntu\nPRETTY_NAME=\"Ubuntu LTS\"";
186+
187+
this.fixture.File.Setup(file => file.ReadAllTextAsync("/etc/os-release", It.IsAny<CancellationToken>()))
188+
.ReturnsAsync(osReleaseContent);
189+
}
190+
144191
this.fixture.SystemManagement.SetupGet(mgr => mgr.ProcessManager).Returns(this.mockProcessManager.Object);
145192

146193
this.fixture.ApiClient.Setup(client => client.GetStateAsync(It.IsAny<string>(), It.IsAny<CancellationToken>(), It.IsAny<IAsyncPolicy<HttpResponseMessage>>()))

src/VirtualClient/VirtualClient.Dependencies/AMDGPUDriverInstallation.cs

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ namespace VirtualClient.Dependencies
99
using System.IO;
1010
using System.IO.Abstractions;
1111
using System.Linq;
12+
using System.Text.RegularExpressions;
1213
using System.Threading;
1314
using System.Threading.Tasks;
1415
using Microsoft.CodeAnalysis;
@@ -26,11 +27,21 @@ public class AMDGPUDriverInstallation : VirtualClientComponent
2627
{
2728
private const string Mi25ExeName = "AMD-mi25.exe";
2829
private const string V620ExeName = "Setup.exe";
30+
31+
// Default ROCm installer package URLs keyed by Ubuntu release codename. Lookups are
// case-insensitive. An entry is consulted only when the profile does not supply its
// own LinuxInstallationFile value.
private static readonly Dictionary<string, string> SupportedInstallationFiles = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
{
    ["focal"] = "https://repo.radeon.com/amdgpu-install/5.5/ubuntu/focal/amdgpu-install_5.5.50500-1_all.deb",
    ["jammy"] = "https://repo.radeon.com/amdgpu-install/6.3.3/ubuntu/jammy/amdgpu-install_6.3.60303-1_all.deb"
};
38+
2939
private IPackageManager packageManager;
3040
private IFileSystem fileSystem;
3141
private ISystemManagement systemManager;
3242
private IStateManager stateManager;
3343
private LinuxDistributionInfo linuxDistributionInfo;
44+
private string osVersionCodename;
3445

3546
/// <summary>
3647
/// Initializes a new instance of the <see cref="AMDGPUDriverInstallation"/> class.
@@ -185,12 +196,17 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
185196
{
186197
this.linuxDistributionInfo = await this.systemManager.GetLinuxDistributionAsync(cancellationToken)
187198
.ConfigureAwait(false);
199+
200+
this.osVersionCodename = await this.DetectOsVersionCodenameAsync(cancellationToken)
201+
.ConfigureAwait(false);
188202
}
189203
}
190204

191205
[SuppressMessage("StyleCop.CSharp.ReadabilityRules", "SA1118:Parameter should not span multiple lines", Justification = "Readability")]
192206
private async Task InstallAMDGPUDriverLinux(EventContext telemetryContext, CancellationToken cancellationToken)
193207
{
208+
this.ResolveLinuxInstallationFile();
209+
194210
if (string.IsNullOrWhiteSpace(this.LinuxInstallationFile))
195211
{
196212
throw new DependencyException($"The linux installation file can not be null or empty and it is: {this.LinuxInstallationFile}", ErrorReason.DependencyNotFound);
@@ -199,7 +215,10 @@ private async Task InstallAMDGPUDriverLinux(EventContext telemetryContext, Cance
199215
// The .bashrc file is used to define commands that should be run whenever the system
200216
// is booted. For the purpose of the AMD GPU driver installation, we want to include extra
201217
// paths in the $PATH environment variable post installation.
202-
string bashRcPath = $"/home/{this.Username}/.bashrc";
218+
string userHome = this.GetUserHomePath();
219+
string bashRcPath = $"{userHome}/.bashrc";
220+
221+
this.fileSystem.Directory.CreateDirectory(Path.GetDirectoryName(bashRcPath) !);
203222

204223
this.fileSystem.Directory.CreateDirectory(Path.GetDirectoryName(bashRcPath) !);
205224

@@ -255,6 +274,11 @@ private List<string> PrerequisiteCommands()
255274
switch (this.linuxDistributionInfo.LinuxDistribution)
256275
{
257276
case LinuxDistribution.Ubuntu:
277+
// Clean up any broken package state from previous failed installation attempts.
278+
// The amdgpu-dkms package can be left in a half-configured state if the DKMS module
279+
// build fails, which causes all subsequent apt-get commands to fail.
280+
commands.Add("bash -c \"dpkg --remove --force-remove-reinstreq amdgpu-dkms || true\"");
281+
commands.Add("bash -c \"dpkg --configure -a || true\"");
258282
commands.Add("apt-get -yq update");
259283
commands.Add("apt-get install -yq libpci3 libpci-dev doxygen unzip cmake git");
260284
commands.Add("apt-get install -yq libnuma-dev libncurses5");
@@ -293,12 +317,14 @@ private List<string> VersionSpecificInstallationCommands()
293317

294318
private List<string> PostInstallationCommands()
295319
{
320+
string userHome = this.GetUserHomePath();
321+
296322
// last 2 command are to make sure that we are blacklisting AMD GPU drivers before rebooting
297323
return new List<string>
298324
{
299325
"amdgpu-install -y --usecase=hiplibsdk,rocm,dkms",
300326
$"bash -c \"echo 'export PATH=/opt/rocm/bin${{PATH:+:${{PATH}}}}' | " +
301-
$"sudo tee -a /home/{this.Username}/.bashrc\"",
327+
$"sudo tee -a {userHome}/.bashrc\"",
302328
$"bash -c \"echo 'blacklist amdgpu' | sudo tee -a /etc/modprobe.d/amdgpu.conf \"",
303329
"update-initramfs -u -k all"
304330
};
@@ -336,6 +362,67 @@ await this.LogProcessDetailsAsync(process, telemetryContext, "AMDGPUDriverInstal
336362
}
337363
}
338364

365+
/// <summary>
/// Reads /etc/os-release and extracts the value of its VERSION_CODENAME entry
/// (e.g., "focal", "jammy").
/// </summary>
/// <param name="cancellationToken">A token that can be used to cancel the file read.</param>
/// <returns>
/// The codename when a VERSION_CODENAME entry is present; otherwise null. Null is also
/// returned when the file cannot be read or parsed.
/// </returns>
/// <exception cref="OperationCanceledException">The read was cancelled through <paramref name="cancellationToken"/>.</exception>
private async Task<string> DetectOsVersionCodenameAsync(CancellationToken cancellationToken)
{
    try
    {
        string osReleaseContent = await this.fileSystem.File.ReadAllTextAsync("/etc/os-release", cancellationToken)
            .ConfigureAwait(false);

        // Anchor to the start of a line so that only the VERSION_CODENAME key itself matches
        // (not a longer key that merely ends with that text), and tolerate an optional opening
        // quote because os-release values may be written quoted.
        Match match = Regex.Match(osReleaseContent, @"^VERSION_CODENAME=[""']?(\w+)", RegexOptions.Multiline);
        return match.Success ? match.Groups[1].Value : null;
    }
    catch (OperationCanceledException)
    {
        // Cancellation is not a detection failure. Let it propagate so a cancelled run stops
        // instead of continuing as though the codename were simply unknown.
        throw;
    }
    catch (Exception)
    {
        // Best effort: if /etc/os-release cannot be read or parsed, return null and let
        // ResolveLinuxInstallationFile fall back to the profile-provided LinuxInstallationFile.
        return null;
    }
}
386+
387+
/// <summary>
/// Ensures <c>LinuxInstallationFile</c> holds an installer URL. A value already supplied
/// by the profile is left untouched; otherwise the URL mapped to the detected OS codename
/// in the built-in table is assigned.
/// </summary>
/// <exception cref="DependencyException">
/// No value was supplied and the detected codename has no entry in the built-in table.
/// </exception>
private void ResolveLinuxInstallationFile()
{
    // An explicit profile value always takes precedence over the built-in mapping.
    bool providedByProfile = !string.IsNullOrWhiteSpace(this.LinuxInstallationFile);
    if (providedByProfile)
    {
        return;
    }

    // Fail when the codename is missing or is not one of the mapped releases.
    if (string.IsNullOrWhiteSpace(this.osVersionCodename)
        || !AMDGPUDriverInstallation.SupportedInstallationFiles.TryGetValue(this.osVersionCodename, out string mappedUrl))
    {
        string supportedCodenames = string.Join(", ", AMDGPUDriverInstallation.SupportedInstallationFiles.Keys);

        throw new DependencyException(
            $"No AMD GPU driver installation file is available for the detected OS codename '{this.osVersionCodename}'. " +
            $"Supported codenames: {supportedCodenames}. " +
            $"You can provide an explicit URL via the 'LinuxInstallationFile' profile parameter.",
            ErrorReason.DependencyNotFound);
    }

    this.LinuxInstallationFile = mappedUrl;
}
415+
/// <summary>
/// Builds the home directory path for the current <c>Username</c>: "/root" when the
/// name equals "root" (compared case-insensitively), otherwise "/home/{username}".
/// </summary>
private string GetUserHomePath()
{
    string user = this.Username;
    bool isRoot = string.Equals(user, "root", StringComparison.OrdinalIgnoreCase);

    return isRoot ? "/root" : $"/home/{user}";
}
425+
339426
private async Task InstallAMDGPUDriverWindows(EventContext telemetryContext, CancellationToken cancellationToken)
340427
{
341428
string installerPath = string.Empty;

0 commit comments

Comments
 (0)