From f1871c9fe4d53548523a8fbd75116c9937bfd992 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 27 Jul 2023 14:54:07 -0700 Subject: [PATCH] Mitigate flakey IsPublic container image calls and include drs image when needed (#311) --- src/TesApi.Tests/BatchSchedulerTests.cs | 2 +- src/TesApi.Web/BatchScheduler.BatchPools.cs | 12 +-- src/TesApi.Web/BatchScheduler.cs | 91 ++++++++++++--------- 3 files changed, 61 insertions(+), 44 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 13d8bc1cd..8f4098399 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -634,7 +634,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() Assert.IsNull(poolInformation.AutoPoolSpecification); Assert.IsNotNull(poolInformation.PoolId); - Assert.AreEqual("TES-hostname-edicated1-gmnsliori642muklue6izysbjl4qoypj-", poolInformation.PoolId[0..^8]); + Assert.AreEqual("TES-hostname-edicated1-6aczoqjox53tytv3h7hxwrp5t5ne4yzs-", poolInformation.PoolId[0..^8]); Assert.AreEqual("VmSizeDedicated1", pool.VmSize); Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(poolInformation.PoolId, out _)); Assert.AreEqual(1, pool.DeploymentConfiguration.VirtualMachineConfiguration.ContainerConfiguration.ContainerRegistries.Count); diff --git a/src/TesApi.Web/BatchScheduler.BatchPools.cs b/src/TesApi.Web/BatchScheduler.BatchPools.cs index b2697b319..53e14946d 100644 --- a/src/TesApi.Web/BatchScheduler.BatchPools.cs +++ b/src/TesApi.Web/BatchScheduler.BatchPools.cs @@ -27,25 +27,25 @@ public partial class BatchScheduler internal delegate ValueTask ModelPoolFactory(string poolId, CancellationToken cancellationToken); - private async Task<(string PoolKey, string DisplayName)> GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, CancellationToken cancellationToken) + private (string PoolKey, string DisplayName) GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, ContainerConfiguration containerConfiguration, CancellationToken cancellationToken) { var identityResourceId = tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true ? tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) : default; var executorImage = tesTask.Executors.First().Image; - string registryServer = null; + string containerImageNames = null; - if (!containerRegistryProvider.IsImagePublic(executorImage)) + if (containerConfiguration?.ContainerImageNames?.Any() ?? false) { - registryServer = (await containerRegistryProvider.GetContainerRegistryInfoAsync(executorImage, cancellationToken))?.RegistryServer; + containerImageNames = string.Join(';', containerConfiguration.ContainerImageNames); } var label = string.IsNullOrWhiteSpace(batchPrefix) ? "" : batchPrefix; var vmSize = virtualMachineInformation.VmSize ?? ""; var isPreemptable = virtualMachineInformation.LowPriority; - registryServer ??= ""; + containerImageNames ??= ""; identityResourceId ??= ""; // Generate hash of everything that differentiates this group of pools - var displayName = $"{label}:{vmSize}:{isPreemptable}:{registryServer}:{identityResourceId}"; + var displayName = $"{label}:{vmSize}:{isPreemptable}:{containerImageNames}:{identityResourceId}"; var hash = CommonUtilities.Base32.ConvertToBase32(SHA1.HashData(Encoding.UTF8.GetBytes(displayName))).TrimEnd('=').ToLowerInvariant(); // This becomes 32 chars // Build a PoolName that is of legal length, while exposing the most important metadata without requiring user to find DisplayName diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 874c47dff..0485c7a37 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -491,14 +491,12 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella { var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); - (poolKey, var displayName) = enableBatchAutopool ? default : await GetPoolKey(tesTask, virtualMachineInfo, cancellationToken); + var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken); + (poolKey, var displayName) = enableBatchAutopool ? default : GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken); await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: enableBatchAutopool || !IsPoolAvailable(poolKey), needCoresUtilizationQuotaCheck: enableBatchAutopool, cancellationToken: cancellationToken); var tesTaskLog = tesTask.AddTesTaskLog(); tesTaskLog.VirtualMachineInfo = virtualMachineInfo; - - // TODO?: Support for multiple executors. Cromwell has single executor per task. - var containerConfiguration = await GetContainerConfigurationIfNeededAsync(tesTask.Executors.First().Image, cancellationToken); var identities = new List(); if (!string.IsNullOrWhiteSpace(globalManagedIdentity)) @@ -522,7 +520,7 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella autoscaled: false, preemptable: virtualMachineInfo.LowPriority, nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerConfiguration, + containerConfiguration: containerMetadata.ContainerConfiguration, cancellationToken: cancellationToken), tesTaskId: tesTask.Id, jobId: jobOrTaskId, @@ -543,7 +541,7 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella autoscaled: true, preemptable: virtualMachineInfo.LowPriority, nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerConfiguration, + containerConfiguration: containerMetadata.ContainerConfiguration, cancellationToken: ct)), cancellationToken: cancellationToken) ).Pool; @@ -551,7 +549,7 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella } tesTask.PoolId = poolInformation.PoolId; - var cloudTask = await ConvertTesTaskToBatchTaskAsync(enableBatchAutopool ? tesTask.Id : jobOrTaskId, tesTask, containerConfiguration is not null, cancellationToken); + var cloudTask = await ConvertTesTaskToBatchTaskAsync(enableBatchAutopool ? tesTask.Id : jobOrTaskId, tesTask, containerMetadata.IsPublic, cancellationToken); logger.LogInformation($"Creating batch task for TES task {tesTask.Id}. Using VM size {virtualMachineInfo.VmSize}."); if (enableBatchAutopool) @@ -945,11 +943,12 @@ private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, CombinedBa /// /// The Batch Task Id /// The - /// Indicates that must be set. + /// Indicates which container images are public. /// A for controlling the lifetime of the asynchronous operation. /// Job preparation and main Batch tasks - private async Task ConvertTesTaskToBatchTaskAsync(string taskId, TesTask task, bool poolHasContainerConfig, CancellationToken cancellationToken) + private async Task ConvertTesTaskToBatchTaskAsync(string taskId, TesTask task, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) isPublic, CancellationToken cancellationToken) { + var poolHasContainerConfig = !(isPublic.ExecutorImage && isPublic.DockerInDockerImage && isPublic.CromwellDrsImage); var cromwellExecutionDirectoryPath = GetCromwellExecutionDirectoryPath(task); var isCromwell = cromwellExecutionDirectoryPath is not null; @@ -1034,8 +1033,6 @@ private async Task ConvertTesTaskToBatchTaskAsync(string taskId, TesT .Select(s => $"-v $AZ_BATCH_TASK_WORKING_DIR/wd/{s}:/{s}")); var workdirOption = string.IsNullOrWhiteSpace(executor.Workdir) ? string.Empty : $"--workdir {executor.Workdir} "; - var executorImageIsPublic = containerRegistryProvider.IsImagePublic(executor.Image); - var dockerInDockerImageIsPublic = containerRegistryProvider.IsImagePublic(dockerInDockerImageName); var sb = new StringBuilder(); @@ -1043,7 +1040,7 @@ private async Task ConvertTesTaskToBatchTaskAsync(string taskId, TesT sb.AppendLinuxLine($"write_ts() {{ write_kv $1 $(date -Iseconds); }} && \\"); // Function that appends key= to metrics.txt file sb.AppendLinuxLine($"mkdir -p $AZ_BATCH_TASK_WORKING_DIR/wd && \\"); - if (dockerInDockerImageIsPublic) + if (isPublic.DockerInDockerImage) { sb.AppendLinuxLine($"(grep -q alpine /etc/os-release && apk add bash || :) && \\"); // Install bash if running on alpine (will be the case if running inside "docker" image) } @@ -1057,7 +1054,7 @@ private async Task ConvertTesTaskToBatchTaskAsync(string taskId, TesT sb.AppendLinuxLine($"write_ts CromwellDrsLocalizerPullEnd && \\"); } - if (executorImageIsPublic) + if (isPublic.ExecutorImage) { // Private executor images are pulled via pool ContainerConfiguration sb.AppendLinuxLine($"write_ts ExecutorPullStart && docker pull --quiet {executor.Image} && write_ts ExecutorPullEnd && \\"); @@ -1301,52 +1298,72 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release /// /// Constructs an Azure Batch Container Configuration instance /// - /// The image name for the current + /// The to schedule on Azure Batch /// A for controlling the lifetime of the asynchronous operation. /// - private async ValueTask GetContainerConfigurationIfNeededAsync(string executorImage, CancellationToken cancellationToken) + private async ValueTask<(ContainerConfiguration ContainerConfiguration, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) IsPublic)> GetContainerConfigurationIfNeededAsync(TesTask tesTask, CancellationToken cancellationToken) { - if (containerRegistryProvider.IsImagePublic(executorImage)) - { - return default; - } + var drsImageNeeded = tesTask.Inputs?.Any(i => i?.Url?.StartsWith("drs://") ?? false) ?? false; + // TODO: Support for multiple executors. Cromwell has single executor per task. + var executorImage = tesTask.Executors.First().Image; + + var dockerInDockerIsPublic = true; + var executorImageIsPublic = containerRegistryProvider.IsImagePublic(executorImage); + var cromwellDrsIsPublic = drsImageNeeded ? containerRegistryProvider.IsImagePublic(cromwellDrsLocalizerImageName) : true; BatchModels.ContainerConfiguration result = default; - var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(executorImage, cancellationToken); - if (containerRegistryInfo is not null) + if (!executorImageIsPublic || !cromwellDrsIsPublic) { + var neededImages = new List { executorImage, dockerInDockerImageName }; + if (drsImageNeeded) + { + neededImages.Add(cromwellDrsLocalizerImageName); + } + // Download private images at node startup, since those cannot be downloaded in the main task that runs multiple containers. // Doing this also requires that the main task runs inside a container, hence downloading the "docker" image (contains docker client) as well. - result = new BatchModels.ContainerConfiguration + result = new BatchModels.ContainerConfiguration { ContainerImageNames = neededImages, ContainerRegistries = new List() }; + + if (!executorImageIsPublic) { - ContainerImageNames = new List { executorImage, dockerInDockerImageName }, - ContainerRegistries = new List + var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(executorImage, cancellationToken); + if (containerRegistryInfo is not null) { - new( + result.ContainerRegistries.Add(new( userName: containerRegistryInfo.Username, registryServer: containerRegistryInfo.RegistryServer, - password: containerRegistryInfo.Password) + password: containerRegistryInfo.Password)); } - }; - - ContainerRegistryInfo containerRegistryInfoForDockerInDocker = null; + } - if (!containerRegistryProvider.IsImagePublic(dockerInDockerImageName)) + if (!cromwellDrsIsPublic) { - containerRegistryInfoForDockerInDocker = await containerRegistryProvider.GetContainerRegistryInfoAsync(dockerInDockerImageName, cancellationToken); + var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(cromwellDrsLocalizerImageName, cancellationToken); + if (containerRegistryInfo is not null && !result.ContainerRegistries.Any(registry => registry.RegistryServer == containerRegistryInfo.RegistryServer)) + { + result.ContainerRegistries.Add(new( + userName: containerRegistryInfo.Username, + registryServer: containerRegistryInfo.RegistryServer, + password: containerRegistryInfo.Password)); + } + } - if (containerRegistryInfoForDockerInDocker is not null && containerRegistryInfoForDockerInDocker.RegistryServer != containerRegistryInfo.RegistryServer) + if (result.ContainerRegistries.Count != 0) + { + var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(dockerInDockerImageName, cancellationToken); + dockerInDockerIsPublic = containerRegistryInfo is null; + if (containerRegistryInfo is not null && !result.ContainerRegistries.Any(registry => registry.RegistryServer == containerRegistryInfo.RegistryServer)) { result.ContainerRegistries.Add(new( - userName: containerRegistryInfoForDockerInDocker.Username, - registryServer: containerRegistryInfoForDockerInDocker.RegistryServer, - password: containerRegistryInfoForDockerInDocker.Password)); + userName: containerRegistryInfo.Username, + registryServer: containerRegistryInfo.RegistryServer, + password: containerRegistryInfo.Password)); } } } - return result is null ? default : new() + return result is null || result.ContainerRegistries.Count == 0 ? (default, (true, true, true)) : (new() { ContainerImageNames = result.ContainerImageNames, ContainerRegistries = result @@ -1357,7 +1374,7 @@ private async ValueTask GetContainerConfigurationIfNeede registryServer: r.RegistryServer, identityReference: r.IdentityReference is null ? null : new() { ResourceId = r.IdentityReference.ResourceId })) .ToList() - }; + }, (executorImageIsPublic, dockerInDockerIsPublic, cromwellDrsIsPublic)); } ///