Skip to content

Commit

Permalink
Mitigate flakey IsPublic container image calls and include drs image …
Browse files Browse the repository at this point in the history
…when needed (#311)
  • Loading branch information
BMurri authored Jul 27, 2023
1 parent c066c85 commit f1871c9
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 44 deletions.
2 changes: 1 addition & 1 deletion src/TesApi.Tests/BatchSchedulerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation()

Assert.IsNull(poolInformation.AutoPoolSpecification);
Assert.IsNotNull(poolInformation.PoolId);
Assert.AreEqual("TES-hostname-edicated1-gmnsliori642muklue6izysbjl4qoypj-", poolInformation.PoolId[0..^8]);
Assert.AreEqual("TES-hostname-edicated1-6aczoqjox53tytv3h7hxwrp5t5ne4yzs-", poolInformation.PoolId[0..^8]);
Assert.AreEqual("VmSizeDedicated1", pool.VmSize);
Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(poolInformation.PoolId, out _));
Assert.AreEqual(1, pool.DeploymentConfiguration.VirtualMachineConfiguration.ContainerConfiguration.ContainerRegistries.Count);
Expand Down
12 changes: 6 additions & 6 deletions src/TesApi.Web/BatchScheduler.BatchPools.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,25 @@ public partial class BatchScheduler

internal delegate ValueTask<BatchModels.Pool> ModelPoolFactory(string poolId, CancellationToken cancellationToken);

private async Task<(string PoolKey, string DisplayName)> GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, CancellationToken cancellationToken)
private (string PoolKey, string DisplayName) GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, ContainerConfiguration containerConfiguration, CancellationToken cancellationToken)
{
var identityResourceId = tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true ? tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) : default;
var executorImage = tesTask.Executors.First().Image;
string registryServer = null;
string containerImageNames = null;

if (!containerRegistryProvider.IsImagePublic(executorImage))
if (containerConfiguration?.ContainerImageNames?.Any() ?? false)
{
registryServer = (await containerRegistryProvider.GetContainerRegistryInfoAsync(executorImage, cancellationToken))?.RegistryServer;
containerImageNames = string.Join(';', containerConfiguration.ContainerImageNames);
}

var label = string.IsNullOrWhiteSpace(batchPrefix) ? "<none>" : batchPrefix;
var vmSize = virtualMachineInformation.VmSize ?? "<none>";
var isPreemptable = virtualMachineInformation.LowPriority;
registryServer ??= "<none>";
containerImageNames ??= "<none>";
identityResourceId ??= "<none>";

// Generate hash of everything that differentiates this group of pools
var displayName = $"{label}:{vmSize}:{isPreemptable}:{registryServer}:{identityResourceId}";
var displayName = $"{label}:{vmSize}:{isPreemptable}:{containerImageNames}:{identityResourceId}";
var hash = CommonUtilities.Base32.ConvertToBase32(SHA1.HashData(Encoding.UTF8.GetBytes(displayName))).TrimEnd('=').ToLowerInvariant(); // This becomes 32 chars

// Build a PoolName that is of legal length, while exposing the most important metadata without requiring user to find DisplayName
Expand Down
91 changes: 54 additions & 37 deletions src/TesApi.Web/BatchScheduler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -491,14 +491,12 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella
{
var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken);

(poolKey, var displayName) = enableBatchAutopool ? default : await GetPoolKey(tesTask, virtualMachineInfo, cancellationToken);
var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken);
(poolKey, var displayName) = enableBatchAutopool ? default : GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken);
await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: enableBatchAutopool || !IsPoolAvailable(poolKey), needCoresUtilizationQuotaCheck: enableBatchAutopool, cancellationToken: cancellationToken);

var tesTaskLog = tesTask.AddTesTaskLog();
tesTaskLog.VirtualMachineInfo = virtualMachineInfo;

// TODO?: Support for multiple executors. Cromwell has single executor per task.
var containerConfiguration = await GetContainerConfigurationIfNeededAsync(tesTask.Executors.First().Image, cancellationToken);
var identities = new List<string>();

if (!string.IsNullOrWhiteSpace(globalManagedIdentity))
Expand All @@ -522,7 +520,7 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella
autoscaled: false,
preemptable: virtualMachineInfo.LowPriority,
nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo,
containerConfiguration: containerConfiguration,
containerConfiguration: containerMetadata.ContainerConfiguration,
cancellationToken: cancellationToken),
tesTaskId: tesTask.Id,
jobId: jobOrTaskId,
Expand All @@ -543,15 +541,15 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella
autoscaled: true,
preemptable: virtualMachineInfo.LowPriority,
nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo,
containerConfiguration: containerConfiguration,
containerConfiguration: containerMetadata.ContainerConfiguration,
cancellationToken: ct)),
cancellationToken: cancellationToken)
).Pool;
jobOrTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}";
}

tesTask.PoolId = poolInformation.PoolId;
var cloudTask = await ConvertTesTaskToBatchTaskAsync(enableBatchAutopool ? tesTask.Id : jobOrTaskId, tesTask, containerConfiguration is not null, cancellationToken);
var cloudTask = await ConvertTesTaskToBatchTaskAsync(enableBatchAutopool ? tesTask.Id : jobOrTaskId, tesTask, containerMetadata.IsPublic, cancellationToken);
logger.LogInformation($"Creating batch task for TES task {tesTask.Id}. Using VM size {virtualMachineInfo.VmSize}.");

if (enableBatchAutopool)
Expand Down Expand Up @@ -945,11 +943,12 @@ private ValueTask<bool> HandleTesTaskTransitionAsync(TesTask tesTask, CombinedBa
/// </summary>
/// <param name="taskId">The Batch Task Id</param>
/// <param name="task">The <see cref="TesTask"/></param>
/// <param name="poolHasContainerConfig">Indicates that <see cref="CloudTask.ContainerSettings"/> must be set.</param>
/// <param name="isPublic">Indicates which container images are public.</param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> for controlling the lifetime of the asynchronous operation.</param>
/// <returns>Job preparation and main Batch tasks</returns>
private async Task<CloudTask> ConvertTesTaskToBatchTaskAsync(string taskId, TesTask task, bool poolHasContainerConfig, CancellationToken cancellationToken)
private async Task<CloudTask> ConvertTesTaskToBatchTaskAsync(string taskId, TesTask task, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) isPublic, CancellationToken cancellationToken)
{
var poolHasContainerConfig = !(isPublic.ExecutorImage && isPublic.DockerInDockerImage && isPublic.CromwellDrsImage);
var cromwellExecutionDirectoryPath = GetCromwellExecutionDirectoryPath(task);
var isCromwell = cromwellExecutionDirectoryPath is not null;

Expand Down Expand Up @@ -1034,16 +1033,14 @@ private async Task<CloudTask> ConvertTesTaskToBatchTaskAsync(string taskId, TesT
.Select(s => $"-v $AZ_BATCH_TASK_WORKING_DIR/wd/{s}:/{s}"));

var workdirOption = string.IsNullOrWhiteSpace(executor.Workdir) ? string.Empty : $"--workdir {executor.Workdir} ";
var executorImageIsPublic = containerRegistryProvider.IsImagePublic(executor.Image);
var dockerInDockerImageIsPublic = containerRegistryProvider.IsImagePublic(dockerInDockerImageName);

var sb = new StringBuilder();

sb.AppendLinuxLine($"write_kv() {{ echo \"$1=$2\" >> $AZ_BATCH_TASK_WORKING_DIR/metrics.txt; }} && \\"); // Function that appends key=value pair to metrics.txt file
sb.AppendLinuxLine($"write_ts() {{ write_kv $1 $(date -Iseconds); }} && \\"); // Function that appends key=<current datetime> to metrics.txt file
sb.AppendLinuxLine($"mkdir -p $AZ_BATCH_TASK_WORKING_DIR/wd && \\");

if (dockerInDockerImageIsPublic)
if (isPublic.DockerInDockerImage)
{
sb.AppendLinuxLine($"(grep -q alpine /etc/os-release && apk add bash || :) && \\"); // Install bash if running on alpine (will be the case if running inside "docker" image)
}
Expand All @@ -1057,7 +1054,7 @@ private async Task<CloudTask> ConvertTesTaskToBatchTaskAsync(string taskId, TesT
sb.AppendLinuxLine($"write_ts CromwellDrsLocalizerPullEnd && \\");
}

if (executorImageIsPublic)
if (isPublic.ExecutorImage)
{
// Private executor images are pulled via pool ContainerConfiguration
sb.AppendLinuxLine($"write_ts ExecutorPullStart && docker pull --quiet {executor.Image} && write_ts ExecutorPullEnd && \\");
Expand Down Expand Up @@ -1301,52 +1298,72 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release
/// <summary>
/// Constructs an Azure Batch Container Configuration instance
/// </summary>
/// <param name="executorImage">The image name for the current <see cref="TesTask"/></param>
/// <param name="tesTask">The <see cref="TesTask"/> to schedule on Azure Batch</param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> for controlling the lifetime of the asynchronous operation.</param>
/// <returns></returns>
private async ValueTask<ContainerConfiguration> GetContainerConfigurationIfNeededAsync(string executorImage, CancellationToken cancellationToken)
private async ValueTask<(ContainerConfiguration ContainerConfiguration, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) IsPublic)> GetContainerConfigurationIfNeededAsync(TesTask tesTask, CancellationToken cancellationToken)
{
if (containerRegistryProvider.IsImagePublic(executorImage))
{
return default;
}
var drsImageNeeded = tesTask.Inputs?.Any(i => i?.Url?.StartsWith("drs://") ?? false) ?? false;
// TODO: Support for multiple executors. Cromwell has single executor per task.
var executorImage = tesTask.Executors.First().Image;

var dockerInDockerIsPublic = true;
var executorImageIsPublic = containerRegistryProvider.IsImagePublic(executorImage);
var cromwellDrsIsPublic = drsImageNeeded ? containerRegistryProvider.IsImagePublic(cromwellDrsLocalizerImageName) : true;

BatchModels.ContainerConfiguration result = default;
var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(executorImage, cancellationToken);

if (containerRegistryInfo is not null)
if (!executorImageIsPublic || !cromwellDrsIsPublic)
{
var neededImages = new List<string> { executorImage, dockerInDockerImageName };
if (drsImageNeeded)
{
neededImages.Add(cromwellDrsLocalizerImageName);
}

// Download private images at node startup, since those cannot be downloaded in the main task that runs multiple containers.
// Doing this also requires that the main task runs inside a container, hence downloading the "docker" image (contains docker client) as well.
result = new BatchModels.ContainerConfiguration
result = new BatchModels.ContainerConfiguration { ContainerImageNames = neededImages, ContainerRegistries = new List<BatchModels.ContainerRegistry>() };

if (!executorImageIsPublic)
{
ContainerImageNames = new List<string> { executorImage, dockerInDockerImageName },
ContainerRegistries = new List<BatchModels.ContainerRegistry>
var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(executorImage, cancellationToken);
if (containerRegistryInfo is not null)
{
new(
result.ContainerRegistries.Add(new(
userName: containerRegistryInfo.Username,
registryServer: containerRegistryInfo.RegistryServer,
password: containerRegistryInfo.Password)
password: containerRegistryInfo.Password));
}
};

ContainerRegistryInfo containerRegistryInfoForDockerInDocker = null;
}

if (!containerRegistryProvider.IsImagePublic(dockerInDockerImageName))
if (!cromwellDrsIsPublic)
{
containerRegistryInfoForDockerInDocker = await containerRegistryProvider.GetContainerRegistryInfoAsync(dockerInDockerImageName, cancellationToken);
var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(cromwellDrsLocalizerImageName, cancellationToken);
if (containerRegistryInfo is not null && !result.ContainerRegistries.Any(registry => registry.RegistryServer == containerRegistryInfo.RegistryServer))
{
result.ContainerRegistries.Add(new(
userName: containerRegistryInfo.Username,
registryServer: containerRegistryInfo.RegistryServer,
password: containerRegistryInfo.Password));
}
}

if (containerRegistryInfoForDockerInDocker is not null && containerRegistryInfoForDockerInDocker.RegistryServer != containerRegistryInfo.RegistryServer)
if (result.ContainerRegistries.Count != 0)
{
var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(dockerInDockerImageName, cancellationToken);
dockerInDockerIsPublic = containerRegistryInfo is null;
if (containerRegistryInfo is not null && !result.ContainerRegistries.Any(registry => registry.RegistryServer == containerRegistryInfo.RegistryServer))
{
result.ContainerRegistries.Add(new(
userName: containerRegistryInfoForDockerInDocker.Username,
registryServer: containerRegistryInfoForDockerInDocker.RegistryServer,
password: containerRegistryInfoForDockerInDocker.Password));
userName: containerRegistryInfo.Username,
registryServer: containerRegistryInfo.RegistryServer,
password: containerRegistryInfo.Password));
}
}
}

return result is null ? default : new()
return result is null || result.ContainerRegistries.Count == 0 ? (default, (true, true, true)) : (new()
{
ContainerImageNames = result.ContainerImageNames,
ContainerRegistries = result
Expand All @@ -1357,7 +1374,7 @@ private async ValueTask<ContainerConfiguration> GetContainerConfigurationIfNeede
registryServer: r.RegistryServer,
identityReference: r.IdentityReference is null ? null : new() { ResourceId = r.IdentityReference.ResourceId }))
.ToList()
};
}, (executorImageIsPublic, dockerInDockerIsPublic, cromwellDrsIsPublic));
}

/// <summary>
Expand Down

0 comments on commit f1871c9

Please sign in to comment.