Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mitigate batch docker misconfiguration #309

Merged
merged 5 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/TesApi.Tests/BatchSchedulerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1556,7 +1556,17 @@ private static Action<Mock<IAzureProxy>> GetMockAzureProxy(AzureProxyReturnValue
{
var config = Enumerable.Empty<(string Key, string Value)>()
.Append(("Storage:DefaultAccountName", "defaultstorageaccount"))
.Append(("BatchScheduling:Prefix", "hostname"));
.Append(("BatchScheduling:Prefix", "hostname"))
//.Append(("BatchImageGen1:Offer", "ubuntu-server-container"))
//.Append(("BatchImageGen1:Publisher", "microsoft-azure-batch"))
//.Append(("BatchImageGen1:Sku", "20-04-lts"))
//.Append(("BatchImageGen1:Version", "latest"))
.Append(("BatchImageGen1:NodeAgentSkuId", "batch.node.ubuntu 20.04"))
//.Append(("BatchImageGen2:Offer", "ubuntu-hpc"))
//.Append(("BatchImageGen2:Publisher", "microsoft-dsvm"))
//.Append(("BatchImageGen2:Sku", "2004"))
//.Append(("BatchImageGen2:Version", "latest"))
.Append(("BatchImageGen2:NodeAgentSkuId", "batch.node.ubuntu 20.04"));
if (autopool)
{
config = config.Append(("BatchScheduling:UseLegacyAutopools", "true"));
Expand Down
2 changes: 1 addition & 1 deletion src/TesApi.Tests/TerraStorageAccessProviderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ public async Task MapLocalPathToSasUrlAsync_GetContainerSasIsTrue(string input,
{
SetUpTerraApiClient();

var result = await terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input + blobPath, CancellationToken.None, true);
var result = await terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input + blobPath, CancellationToken.None, getContainerSas: true);

Assert.IsNotNull(result);
Assert.AreEqual($"{expected}?sv={TerraApiStubData.SasToken}", result);
Expand Down
46 changes: 29 additions & 17 deletions src/TesApi.Web/BatchScheduler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ public partial class BatchScheduler : IBatchScheduler
private readonly string batchNodesSubnetId;
private readonly bool disableBatchNodesPublicIpAddress;
private readonly bool enableBatchAutopool;
private readonly TimeSpan poolLifetime;
private readonly BatchNodeInfo gen2BatchNodeInfo;
private readonly BatchNodeInfo gen1BatchNodeInfo;
private readonly string marthaUrl;
Expand Down Expand Up @@ -148,6 +149,7 @@ public BatchScheduler(
if (string.IsNullOrWhiteSpace(this.cromwellDrsLocalizerImageName)) { this.cromwellDrsLocalizerImageName = Options.MarthaOptions.DefaultCromwellDrsLocalizer; }
this.disableBatchNodesPublicIpAddress = batchNodesOptions.Value.DisablePublicIpAddress;
this.enableBatchAutopool = batchSchedulingOptions.Value.UseLegacyAutopools;
this.poolLifetime = this.enableBatchAutopool ? TimeSpan.Zero : TimeSpan.FromDays(batchSchedulingOptions.Value.PoolRotationForcedDays == 0 ? Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays : batchSchedulingOptions.Value.PoolRotationForcedDays);
this.defaultStorageAccountName = storageOptions.Value.DefaultAccountName;
this.marthaUrl = marthaOptions.Value.Url;
this.marthaKeyVaultName = marthaOptions.Value.KeyVaultName;
Expand Down Expand Up @@ -1258,32 +1260,42 @@ private async Task<TesInput> GetTesInputFileUrlAsync(TesInput inputFile, TesTask
/// <summary>
/// Constructs a universal Azure Start Task instance if needed
/// </summary>
/// <param name="machineConfiguration">A <see cref="VirtualMachineConfiguration"/> describing the OS of the pool's nodes.</param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> for controlling the lifetime of the asynchronous operation.</param>
/// <returns></returns>
private async Task<StartTask> StartTaskIfNeeded(CancellationToken cancellationToken)
/// <remarks>This method also mitigates errors associated with docker daemons that are not configured to place their filestsystem assets on the data drive. Errors attempting to do so are ignored.</remarks>
private async Task<StartTask> StartTaskIfNeeded(VirtualMachineConfiguration machineConfiguration, CancellationToken cancellationToken)
{
string startTaskSasUrl = default;
var installJq = machineConfiguration.NodeAgentSkuId switch
{
var s when s.StartsWith("batch.node.ubuntu ") => "sudo apt-get install -y jq",
var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release -y && sudo yum update -y && sudo yum install jq -y",
_ => throw new InvalidOperationException($"Unrecognized OS. Please send open an issue @ 'https://github.com/microsoft/ga4gh-tes/issues' with this message. ({machineConfiguration.NodeAgentSkuId})")
};

if (!string.IsNullOrWhiteSpace(globalStartTaskPath))
var commandPart1 = @"/usr/bin/bash -c 'trap ""echo Error trapped; exit 0"" ERR; sudo touch tmp2.json && sudo cp /etc/docker/daemon.json tmp1.json && sudo chmod a+w tmp?.json && if fgrep -q ""$(dirname ""$AZ_BATCH_NODE_ROOT_DIR"")/docker"" tmp1.json; then ";
var commandPart2 = @" && jq \.\[\""data-root\""\]=\""""$(dirname ""$AZ_BATCH_NODE_ROOT_DIR"")/docker""\"" tmp1.json >> tmp2.json && sudo mv tmp2.json /etc/docker/daemon.json && sudo systemctl restart docker; fi'";

var startTask = new StartTask
{
startTaskSasUrl = await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, cancellationToken);
if (!await azureProxy.BlobExistsAsync(new Uri(startTaskSasUrl), cancellationToken))
{
startTaskSasUrl = null;
}
}
CommandLine = commandPart1 + installJq + commandPart2,
UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool)),
};

if (!string.IsNullOrWhiteSpace(startTaskSasUrl) && !string.IsNullOrWhiteSpace(StartTaskScriptFilename))
if (!string.IsNullOrWhiteSpace(globalStartTaskPath))
{
return new StartTask
var startTaskSasUrl = enableBatchAutopool
? await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, cancellationToken)
: await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, cancellationToken, sasTokenDuration: BatchPoolService.RunInterval.Multiply(2).Add(poolLifetime).Add(TimeSpan.FromMinutes(15)));

if (await azureProxy.BlobExistsAsync(new Uri(startTaskSasUrl), cancellationToken))
{
CommandLine = $"./{StartTaskScriptFilename}",
UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool)),
ResourceFiles = new List<ResourceFile> { ResourceFile.FromUrl(startTaskSasUrl, StartTaskScriptFilename) }
};
startTask.ResourceFiles = new List<ResourceFile> { ResourceFile.FromUrl(startTaskSasUrl, StartTaskScriptFilename) };
startTask.CommandLine = $"({startTask.CommandLine}) && ./{StartTaskScriptFilename}";
}
}

return default;
return startTask;
}

/// <summary>
Expand Down Expand Up @@ -1440,7 +1452,7 @@ private async ValueTask<PoolSpecification> GetPoolSpecification(string vmSize, b
VirtualMachineConfiguration = vmConfig,
VirtualMachineSize = vmSize,
ResizeTimeout = TimeSpan.FromMinutes(30),
StartTask = await StartTaskIfNeeded(cancellationToken),
StartTask = await StartTaskIfNeeded(vmConfig, cancellationToken),
TargetNodeCommunicationMode = NodeCommunicationMode.Simplified,
};

Expand Down
10 changes: 5 additions & 5 deletions src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ public override async Task<bool> IsPublicHttpUrlAsync(string uriString, Cancella
}

/// <inheritdoc />
public override async Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, bool getContainerSas = false)
public override async Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false)
{
// TODO: Optional: If path is /container/... where container matches the name of the container in the default storage account, prepend the account name to the path.
// This would allow the user to omit the account name for files stored in the default storage account
Expand Down Expand Up @@ -130,15 +130,15 @@ public override async Task<string> MapLocalPathToSasUrlAsync(string path, Cancel
var policy = new SharedAccessBlobPolicy()
{
Permissions = SharedAccessBlobPermissions.Add | SharedAccessBlobPermissions.Create | SharedAccessBlobPermissions.List | SharedAccessBlobPermissions.Read | SharedAccessBlobPermissions.Write,
SharedAccessExpiryTime = DateTime.Now.Add(SasTokenDuration)
SharedAccessExpiryTime = DateTime.Now.Add((sasTokenDuration ?? TimeSpan.Zero) + SasTokenDuration)
};

var containerUri = new StorageAccountUrlSegments(storageAccountInfo.BlobEndpoint, pathSegments.ContainerName).ToUri();
resultPathSegments.SasToken = new CloudBlobContainer(containerUri, new StorageCredentials(storageAccountInfo.Name, accountKey)).GetSharedAccessSignature(policy, null, SharedAccessProtocol.HttpsOnly, null);
}
else
{
var policy = new SharedAccessBlobPolicy() { Permissions = SharedAccessBlobPermissions.Read, SharedAccessExpiryTime = DateTime.Now.Add(SasTokenDuration) };
var policy = new SharedAccessBlobPolicy() { Permissions = SharedAccessBlobPermissions.Read, SharedAccessExpiryTime = DateTime.Now.Add((sasTokenDuration ?? TimeSpan.Zero) + SasTokenDuration) };
resultPathSegments.SasToken = new CloudBlob(resultPathSegments.ToUri(), new StorageCredentials(storageAccountInfo.Name, accountKey)).GetSharedAccessSignature(policy, null, null, SharedAccessProtocol.HttpsOnly, null);
}

Expand All @@ -157,7 +157,7 @@ public override async Task<string> GetInternalTesBlobUrlAsync(string blobPath, C
{
var normalizedBlobPath = NormalizedBlobPath(blobPath);

return await MapLocalPathToSasUrlAsync($"/{defaultStorageAccountName}{TesExecutionsPathPrefix}{normalizedBlobPath}", cancellationToken, true);
return await MapLocalPathToSasUrlAsync($"/{defaultStorageAccountName}{TesExecutionsPathPrefix}{normalizedBlobPath}", cancellationToken, getContainerSas: true);
}


Expand All @@ -176,7 +176,7 @@ public override async Task<string> GetInternalTesTaskBlobUrlAsync(TesTask task,
{
var blobPathWithPathPrefix =
$"/{defaultStorageAccountName}/{task.Resources.GetBackendParameterValue(TesResources.SupportedBackendParameters.internal_path_prefix).Trim('/')}{normalizedBlobPath}";
return await MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, cancellationToken, true);
return await MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, cancellationToken, getContainerSas: true);
}

return await GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", cancellationToken);
Expand Down
3 changes: 2 additions & 1 deletion src/TesApi.Web/Storage/IStorageAccessProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,10 @@ public interface IStorageAccessProvider
/// </summary>
/// <param name="path">The file path to convert. Two-part path is treated as container path. Paths with three or more parts are treated as blobs.</param>
/// <param name="cancellationToken">A <see cref="CancellationToken"/> for controlling the lifetime of the asynchronous operation.</param>
/// <param name="sasTokenDuration">Duration SAS should be valid.</param>
/// <param name="getContainerSas">Get the container SAS even if path is longer than two parts</param>
/// <returns>An Azure Block Blob or Container URL with SAS token</returns>
public Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, bool getContainerSas = false);
public Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false);

/// <summary>
/// Returns an Azure Storage Blob URL with a SAS token for the specified blob path in the TES internal storage location
Expand Down
6 changes: 3 additions & 3 deletions src/TesApi.Web/Storage/StorageAccessProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ public async Task<string> DownloadBlobAsync(Uri blobAbsoluteUrl, CancellationTok

/// <inheritdoc />
public async Task UploadBlobAsync(string blobRelativePath, string content, CancellationToken cancellationToken)
=> await this.AzureProxy.UploadBlobAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, true)), content, cancellationToken);
=> await this.AzureProxy.UploadBlobAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), content, cancellationToken);

/// <inheritdoc />
public async Task UploadBlobAsync(Uri blobAbsoluteUrl, string content,
Expand All @@ -93,13 +93,13 @@ public async Task UploadBlobAsync(Uri blobAbsoluteUrl, string content,

/// <inheritdoc />
public async Task UploadBlobFromFileAsync(string blobRelativePath, string sourceLocalFilePath, CancellationToken cancellationToken)
=> await this.AzureProxy.UploadBlobFromFileAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, true)), sourceLocalFilePath, cancellationToken);
=> await this.AzureProxy.UploadBlobFromFileAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), sourceLocalFilePath, cancellationToken);

/// <inheritdoc />
public abstract Task<bool> IsPublicHttpUrlAsync(string uriString, CancellationToken cancellationToken);

/// <inheritdoc />
public abstract Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, bool getContainerSas = false);
public abstract Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false);

/// <inheritdoc />
public abstract Task<string> GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken);
Expand Down
7 changes: 5 additions & 2 deletions src/TesApi.Web/Storage/TerraStorageAccessProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,13 @@ public override Task<bool> IsPublicHttpUrlAsync(string uriString, CancellationTo
}

/// <inheritdoc />
public override async Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, bool getContainerSas = false)
public override async Task<string> MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false)
{
ArgumentException.ThrowIfNullOrEmpty(path);

if (sasTokenDuration is not null)
{
throw new ArgumentException("Terra does not support extended length SAS tokens.");
}

if (!TryParseHttpUrlFromInput(path, out _))
{
Expand Down