Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[6.2.0] Automatically retry the build if a remote cache eviction error is encountered #17953

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -243,12 +243,33 @@ public int getId() {
public ListenableFuture<Void> prefetchInputs()
throws IOException, ForbiddenActionInputException {
if (Spawns.shouldPrefetchInputsForLocalExecution(spawn)) {
return actionExecutionContext
.getActionInputPrefetcher()
.prefetchFiles(
getInputMapping(PathFragment.EMPTY_FRAGMENT, /* willAccessRepeatedly= */ true)
.values(),
getMetadataProvider());
return Futures.catchingAsync(
actionExecutionContext
.getActionInputPrefetcher()
.prefetchFiles(
getInputMapping(PathFragment.EMPTY_FRAGMENT, /* willAccessRepeatedly= */ true)
.values(),
getMetadataProvider(),
Priority.MEDIUM),
BulkTransferException.class,
(BulkTransferException e) -> {
if (BulkTransferException.allCausedByCacheNotFoundException(e)) {
var code =
(executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0)
? Code.REMOTE_CACHE_EVICTED
: Code.REMOTE_CACHE_FAILED;
throw new EnvironmentalExecException(
e,
FailureDetail.newBuilder()
.setMessage("Failed to fetch blobs because they do not exist remotely.")
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
.build());
} else {
throw e;
}
},
directExecutor());
}

return immediateVoidFuture();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@
import com.google.devtools.common.options.Option;
import com.google.devtools.common.options.OptionDocumentationCategory;
import com.google.devtools.common.options.OptionEffectTag;
import com.google.devtools.common.options.OptionMetadataTag;
import com.google.devtools.common.options.Options;
import com.google.devtools.common.options.OptionsBase;
import com.google.devtools.common.options.OptionsParsingException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;

/**
* Options affecting the execution phase of a build.
Expand Down Expand Up @@ -493,6 +495,28 @@ public boolean usingLocalTestJobs() {
+ "Bazel uses a separate action to generate a dummy test.xml file containing the "
+ "test log. Otherwise, Bazel generates a test.xml as part of the test action.")
public boolean splitXmlGeneration;

/**
 * Backs {@code --incompatible_remote_use_new_exit_code_for_lost_inputs} (default {@code true}).
 *
 * <p>Per the help text, when set Bazel reports exit code 39 instead of 34 if the remote cache
 * evicts blobs during the build.
 */
@Option(
name = "incompatible_remote_use_new_exit_code_for_lost_inputs",
defaultValue = "true",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.UNKNOWN},
metadataTags = {OptionMetadataTag.INCOMPATIBLE_CHANGE},
help =
"If set to true, Bazel will use new exit code 39 instead of 34 if remote cache evicts"
+ " blobs during the build.")
public boolean useNewExitCodeForLostInputs;

/**
 * Backs {@code --experimental_remote_cache_eviction_retries} (default {@code 0}).
 *
 * <p>Per the help text, this is the maximum number of retry attempts after a remote cache
 * eviction error, and a non-zero value implicitly turns on
 * {@code --incompatible_remote_use_new_exit_code_for_lost_inputs}.
 *
 * <p>NOTE(review): the help text says "non-zero", but the usages visible alongside this option
 * test {@code remoteRetryOnCacheEviction > 0}, so a negative value would not enable the new exit
 * code -- confirm whether negative values should be rejected.
 */
@Option(
name = "experimental_remote_cache_eviction_retries",
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.EXECUTION},
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction"
+ " error. A non-zero value will implicitly set"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.")
public int remoteRetryOnCacheEviction;

/** An enum for specifying different formats of test output. */
public enum TestOutputFormat {
Expand Down Expand Up @@ -532,16 +556,13 @@ public static class TestAttemptsConverter extends PerLabelOptions.PerLabelOption
private static final int MAX_VALUE = 10;

private void validateInput(String input) throws OptionsParsingException {
if ("default".equals(input)) {
return;
} else {
Integer value = Integer.parseInt(input);
if (!Objects.equals(input, "default")) {
int value = Integer.parseInt(input);
if (value < MIN_VALUE) {
throw new OptionsParsingException("'" + input + "' should be >= " + MIN_VALUE);
} else if (value < MIN_VALUE || value > MAX_VALUE) {
} else if (value > MAX_VALUE) {
throw new OptionsParsingException("'" + input + "' should be <= " + MAX_VALUE);
}
return;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Throwables;
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.devtools.build.lib.actions.ActionInput;
import com.google.devtools.build.lib.actions.CommandLines.ParamFileActionInput;
Expand Down Expand Up @@ -63,7 +62,6 @@
import com.google.devtools.build.lib.remote.options.RemoteOptions;
import com.google.devtools.build.lib.remote.util.Utils;
import com.google.devtools.build.lib.remote.util.Utils.InMemoryOutput;
import com.google.devtools.build.lib.sandbox.SandboxHelpers;
import com.google.devtools.build.lib.server.FailureDetails;
import com.google.devtools.build.lib.server.FailureDetails.FailureDetail;
import com.google.devtools.build.lib.util.ExitCode;
Expand Down Expand Up @@ -254,10 +252,12 @@ public SpawnResult exec(Spawn spawn, SpawnExecutionContext context)
// subtract network time consumed here to ensure wall clock during upload is not
// double
// counted, and metrics time computation does not exceed total time
spawnMetrics.setUploadTime(
uploadTime
.elapsed()
.minus(action.getNetworkTime().getDuration().minus(networkTimeStart)));
spawnMetrics.setUploadTimeInMs(
(int)
uploadTime
.elapsed()
.minus(action.getNetworkTime().getDuration().minus(networkTimeStart))
.toMillis());
}

context.report(SPAWN_SCHEDULING_EVENT);
Expand Down Expand Up @@ -462,8 +462,7 @@ private void maybeWriteParamFilesLocally(Spawn spawn) throws IOException {
for (ActionInput actionInput : spawn.getInputFiles().toList()) {
if (actionInput instanceof ParamFileActionInput) {
ParamFileActionInput paramFileActionInput = (ParamFileActionInput) actionInput;
Path outputPath = execRoot.getRelative(paramFileActionInput.getExecPath());
SandboxHelpers.atomicallyWriteVirtualInput(paramFileActionInput, outputPath, ".remote");
paramFileActionInput.atomicallyWriteRelativeTo(execRoot, ".remote");
}
}
}
Expand Down Expand Up @@ -557,7 +556,8 @@ private SpawnResult handleError(
catastrophe = true;
} else if (remoteCacheFailed) {
status = Status.REMOTE_CACHE_FAILED;
if (remoteOptions.useNewExitCodeForLostInputs) {
if (executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0) {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED;
} else {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED;
Expand All @@ -569,12 +569,8 @@ private SpawnResult handleError(
catastrophe = false;
}

String errorMessage = Utils.grpcAwareErrorMessage(exception);
if (verboseFailures) {
// On --verbose_failures print the whole stack trace
errorMessage += "\n" + Throwables.getStackTraceAsString(exception);
}

String errorMessage = Utils.grpcAwareErrorMessage(exception, verboseFailures);

if (exception.getCause() instanceof ExecutionStatusException) {
ExecutionStatusException e = (ExecutionStatusException) exception.getCause();
if (e.getResponse() != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import com.google.devtools.build.lib.events.PrintingEventHandler;
import com.google.devtools.build.lib.events.Reporter;
import com.google.devtools.build.lib.events.StoredEventHandler;
import com.google.devtools.build.lib.exec.ExecutionOptions;
import com.google.devtools.build.lib.profiler.MemoryProfiler;
import com.google.devtools.build.lib.profiler.Profiler;
import com.google.devtools.build.lib.profiler.SilentCloseable;
Expand All @@ -54,6 +55,7 @@
import com.google.devtools.build.lib.util.AnsiStrippingOutputStream;
import com.google.devtools.build.lib.util.DebugLoggerConfigurator;
import com.google.devtools.build.lib.util.DetailedExitCode;
import com.google.devtools.build.lib.util.ExitCode;
import com.google.devtools.build.lib.util.InterruptedFailureDetails;
import com.google.devtools.build.lib.util.LoggingUtil;
import com.google.devtools.build.lib.util.Pair;
Expand Down Expand Up @@ -148,6 +150,55 @@ public BlazeCommandResult exec(
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
List<Any> commandExtensions)
throws InterruptedException {
var remoteCacheEvictionRetries = getRemoteCacheEvictionRetries(args, outErr);
while (true) {
var result =
execOnce(
invocationPolicy,
args,
outErr,
lockingMode,
clientDescription,
firstContactTimeMillis,
startupOptionsTaggedWithBazelRc,
commandExtensions);
if (result.getExitCode() == ExitCode.REMOTE_CACHE_EVICTED && remoteCacheEvictionRetries > 0) {
--remoteCacheEvictionRetries;
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
continue;
}
return result;
}
}

/**
 * Extracts the value of {@code --experimental_remote_cache_eviction_retries} directly from the
 * raw command-line arguments.
 *
 * <p>Flags are not parsed yet at the point this is called, so the value is read manually. Only
 * the {@code --flag=value} spelling is recognized. To stay consistent with regular option
 * parsing, the last occurrence wins and scanning stops at the {@code --} end-of-options marker,
 * so that arguments meant for the target or run binary are never mistaken for this flag.
 *
 * @param args the raw arguments of the command invocation
 * @param outErr where a diagnostic is printed if the flag value is not a valid integer
 * @return the requested number of retries, or 0 if the flag is absent or its value is malformed
 */
private int getRemoteCacheEvictionRetries(List<String> args, OutErr outErr) {
  // Since flags are not parsed yet at this point, we manually extract value of the retry flag.
  var retryFlagPrefix = "--experimental_remote_cache_eviction_retries=";
  int retries = 0;
  for (var arg : args) {
    if (arg.equals("--")) {
      // Everything after "--" is a positional argument, not a Bazel option.
      break;
    }
    if (!arg.startsWith(retryFlagPrefix)) {
      continue;
    }
    try {
      // Keep scanning after a match: a later occurrence overrides an earlier one.
      retries = Integer.parseInt(arg.substring(retryFlagPrefix.length()));
    } catch (NumberFormatException e) {
      outErr.printErrLn(
          String.format(
              "Failed to parse retry times: %s, remote cache eviction retry is disabled", e));
      return 0;
    }
  }
  return retries;
}

public BlazeCommandResult execOnce(
InvocationPolicy invocationPolicy,
List<String> args,
OutErr outErr,
LockingMode lockingMode,
String clientDescription,
long firstContactTimeMillis,
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
List<Any> commandExtensions)
throws InterruptedException {
OriginalUnstructuredCommandLineEvent originalCommandLine =
new OriginalUnstructuredCommandLineEvent(args);
Preconditions.checkNotNull(clientDescription);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -466,9 +466,7 @@ public void remoteCacheEvictBlobs_whenPrefetchingInput_exitWithCode39() throws E
// Assert: Exit code is 39
assertThat(error)
.hasMessageThat()
.contains(
"Build without the Bytes does not work if your remote cache evicts blobs"
+ " during builds");
.contains("Failed to fetch blobs because they do not exist remotely");
assertThat(error).hasMessageThat().contains(String.format("%s/%s", hashCode, bytes.length));
assertThat(error.getDetailedExitCode().getExitCode().getNumericExitCode()).isEqualTo(39);
}
Expand Down
59 changes: 59 additions & 0 deletions src/test/shell/bazel/remote/build_without_the_bytes_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1627,4 +1627,63 @@ end_of_record"
expect_log "$expected_result"
}

# Checks that, with --experimental_remote_cache_eviction_retries set, a build
# that hits a remote cache eviction error is retried automatically and the
# retry message is logged.
function test_remote_cache_eviction_retries() {
  mkdir -p a

  cat > a/BUILD <<'EOF'
genrule(
name = 'foo',
srcs = ['foo.in'],
outs = ['foo.out'],
cmd = 'cat $(SRCS) > $@',
)

genrule(
name = 'bar',
srcs = ['foo.out', 'bar.in'],
outs = ['bar.out'],
cmd = 'cat $(SRCS) > $@',
tags = ['no-remote-exec'],
)
EOF

  echo foo > a/foo.in
  echo bar > a/bar.in

  # Populate remote cache
  bazel build \
    --remote_executor=grpc://localhost:${worker_port} \
    --remote_download_minimal \
    //a:bar >& "$TEST_log" || fail "Failed to build"

  # The "not downloaded" check below is only meaningful if the clean actually
  # succeeded, so fail fast and keep its output in the test log.
  bazel clean >& "$TEST_log" || fail "Failed to clean"

  # Clean build, foo.out isn't downloaded
  bazel build \
    --remote_executor=grpc://localhost:${worker_port} \
    --remote_download_minimal \
    //a:bar >& "$TEST_log" || fail "Failed to build"

  if [[ -f bazel-bin/a/foo.out ]]; then
    fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded"
  fi

  # Evict blobs from remote cache
  stop_worker
  start_worker

  echo "updated bar" > a/bar.in

  # Incremental build triggers remote cache eviction error but Bazel
  # automatically retries the build and reruns the generating actions for
  # missing blobs
  bazel build \
    --remote_executor=grpc://localhost:${worker_port} \
    --remote_download_minimal \
    --experimental_remote_cache_eviction_retries=5 \
    //a:bar >& "$TEST_log" || fail "Failed to build"

  expect_log "Found remote cache eviction error, retrying the build..."
}

run_suite "Build without the Bytes tests"