From 50bfe827b5fffbfd3a652ec7cbfb9ec5e24ef622 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Fri, 14 Feb 2025 14:44:09 -0800 Subject: [PATCH] disable ap --- tensorflow/training/buildspec-2-18-sm.yml | 2 +- tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu | 1 + .../training/docker/build_artifacts/dockerd-entrypoint.py | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/training/buildspec-2-18-sm.yml b/tensorflow/training/buildspec-2-18-sm.yml index 9fda953bd042..08f4cfba681e 100644 --- a/tensorflow/training/buildspec-2-18-sm.yml +++ b/tensorflow/training/buildspec-2-18-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu index 0da2f5de3ce6..bdc0fbe7a10e 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu @@ -396,6 +396,7 @@ RUN rm -rf /tmp/* # Copy workaround script for incorrect hostname COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +RUN chmod +x /usr/local/bin/start_cuda_compat.sh RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh RUN HOME_DIR=/root \ diff --git a/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py b/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py index 58205157aea4..b9448fcfffc6 100644 --- a/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py +++ b/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py @@ -19,7 +19,10 @@ import subprocess # run compat mounting by default -subprocess.run(["bash", "-m", "/usr/local/bin/start_cuda_compat.sh"]) +try: + subprocess.run(["bash", "-m", "/usr/local/bin/start_cuda_compat.sh"]) +except Exception as e: + print(f"Error running script: {e}") if not os.path.exists("/opt/ml/input/config"): subprocess.call(["python", "/usr/local/bin/deep_learning_container.py", "&>/dev/null", "&"])