From 6d57a4a263d9629aa2ad7bc5f1f569b292584fef Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Wed, 29 Jan 2025 14:31:29 -0800 Subject: [PATCH 1/4] Pin torch in nv-ds-chat workflow --- .github/workflows/nv-ds-chat.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index 7e209cbe4397..adca48bcd5fe 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -37,7 +37,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu121 + pip3 install -U --cache-dir $TORCH_CACHE torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From 2c8488b7fd2cf7a38e42746b0988a42fa3417e7b Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Wed, 29 Jan 2025 14:47:36 -0800 Subject: [PATCH 2/4] Turn off NCCL debug --- .github/workflows/nv-ds-chat.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index adca48bcd5fe..2e846e17cbd1 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -37,7 +37,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 + pip3 install -U torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -67,6 +67,7 @@ jobs: run: | cd DeepSpeedExamples/applications/DeepSpeed-Chat unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + unset NCCL_DEBUG cd tests pytest $PYTEST_OPTS ./ From 289871fdedc0271f1e7edf021b79b726130968f6 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Wed, 29 Jan 2025 19:07:23 -0800 Subject: [PATCH 3/4] Ensure we get torch+cu vs torch+cpu --- .github/workflows/nv-ds-chat.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index 2e846e17cbd1..3f8b98db2401 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -37,7 +37,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 + pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From 634b29ec1b326c4b192aac6de76db04746092666 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 30 Jan 2025 09:17:27 -0800 Subject: [PATCH 4/4] Add torchvision at same install time as torch --- .github/workflows/nv-ds-chat.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index 3f8b98db2401..5d47519fe204 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -37,7 +37,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu124 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())"