From e2557b5724999c2b431902fa64bfef6fd08ba4d5 Mon Sep 17 00:00:00 2001 From: cifar10 <731376883@qq.com> Date: Fri, 13 Sep 2024 10:42:40 +0800 Subject: [PATCH] Revert "[cambricon] Support llava1.5_7b with flagscale (#742)" This reverts commit d63d8c0a56340a90d582f777460ef7d4e7309af0. --- .../docker_image/flagscale_2409/Dockerfile | 28 ------------------- .../flagscale_2409/flagscale_2409_install.sh | 13 --------- .../README.md | 1 - .../config/config_MLUx4x8.py | 28 ------------------- .../config/requirements.txt | 1 - utils/container_manager.py | 2 +- 6 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 training/cambricon/docker_image/flagscale_2409/Dockerfile delete mode 100644 training/cambricon/docker_image/flagscale_2409/flagscale_2409_install.sh delete mode 100644 training/cambricon/llava1.5_7b_continuetrain-flagscale/README.md delete mode 100644 training/cambricon/llava1.5_7b_continuetrain-flagscale/config/config_MLUx4x8.py delete mode 100644 training/cambricon/llava1.5_7b_continuetrain-flagscale/config/requirements.txt diff --git a/training/cambricon/docker_image/flagscale_2409/Dockerfile b/training/cambricon/docker_image/flagscale_2409/Dockerfile deleted file mode 100644 index a647df1aa..000000000 --- a/training/cambricon/docker_image/flagscale_2409/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM flagperf:cambricon-v24.08.02-torch2.1.0-catch1.22.1-ubuntu22.04-py310-megatron-patch -#shell -SHELL ["/bin/bash", "-c"] -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata && apt-get install -y openssh-server && mkdir -p /run/sshd -RUN apt update -y && apt install -y sudo dmidecode ipmitool sysstat net-tools sshpass -# modify ~/.bashrc file -RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc -RUN echo -e "\n# Add environment variables\n\ -export NEUWARE_HOME=/usr/local/neuware\n\ -export LD_LIBRARY_PATH=/usr/local/neuware/lib64:/usr/local/openmpi/lib:${LD_LIBRARY_PATH}\n\ -export PATH=/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}\n\ -export CNCL_MLULINK_OVER_ROCE_DISABLE=1\n\ -export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0\n\ -export CNCL_MLU_DIRECT_LEVEL=1" >> ~/.bashrc - -# 在容器内创建 .ssh 目录 -RUN mkdir -p /root/.ssh - -# 将公钥和私钥复制到容器内 -COPY id_rsa /root/.ssh/id_rsa -COPY id_rsa.pub /root/.ssh/id_rsa.pub - -# 设置正确的权限 -RUN chmod 600 /root/.ssh/id_rsa -RUN chmod 644 /root/.ssh/id_rsa.pub - -# 添加公钥到 authorized_keys 文件以实现免密登录 -RUN cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys \ No newline at end of file diff --git a/training/cambricon/docker_image/flagscale_2409/flagscale_2409_install.sh b/training/cambricon/docker_image/flagscale_2409/flagscale_2409_install.sh deleted file mode 100644 index 69902fa31..000000000 --- a/training/cambricon/docker_image/flagscale_2409/flagscale_2409_install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -#!/bin/bash -set -xe -pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple -pip3 install regex==2024.5.15 schedule==1.2.2 accelerate==0.31.0 transformers==4.40.1 -pip3 install pybind11 hydra-core s3fs braceexpand webdataset wandb loguru sentencepiece -pip3 install megatron-energon==2.2.0 -#配置免密 -sed -i '/StrictHostKeyChecking/c StrictHostKeyChecking no' /etc/ssh/ssh_config -sed -i 's/#Port 22/Port 9876/g' /etc/ssh/sshd_config -sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config -sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config -/etc/init.d/ssh restart diff --git a/training/cambricon/llava1.5_7b_continuetrain-flagscale/README.md b/training/cambricon/llava1.5_7b_continuetrain-flagscale/README.md deleted file mode 100644 index d44c78c53..000000000 --- a/training/cambricon/llava1.5_7b_continuetrain-flagscale/README.md +++ /dev/null @@ -1 +0,0 @@ -此测例为FlagScale相关项目测例 diff --git a/training/cambricon/llava1.5_7b_continuetrain-flagscale/config/config_MLUx4x8.py b/training/cambricon/llava1.5_7b_continuetrain-flagscale/config/config_MLUx4x8.py deleted file mode 100644 index c5c4b92c5..000000000 --- a/training/cambricon/llava1.5_7b_continuetrain-flagscale/config/config_MLUx4x8.py +++ /dev/null @@ -1,28 +0,0 @@ -# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage -scale_parent = "/share" -scale_home = f"{scale_parent}/FlagScale" - -# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py -scale_download_cmd = f"cd {scale_parent}" - -# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here -scale_install_cmd = "" - -# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py -energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs" - -scale_conf_dir = f"{scale_home}/examples/llava/conf" -configyaml = f"{scale_conf_dir}/config.yaml" -trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml" -datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml" -prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" - -#cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"} -cmds = {} -# flagscale's requirements -flagscale_chip_type = "MLU" -flagscale_ssh_port = 9876 -flops = 303.93E12 - -# for llava's algorithm -steps = 5000 diff --git a/training/cambricon/llava1.5_7b_continuetrain-flagscale/config/requirements.txt b/training/cambricon/llava1.5_7b_continuetrain-flagscale/config/requirements.txt deleted file mode 100644 index 4f0d1d961..000000000 --- a/training/cambricon/llava1.5_7b_continuetrain-flagscale/config/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -megatron-energon==2.2.0 diff --git a/utils/container_manager.py b/utils/container_manager.py index 7599ad067..91c68f6cc 100644 --- a/utils/container_manager.py +++ b/utils/container_manager.py @@ -28,7 +28,7 @@ def run_new(self, container_run_args, docker_image): run_new_cmd = "docker run " + container_run_args + \ " --name=" + self.name + " \"" + docker_image + "\" " + \ - "bash -c \"/etc/init.d/ssh restart && sleep infinity\"" + "sleep infinity" print(run_new_cmd) ret, outs = run_cmd.run_cmd_wait(run_new_cmd, 10) return ret, outs