From 98716ebb4818498fe118ddb07c62bfa94a48da2a Mon Sep 17 00:00:00 2001 From: Yang Zhang <7129+yang@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:27:38 -0800 Subject: [PATCH] Fix install, Dockerfile, CI (#1104) * Add missing jinja2 dep Missing transitive dep of lm_eval * Fix Dockerfile Only devel has nvcc, needed to build packages And don't rebuild fused kernels if no relevant change * Ensure Dockerfile builds in CI Also ensures that install actually works --------- Co-authored-by: Yang Zhang --- .github/workflows/pull_request.yml | 7 +++++++ Dockerfile | 18 +++++++++--------- requirements/requirements.txt | 1 + 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index d05df91c3..8ee2f2a62 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -15,6 +15,13 @@ jobs: # Need the right version of clang-format - run: pip install -r requirements/requirements-dev.txt - uses: pre-commit/action@v2.0.3 + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - + name: Docker build + id: docker_build + uses: docker/build-push-action@v2 update-documentation: runs-on: ubuntu-20.04 diff --git a/Dockerfile b/Dockerfile index 3d7295b43..8de2011f5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM nvidia/cuda:11.7.1-runtime-ubuntu20.04 +FROM nvidia/cuda:11.7.1-devel-ubuntu20.04 ENV DEBIAN_FRONTEND=noninteractive @@ -21,7 +21,7 @@ LABEL org.opencontainers.image.version = "2.0" LABEL org.opencontainers.image.authors = "contact@eleuther.ai" LABEL org.opencontainers.image.source = "https://www.github.com/eleutherai/gpt-neox" LABEL org.opencontainers.image.licenses = " Apache-2.0" -LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.7.1-runtime-ubuntu20.04" +LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.7.1-devel-ubuntu20.04" #### System package (uses default Python 3 version in Ubuntu 20.04) RUN apt-get update -y && \ @@ -94,17 +94,17 @@ COPY requirements/requirements-wandb.txt . COPY requirements/requirements-onebitadam.txt . COPY requirements/requirements-sparseattention.txt . COPY requirements/requirements-flashattention.txt . -RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \ - pip install -r requirements-sparseattention.txt && \ - pip install -r requirements-flashattention.txt && \ - pip install -r requirements-wandb.txt && \ - pip install protobuf==3.20.* && \ - pip cache purge +RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt +RUN pip install -r requirements-sparseattention.txt +RUN pip install -r requirements-flashattention.txt +RUN pip install -r requirements-wandb.txt +RUN pip install protobuf==3.20.* +RUN pip cache purge ## Install APEX RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597 -COPY megatron/ megatron +COPY megatron/fused_kernels/ megatron/fused_kernels RUN python megatron/fused_kernels/setup.py install # Clear staging diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 807a55974..f1beae87a 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -3,6 +3,7 @@ git+https://github.com/EleutherAI/DeeperSpeed.git@b9260436e7da3e297fc6bedfd27d9e ftfy>=6.0.1 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 +jinja2==3.1.2 git+https://github.com/EleutherAI/lm-evaluation-harness.git@main#egg=lm_eval mpi4py>=3.0.3 numpy>=1.22.0