Fix install, Dockerfile, CI (#1104)

* Add missing jinja2 dep

  Missing transitive dep of lm_eval

* Fix Dockerfile

  Only devel has nvcc, needed to build packages

  And don't rebuild fused kernels if no relevant change

* Ensure Dockerfile builds in CI

  Also ensures that install actually works

---------

Co-authored-by: Yang Zhang <[email protected]>
EleutherAI · Jan 4, 2024 · 98716eb · 98716eb
1 parent eca6b1a
commit 98716eb
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 9 deletions.
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
@@ -15,6 +15,13 @@ jobs:
       # Need the right version of clang-format
       - run: pip install -r requirements/requirements-dev.txt
       - uses: pre-commit/[email protected]
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Docker build
+        id: docker_build
+        uses: docker/build-push-action@v2
 
   update-documentation:
     runs-on: ubuntu-20.04

diff --git a/Dockerfile b/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvidia/cuda:11.7.1-runtime-ubuntu20.04
+FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -21,7 +21,7 @@ LABEL org.opencontainers.image.version = "2.0"
 LABEL org.opencontainers.image.authors = "[email protected]"
 LABEL org.opencontainers.image.source = "https://www.github.com/eleutherai/gpt-neox"
 LABEL org.opencontainers.image.licenses = " Apache-2.0"
-LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.7.1-runtime-ubuntu20.04"
+LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.7.1-devel-ubuntu20.04"
 
 #### System package (uses default Python 3 version in Ubuntu 20.04)
 RUN apt-get update -y && \
@@ -94,17 +94,17 @@ COPY requirements/requirements-wandb.txt .
 COPY requirements/requirements-onebitadam.txt .
 COPY requirements/requirements-sparseattention.txt .
 COPY requirements/requirements-flashattention.txt .
-RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \
-    pip install -r requirements-sparseattention.txt && \
-    pip install -r requirements-flashattention.txt && \
-    pip install -r requirements-wandb.txt && \
-    pip install protobuf==3.20.* && \
-    pip cache purge
+RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt
+RUN pip install -r requirements-sparseattention.txt
+RUN pip install -r requirements-flashattention.txt
+RUN pip install -r requirements-wandb.txt
+RUN pip install protobuf==3.20.*
+RUN pip cache purge
 
 ## Install APEX
 RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597
 
-COPY megatron/ megatron
+COPY megatron/fused_kernels/ megatron/fused_kernels
 RUN python megatron/fused_kernels/setup.py install
 
 # Clear staging

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -3,6 +3,7 @@ git+https://github.com/EleutherAI/DeeperSpeed.git@b9260436e7da3e297fc6bedfd27d9e
 ftfy>=6.0.1
 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
 huggingface_hub>=0.11.0
+jinja2==3.1.2
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@main#egg=lm_eval
 mpi4py>=3.0.3
 numpy>=1.22.0