Skip to content

Commit

Permalink
Pstjohn/update base image 2410 (#551)
Browse files Browse the repository at this point in the history
Cherry picks some additional commits into #550 to see if these also fix
CI for 24.10

---------

Signed-off-by: John St. John <[email protected]>
Co-authored-by: Farhad Ramezanghorbani <[email protected]>
Co-authored-by: John St John <[email protected]>
Co-authored-by: John St. John <[email protected]>
  • Loading branch information
4 people authored Dec 23, 2024
1 parent 19809c0 commit c7281be
Show file tree
Hide file tree
Showing 2 changed files with 689 additions and 286 deletions.
21 changes: 11 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Base image with apex and transformer engine, but without NeMo or Megatron-LM.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
# Note that the core NeMo docker container is defined here:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/llm_train/Dockerfile.train
# with settings that get defined/injected from this config:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/.gitlab-ci.yml
# We should keep versions in our container up to date to ensure that we get the latest tested perf improvements and
# training loss curves from NeMo.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3

FROM rust:1.82.0 as rust-env

Expand All @@ -25,7 +31,7 @@ RUN git clone https://github.com/NVIDIA/apex.git && \
--config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm"

# Transformer Engine pre-1.7.0. 1.7 standardizes the meaning of bits in the attention mask to match
ARG TE_COMMIT=c27ee60ec746210bcea4ec33958dbbff06706506
ARG TE_COMMIT=2215fa5c7557b66034068816020f9f611019e457
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin ${TE_COMMIT} && \
Expand All @@ -49,11 +55,11 @@ RUN apt-get install -y gnupg
# Check the nemo dependency for causal conv1d and make sure this checkout
# tag matches. If not, update the tag in the following line.
RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/Dao-AILab/[email protected].0.post2
git+https://github.com/Dao-AILab/[email protected].2.post1

# Mamba dependancy installation
RUN pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/state-spaces/mamba.git@v2.0.3
git+https://github.com/state-spaces/mamba.git@v2.2.2

RUN pip install hatchling # needed to install nemo-run
ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2
Expand All @@ -72,12 +78,7 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*
RUN apt purge -y libslurm37 libpmi2-0 && \
apt autoremove -y
RUN source /usr/local/nvm/nvm.sh && \
NODE_VER=$(nvm current) && \
nvm deactivate && \
nvm uninstall $NODE_VER && \
sed -i "/NVM/d" /root/.bashrc && \
sed -i "/nvm.sh/d" /etc/bash.bashrc


# Use UV to install python packages from the workspace. This just installs packages into the system's python
# environment, and does not use the current uv.lock file.
Expand Down
Loading

0 comments on commit c7281be

Please sign in to comment.