Skip to content

Commit

Permalink
Dockerfile update for Release 2.21 (#31)
Browse files Browse the repository at this point in the history
*Description of changes:*
* Addition JAX NeuronX Dockerfile
* Removal of PyTorch 1.13 and 2.1 Dockerfiles

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.
  • Loading branch information
ahsan-z-khan authored Dec 27, 2024
1 parent 947ce54 commit 315b77b
Show file tree
Hide file tree
Showing 12 changed files with 358 additions and 982 deletions.
5 changes: 0 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,6 @@ jobs:
HADOLINT_RECURSIVE: "true"
steps:
- uses: actions/checkout@v4
- uses: hadolint/[email protected]
with:
dockerfile: Dockerfile.neuron
recursive: true
failure-threshold: error # TODO: enable more linter rules other than error.
- uses: hadolint/[email protected]
with:
dockerfile: Dockerfile.neuronx
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
FROM public.ecr.aws/docker/library/ubuntu:20.04
FROM public.ecr.aws/docker/library/ubuntu:22.04

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

Check failure on line 3 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3048 style: Invalid label key.
LABEL maintainer="Amazon AI"

# Neuron SDK components version numbers
ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.16.0
ARG NEURONX_DISTRIBUTED_VERSION=0.9.0
ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.0.1
ARG NEURONX_CC_VERSION=2.15.143.0
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.22.33.0-d2128d1aa
ARG NEURONX_RUNTIME_LIB_VERSION=2.22.19.0-5856c0b42
ARG NEURONX_TOOLS_VERSION=2.19.0.0
ARG NEURONX_RUNTIME_LIB_VERSION=2.23.110.0-9b5179492
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.133.0-3e70920f2
ARG NEURONX_TOOLS_VERSION=2.20.204.0
ARG NEURONX_CC_VERSION=2.16.345.0
ARG NEURONX_JAX_TRAINING_VERSION=0.1.2

ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3
ARG OMPI_VERSION=4.1.5

# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22
ARG DEBIAN_FRONTEND=noninteractive

# Python won’t try to write .pyc or .pyo files on the import of source modules
Expand All @@ -32,9 +30,6 @@ ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV PATH /opt/aws/neuron/bin/:$PATH
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
ENV DGLBACKEND=pytorch

RUN apt-get update \

Check failure on line 34 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3008 warning: Pin versions in apt get install. Instead of `apt-get install <package>` use `apt-get install <package>=<version>`
&& apt-get upgrade -y \
Expand All @@ -45,45 +40,35 @@ RUN apt-get update \
curl \
emacs \
git \
gnupg2 \
gpg-agent \
jq \
libopencv-dev \
openjdk-8-jdk-headless \
openjdk-8-jdk \
openjdk-8-jre \
libglib2.0-0 \
libgl1-mesa-glx \
libsm6 \
libxext6 \
libxrender-dev \
openjdk-11-jdk \
software-properties-common \
wget \
unzip \
vim \
zlib1g-dev \
openssl \
libssl-dev \
libsqlite3-dev \
libgdbm-dev \
libc6-dev \
libbz2-dev \
libncurses-dev \
tk-dev \
libffi-dev \
libcap-dev \
gnupg2 \
gpg-agent \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
libhwloc-dev \
openjdk-8-jdk-headless \
openjdk-8-jdk \
openjdk-8-jre \
openjdk-11-jdk \
openssl \
software-properties-common \
tk-dev \
unzip \
wget \
vim \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean
Expand All @@ -100,6 +85,15 @@ RUN mkdir -p /tmp/openmpi \
&& ldconfig \
&& rm -rf /tmp/openmpi

# Install packages and configure SSH for MPI operator in k8s
RUN apt-get update && apt-get install -y openmpi-bin openssh-server \

Check failure on line 89 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3008 warning: Pin versions in apt get install. Instead of `apt-get install <package>` use `apt-get install <package>=<version>`

Check failure on line 89 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3015 info: Avoid additional packages by specifying `--no-install-recommends`
&& mkdir -p /var/run/sshd \
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

# install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \

Check failure on line 98 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

SC2046 warning: Quote this to prevent word splitting.

Check failure on line 98 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL3003 warning: Use WORKDIR to switch to a directory
&& tar -xzf Python-$PYTHON_VERSION.tgz \
Expand All @@ -122,76 +116,31 @@ ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

Check failure on line 117 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL4006 warning: Set the SHELL option -o pipefail before RUN with a pipe in it. If you are using /bin/sh in an alpine image or if your shell is symlinked to busybox then consider explicitly setting your SHELL to /bin/ash, or disable this check

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

RUN ${PIP} install --no-cache-dir -U \
"bokeh>=2.3,<3" \
"awscli<2" \
scipy \
click \
"cryptography" \
"sagemaker>=2,<2.184" \
"sagemaker-pytorch-training" \
psutil==5.6.7 \
dataset \
transformers==4.36.2 \
Pillow
# Install Neuron Driver, Runtime and Tools
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

Check failure on line 123 in docker/jax/training/0.4/Dockerfile.neuronx

View workflow job for this annotation

GitHub Actions / dockerfile-linter

DL4006 warning: Set the SHELL option -o pipefail before RUN with a pipe in it. If you are using /bin/sh in an alpine image or if your shell is symlinked to busybox then consider explicitly setting your SHELL to /bin/ash, or disable this check

RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Add Neuron PATH
ENV PATH="/opt/aws/neuron/bin:${PATH}"

# Install AWS CLI
RUN ${PIP} install --no-cache-dir -U "awscli<2"

# Install JAX & Neuron CC
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com

RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com

## Installation for Neuronx Distributed Training framework
# Install Cython
RUN pip install --no-cache-dir Cython

# Copy the apex_setup.py file
COPY apex_setup.py /root/apex_setup.py

# Clone and build Apex
RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
&& cd /root/apex \
&& git checkout 23.05 \
&& cp /root/apex_setup.py setup.py \
&& python3 setup.py bdist_wheel

#Install dependencies from requirements and extras for SageMaker usecase
RUN wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/master/requirements.txt \
&& pip install --no-cache-dir -r requirements.txt /root/apex/dist/apex-0.1-py3-none-any.whl \
&& pip install --force-reinstall "multiprocess==0.70.16" \
"dill==0.3.8" \
"torch==1.13.1"


RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com

# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
# awscli 1.25.47 has requirement docutils<0.17,>=0.10
# etcd for kubernetes installation
# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
RUN ${PIP} install --no-cache-dir -U \
"attrs<24,>=23.1.0" \
"protobuf>=3.18.3,<=3.20.3" \
"docutils>=0.10,<0.17" \
"rsa<4.8,>=3.1.2" \
"python-etcd" \
"urllib3>=1.26.0,<1.27"

# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
RUN pip install --no-cache-dir -U \
"bokeh>=3.0.1,<4" \
"imageio>=2.22,<3" \
"opencv-python>=4.8.1.78" \
"plotly>=5.11,<6" \
"seaborn>=0.12,<1" \
"shap>=0.41,<1"
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com

# EFA Installer does apt get. Make sure to run apt update before that
RUN apt-get update
Expand All @@ -205,27 +154,14 @@ RUN cd $HOME \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME


# Clean up after apt update
RUN rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install some common packages used by training scripts
# torchvision needed for MLP. since it depends on torch and torch neuron/torch
# is already installed install it with nodeps
RUN pip3 install --no-cache-dir --no-deps -U \
torchvision==0.14.*

# Needed for running bert training scripts
RUN pip3 install --no-cache-dir -U \
graphviz \
tensorboard==2.6 \
accelerate \
sentencepiece!=0.1.92 \
h5py \
requests

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
Expand All @@ -241,8 +177,8 @@ RUN HOME_DIR=/root \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*

RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt

# Starts framework
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]

HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
27 changes: 27 additions & 0 deletions docker/jax/training/0.4/Dockerfile.neuronx.cve_allowlist.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"CVE-2024-35195": {
"description": "Requests is a HTTP library. Prior to 2.32.0, when making requests through a Requests `Session`, if the first request is made with `verify=False` to disable cert verification, all subsequent requests to the same host will continue to ignore cert verification regardless of changes to the value of `verify`. This behavior will continue for the lifecycle of the connection in the connection pool. This vulnerability is fixed in 2.32.0.",
"remediation": {
"recommendation": {
"text": "None Provided"
}
},
"score": 0.0,
"score_details": {},
"severity": "UNTRIAGED",
"source": "NVD",
"source_url": "https://nvd.nist.gov/vuln/detail/CVE-2024-35195",
"status": "ACTIVE",
"title": "CVE-2024-35195 - requests",
"vulnerability_id": "CVE-2024-35195",
"vulnerable_packages": [
{
"epoch": 0,
"filePath": "usr/local/lib/python3.10/site-packages/requests-2.31.0.dist-info/METADATA",
"name": "requests",
"packageManager": "PYTHONPKG",
"version": "2.31.0"
}
]
}
}
Loading

0 comments on commit 315b77b

Please sign in to comment.