Skip to content

Latest commit

 

History

History
54 lines (44 loc) · 1.7 KB

README.md

File metadata and controls

54 lines (44 loc) · 1.7 KB

Guide

This is the guide to run benchmark on a Lambda Cloud On-demand instance.

# 1xGH200_96GB on Lambda Cloud On-demand
# The benchmark log files will be saved to deeplearning-benchmark/pytorch/results_v2/LambdaOD_test_1xGH200_96GB_$(hostname)_v2
export NAME_NGC=pytorch:24.10-py3
export NAME_TYPE=LambdaOD_test
export NAME_GPU=GH200_96GB
export NUM_GPU=1
export NAME_RESULTS=results_v2
export NAME_DATASET=all
export NAME_TASKS=all


# Remove sudo requirement for docker
sudo usermod -aG docker $USER && newgrp docker
docker pull nvcr.io/nvidia/${NAME_NGC}


# Clone repos
git clone https://github.com/LambdaLabsML/DeepLearningExamples.git && \
cd DeepLearningExamples && \
git checkout lambda/benchmark && \
git pull origin lambda/benchmark && \
cd ..

git clone https://github.com/lambdal/deeplearning-benchmark.git && \
cd deeplearning-benchmark/pytorch

# Prepare data
mkdir ~/data
docker run --gpus all --rm --shm-size=256g \
-v ~/DeepLearningExamples/PyTorch:/workspace/benchmark \
-v ~/data:/data \
-v $(pwd)"/scripts":/scripts \
nvcr.io/nvidia/${NAME_NGC} \
/bin/bash -c "cp -r /scripts/* /workspace;  ./run_prepare.sh $NAME_DATASET"

# Create benchmark config file
cp scripts/config_v2/config_pytorch_{NUM_GPU}x${NAME_GPU}_v2.sh scripts/config_v2/config_pytorch_${NAME_TYPE}_{NUM_GPU}x${NAME_GPU}_$(hostname)_v2.sh

# Run benchmark
mkdir -p ${NAME_RESULTS} && \
docker run --rm --shm-size=1024g \
--gpus all \
-v ~/DeepLearningExamples/PyTorch:/workspace/benchmark \
-v ~/data:/data \
-v $(pwd)"/scripts":/scripts \
-v $(pwd)/${NAME_RESULTS}:/results \
nvcr.io/nvidia/${NAME_NGC} \
/bin/bash -c "cp -r /scripts/* /workspace; ./run_benchmark.sh ${NAME_TYPE}_{NUM_GPU}x${NAME_GPU}_$(hostname)_v2 ${NAME_TASKS} 3000"