Merge branch 'main' into aoti-runner
mikekgfb committed May 12, 2024
2 parents 15a04bd + baea3de commit 838e19b
Showing 10 changed files with 161 additions and 84 deletions.
36 changes: 26 additions & 10 deletions .github/workflows/run-readme-periodic.yml
@@ -17,6 +17,7 @@ jobs:
secrets-env: "HF_TOKEN_PERIODIC"
gpu-arch-type: cuda
gpu-arch-version: "12.1"
timeout: 60
script: |
echo "::group::Print machine info"
uname -a
@@ -27,15 +28,8 @@ jobs:
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"
# echo "::group::get_llama"
# (
# set +x
# HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" bash .ci/scripts/download_llama.sh
# )
# echo "::endgroup::"
echo "::group::Create script to run README"
python3 scripts/updown.py --file README.md > ./run-readme.sh
python3 scripts/updown.py --create-sections --file README.md > ./run-readme.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-readme.sh
@@ -48,8 +42,31 @@ jobs:
bash -x ./run-readme.sh
echo "::endgroup::"
echo "::group::Completion"
echo "tests complete"
echo "*******************************************"
echo "::endgroup::"
test-quantization-any:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
secrets: inherit
gpu-arch-type: cuda
gpu-arch-version: "12.1"
timeout: 60
script: |
echo "::group::Print machine info"
uname -a
echo "::endgroup::"
echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md > ./run-quantization.sh
python3 scripts/updown.py --create-sections --file docs/quantization.md > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
@@ -66,4 +83,3 @@ jobs:
echo "tests complete"
echo "*******************************************"
echo "::endgroup::"
10 changes: 0 additions & 10 deletions .github/workflows/run-readme-pr-macos.yml
@@ -33,11 +33,6 @@ jobs:
sysctl machdep.cpu.core_count
echo "::endgroup::"
# echo "::group::Install newer objcopy that supports --set-section-alignment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Create script to run README"
python3 scripts/updown.py --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
# for good measure, if something happened to updown processor,
@@ -85,11 +80,6 @@ jobs:
sysctl machdep.cpu.core_count
echo "::endgroup::"
# echo "::group::Install newer objcopy that supports --set-section-alignment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
88 changes: 44 additions & 44 deletions .github/workflows/run-readme-pr-mps.yml
@@ -9,7 +9,7 @@ jobs:
test-readme-mps-macos:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
with:
runner: macos-m1-stable # neeps MPS, was macos-m1-stable
runner: macos-m1-14
script: |
conda create -y -n test-readme-mps-macos python=3.10.11
conda activate test-readme-mps-macos
@@ -49,46 +49,46 @@ jobs:
echo "*******************************************"
echo "::endgroup::"

# test-quantization-mps-macos:
# uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
# with:
# runner: macos-m1-stable # neeps MPS, was macos-m1-stable
# script: |
# set -x
# conda create -y -n test-quantization-mps-macos python=3.10.11
# conda activate test-quantization-mps-macos
# # NS: Remove previous installation of torch first
# # as this script does not isntall anything into conda env but rather as system dep
# pip3 uninstall -y torch || true
# set -eou pipefail
#
# echo "::group::Print machine info"
# uname -a
# sysctl machdep.cpu.brand_string
# sysctl machdep.cpu.core_count
# echo "::endgroup::"
#
# # echo "::group::Install newer objcopy that supports --set-section-alignment"
# # yum install -y devtoolset-10-binutils
# # export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# # echo "::endgroup::"
#
# echo "::group::Create script to run quantization"
# python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# # for good measure, if something happened to updown processor,
# # and it did not error out, fail with an exit 1
# echo "exit 1" >> ./run-quantization.sh
# echo "::endgroup::"
#
# echo "::group::Run quantization"
# echo "*******************************************"
# cat ./run-quantization.sh
# echo "*******************************************"
# bash -x ./run-quantization.sh
# echo "::endgroup::"
#
# echo "::group::Completion"
# echo "tests complete"
# echo "*******************************************"
# echo "::endgroup::"
#
test-quantization-mps-macos:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
with:
runner: macos-m1-stable # needs MPS, was macos-m1-stable
script: |
set -x
conda create -y -n test-quantization-mps-macos python=3.10.11
conda activate test-quantization-mps-macos
# NS: Remove previous installation of torch first
# as this script does not install anything into the conda env
# but rather as a system dep
pip3 uninstall -y torch || true
set -eou pipefail

echo "::group::Print machine info"
uname -a
sysctl machdep.cpu.brand_string
sysctl machdep.cpu.core_count
echo "::endgroup::"

# echo "::group::Install newer objcopy that supports --set-section-algnment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
echo "::endgroup::"
echo "::group::Run quantization"
echo "*******************************************"
cat ./run-quantization.sh
echo "*******************************************"
bash -x ./run-quantization.sh
echo "::endgroup::"
echo "::group::Completion"
echo "tests complete"
echo "*******************************************"
echo "::endgroup::"
13 changes: 7 additions & 6 deletions .github/workflows/run-readme-pr.yml
@@ -12,6 +12,7 @@ jobs:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
secrets-env: "HF_TOKEN_PERIODIC"
gpu-arch-type: cuda
gpu-arch-version: "12.1"
timeout: 60
@@ -26,7 +27,7 @@ jobs:
echo "::endgroup::"
echo "::group::Create script to run README"
python3 scripts/updown.py --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
python3 scripts/updown.py --create-sections --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-readme.sh
@@ -56,13 +57,13 @@ jobs:
uname -a
echo "::endgroup::"
# echo "::group::Install newer objcopy that supports --set-section-alignment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
python3 scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
2 changes: 1 addition & 1 deletion README.md
@@ -268,7 +268,7 @@ For more information run `python3 torchchat.py eval --help`

Eager mode:
```
python3 torchchat.py eval llama3 -d fp32 --limit 5
python3 torchchat.py eval llama3 --dtype fp32 --limit 5
```

To test the perplexity for a lowered or quantized model, pass it in
7 changes: 5 additions & 2 deletions cli.py
@@ -7,6 +7,7 @@
import json
import logging
import os
import sys
from pathlib import Path

import torch
@@ -20,8 +21,7 @@
logging.basicConfig(filename="/tmp/torchchat.log", level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)


default_device = "fast"
default_device = os.getenv("TORCHCHAT_DEVICE", "fast")
default_model_dir = Path(
os.getenv("TORCHCHAT_MODELDIR", "~/.torchchat/model-cache")
).expanduser()
@@ -311,6 +311,9 @@ def arg_init(args):
f"You are using PyTorch {torch.__version__}. At this time, torchchat uses the latest PyTorch technology with high-performance kernels only available in PyTorch nightly until the PyTorch 2.4 release"
)

if sys.version_info.major != 3 or sys.version_info.minor < 10:
raise RuntimeError("Please use Python 3.10 or later.")

if hasattr(args, "quantize") and Path(args.quantize).is_file():
with open(args.quantize, "r") as f:
args.quantize = json.loads(f.read())
3 changes: 2 additions & 1 deletion docs/quantization.md
@@ -1,9 +1,10 @@

# Quantization

<!--
[shell default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" huggingface-cli login

[shell default]: TORCHCHAT_ROOT=${PWD} ./scripts/install_et.sh
-->

## Introduction
Quantization focuses on reducing the precision of model parameters and computations from floating-point to lower-bit integers, such as 8-bit integers. This approach aims to minimize memory requirements, accelerate inference speeds, and decrease power consumption, making models more feasible for deployment on edge devices with limited computational resources. For high-performance devices such as GPUs, quantization also reduces the required memory bandwidth and takes advantage of the massive compute capabilities provided by today's server-based accelerators.
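As a rough sketch of the idea (not torchchat's actual quantization path; the helper names below are hypothetical), per-tensor symmetric int8 quantization can be illustrated in a few lines of PyTorch:

```python
import torch

def quantize_int8(w: torch.Tensor):
    # Per-tensor symmetric scale: map the largest magnitude onto the int8 range [-127, 127].
    scale = w.abs().max().clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(w / scale), -127, 127).to(torch.int8)
    return q, scale

def dequantize_int8(q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Recover an approximation of the original fp32 tensor.
    return q.to(torch.float32) * scale

w = torch.randn(256, 256)              # stand-in for an fp32 weight tensor
q, scale = quantize_int8(w)
w_hat = dequantize_int8(q, scale)
print(f"fp32 bytes: {w.numel() * w.element_size()}, int8 bytes: {q.numel() * q.element_size()}")
print(f"max abs reconstruction error: {(w - w_hat).abs().max().item():.4f}")
```

Production schemes add refinements such as group-wise scales and sub-8-bit widths, but the memory-for-precision trade-off shown here is the core idea.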
23 changes: 17 additions & 6 deletions generate.py
@@ -726,9 +726,10 @@ def callback(x):
)
aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
start_pos += y.size(0)
if i == -1:
logging.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
continue
jit_compile = (i == 0) and (
generator_args.compile or generator_args.compile_prefill
)
compilation_time = time.perf_counter() - t0
if hasattr(prof, "export_chrome_trace"):
if use_tp:
prof.export_chrome_trace(f"{profile}_rank_{rank}.json")
Expand All @@ -738,18 +739,28 @@ def callback(x):
t = time.perf_counter() - t0

print()
if start_pos >= max_seq_length:
print(f"[Max Sequence Length Reached. Ending Conversation.]")
print(f"---------------------------------------------------")

tokens_generated = y.size(0) - prompt_length
tokens_sec = tokens_generated / t
aggregate_metrics["tokens_per_sec"].append(tokens_sec)

if jit_compile:
print(f"JIT compilation time (incl runtime): {compilation_time:.2} seconds")
# Don't continue here.... because we need to report and reset
# continue

print(
f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec"
)
print(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")

if i == 0:
print(
f"*** This first iteration will include cold start effects for dynamic import, hardware caches{', JIT compilation' if jit_compile else ''}. ***"
)
if start_pos >= max_seq_length:
print(f"[Max Sequence Length Reached. Ending Conversation.]")
print(f"---------------------------------------------------")
if generator_args.chat_mode:
break

9 changes: 8 additions & 1 deletion install_requirements.sh
@@ -17,6 +17,13 @@ then
fi
fi

# Check python version. Expect 3.10.x or 3.11.x
printf "import sys\nif sys.version_info.major != 3 or sys.version_info.minor < 10 :\n\tprint('Please use Python >=3.10');sys.exit(1)\n" | python3
if [[ $? -ne 0 ]]
then
exit 1
fi

if [[ "$PYTHON_EXECUTABLE" == "python" ]];
then
PIP_EXECUTABLE=pip
@@ -44,7 +51,7 @@ NIGHTLY_VERSION=dev20240507
# The pip repository that hosts nightly torch packages. cpu by default.
# If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
# with cuda for faster execution on cuda GPUs.
if [[ -x "$(command -v nvidia-smi)" ]];
if [[ -x "$(command -v nvidia-smi)" ]];
then
TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121"
# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same