Merge branch 'main' into aoti-runner
mikekgfb committed May 12, 2024
2 parents 15a04bd + baea3de commit 838e19b
Showing 10 changed files with 161 additions and 84 deletions.
36 changes: 26 additions & 10 deletions .github/workflows/run-readme-periodic.yml
@@ -17,6 +17,7 @@ jobs:
secrets-env: "HF_TOKEN_PERIODIC"
gpu-arch-type: cuda
gpu-arch-version: "12.1"
timeout: 60
script: |
echo "::group::Print machine info"
uname -a
@@ -27,15 +28,8 @@ jobs:
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"
# echo "::group::get_llama"
# (
# set +x
# HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" bash .ci/scripts/download_llama.sh
# )
# echo "::endgroup::"
echo "::group::Create script to run README"
python3 scripts/updown.py --file README.md > ./run-readme.sh
python3 scripts/updown.py --create-sections --file README.md > ./run-readme.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-readme.sh
@@ -48,8 +42,31 @@ jobs:
bash -x ./run-readme.sh
echo "::endgroup::"
echo "::group::Completion"
echo "tests complete"
echo "*******************************************"
echo "::endgroup::"
test-quantization-any:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
secrets: inherit
gpu-arch-type: cuda
gpu-arch-version: "12.1"
timeout: 60
script: |
echo "::group::Print machine info"
uname -a
echo "::endgroup::"
echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md > ./run-quantization.sh
python3 scripts/updown.py --create-sections --file docs/quantization.md > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
@@ -66,4 +83,3 @@ jobs:
echo "tests complete"
echo "*******************************************"
echo "::endgroup::"
10 changes: 0 additions & 10 deletions .github/workflows/run-readme-pr-macos.yml
@@ -33,11 +33,6 @@ jobs:
sysctl machdep.cpu.core_count
echo "::endgroup::"
# echo "::group::Install newer objcopy that supports --set-section-alignment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Create script to run README"
python3 scripts/updown.py --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
# for good measure, if something happened to updown processor,
@@ -85,11 +80,6 @@ jobs:
sysctl machdep.cpu.core_count
echo "::endgroup::"
# echo "::group::Install newer objcopy that supports --set-section-alignment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
88 changes: 44 additions & 44 deletions .github/workflows/run-readme-pr-mps.yml
@@ -9,7 +9,7 @@ jobs:
test-readme-mps-macos:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
with:
runner: macos-m1-stable # neeps MPS, was macos-m1-stable
runner: macos-m1-14
script: |
conda create -y -n test-readme-mps-macos python=3.10.11
conda activate test-readme-mps-macos
@@ -49,46 +49,46 @@ jobs:
echo "*******************************************"
echo "::endgroup::"

# test-quantization-mps-macos:
# uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
# with:
# runner: macos-m1-stable # neeps MPS, was macos-m1-stable
# script: |
# set -x
# conda create -y -n test-quantization-mps-macos python=3.10.11
# conda activate test-quantization-mps-macos
# # NS: Remove previous installation of torch first
# # as this script does not isntall anything into conda env but rather as system dep
# pip3 uninstall -y torch || true
# set -eou pipefail
#
# echo "::group::Print machine info"
# uname -a
# sysctl machdep.cpu.brand_string
# sysctl machdep.cpu.core_count
# echo "::endgroup::"
#
# # echo "::group::Install newer objcopy that supports --set-section-alignment"
# # yum install -y devtoolset-10-binutils
# # export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# # echo "::endgroup::"
#
# echo "::group::Create script to run quantization"
# python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# # for good measure, if something happened to updown processor,
# # and it did not error out, fail with an exit 1
# echo "exit 1" >> ./run-quantization.sh
# echo "::endgroup::"
#
# echo "::group::Run quantization"
# echo "*******************************************"
# cat ./run-quantization.sh
# echo "*******************************************"
# bash -x ./run-quantization.sh
# echo "::endgroup::"
#
# echo "::group::Completion"
# echo "tests complete"
# echo "*******************************************"
# echo "::endgroup::"
#
test-quantization-mps-macos:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
with:
runner: macos-m1-stable # needs MPS, was macos-m1-stable
script: |
set -x
conda create -y -n test-quantization-mps-macos python=3.10.11
conda activate test-quantization-mps-macos
# NS: Remove previous installation of torch first
# as this script does not install anything into the conda env
# but rather as a system dep
pip3 uninstall -y torch || true
set -eou pipefail

echo "::group::Print machine info"
uname -a
sysctl machdep.cpu.brand_string
sysctl machdep.cpu.core_count
echo "::endgroup::"

# echo "::group::Install newer objcopy that supports --set-section-algnment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
echo "::endgroup::"
echo "::group::Run quantization"
echo "*******************************************"
cat ./run-quantization.sh
echo "*******************************************"
bash -x ./run-quantization.sh
echo "::endgroup::"
echo "::group::Completion"
echo "tests complete"
echo "*******************************************"
echo "::endgroup::"
13 changes: 7 additions & 6 deletions .github/workflows/run-readme-pr.yml
@@ -12,6 +12,7 @@ jobs:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
secrets-env: "HF_TOKEN_PERIODIC"
gpu-arch-type: cuda
gpu-arch-version: "12.1"
timeout: 60
@@ -26,7 +27,7 @@ jobs:
echo "::endgroup::"
echo "::group::Create script to run README"
python3 scripts/updown.py --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
python3 scripts/updown.py --create-sections --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-readme.sh
@@ -56,13 +57,13 @@ jobs:
uname -a
echo "::endgroup::"
# echo "::group::Install newer objcopy that supports --set-section-alignment"
# yum install -y devtoolset-10-binutils
# export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
# echo "::endgroup::"
echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"
echo "::group::Create script to run quantization"
python3 scripts/updown.py --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
python3 scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
2 changes: 1 addition & 1 deletion README.md
@@ -268,7 +268,7 @@ For more information run `python3 torchchat.py eval --help`

Eager mode:
```
python3 torchchat.py eval llama3 -d fp32 --limit 5
python3 torchchat.py eval llama3 --dtype fp32 --limit 5
```

To test the perplexity for a lowered or quantized model, pass it in
7 changes: 5 additions & 2 deletions cli.py
@@ -7,6 +7,7 @@
import json
import logging
import os
import sys
from pathlib import Path

import torch
@@ -20,8 +21,7 @@
logging.basicConfig(filename="/tmp/torchchat.log", level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)


default_device = "fast"
default_device = os.getenv("TORCHCHAT_DEVICE", "fast")
default_model_dir = Path(
os.getenv("TORCHCHAT_MODELDIR", "~/.torchchat/model-cache")
).expanduser()
@@ -311,6 +311,9 @@ def arg_init(args):
f"You are using PyTorch {torch.__version__}. At this time, torchchat uses the latest PyTorch technology with high-performance kernels only available in PyTorch nightly until the PyTorch 2.4 release"
)

if sys.version_info.major != 3 or sys.version_info.minor < 10:
raise RuntimeError("Please use Python 3.10 or later.")

if hasattr(args, "quantize") and Path(args.quantize).is_file():
with open(args.quantize, "r") as f:
args.quantize = json.loads(f.read())
3 changes: 2 additions & 1 deletion docs/quantization.md
@@ -1,9 +1,10 @@

# Quantization

<!--
[shell default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" huggingface-cli login

[shell default]: TORCHCHAT_ROOT=${PWD} ./scripts/install_et.sh
-->

## Introduction
Quantization focuses on reducing the precision of model parameters and computations from floating-point to lower-bit integers, such as 8-bit integers. This approach aims to minimize memory requirements, accelerate inference speeds, and decrease power consumption, making models more feasible for deployment on edge devices with limited computational resources. For high-performance devices such as GPUs, quantization also reduces the required memory bandwidth and takes advantage of the massive compute capabilities provided by today's server-based accelerators.
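As a rough sketch of the idea (not torchchat's actual quantization path; the helper names below are hypothetical), per-tensor symmetric int8 quantization can be illustrated in a few lines of PyTorch:

```python
import torch

def quantize_int8(w: torch.Tensor):
    # Per-tensor symmetric scale: map the largest magnitude onto the int8 range [-127, 127].
    scale = w.abs().max().clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(w / scale), -127, 127).to(torch.int8)
    return q, scale

def dequantize_int8(q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Recover an approximation of the original fp32 tensor.
    return q.to(torch.float32) * scale

w = torch.randn(256, 256)              # stand-in for an fp32 weight tensor
q, scale = quantize_int8(w)
w_hat = dequantize_int8(q, scale)
print(f"fp32 bytes: {w.numel() * w.element_size()}, int8 bytes: {q.numel() * q.element_size()}")
print(f"max abs reconstruction error: {(w - w_hat).abs().max().item():.4f}")
```

Production schemes add refinements such as group-wise scales and sub-8-bit widths, but the memory-for-precision trade-off shown here is the core idea.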
23 changes: 17 additions & 6 deletions generate.py
@@ -726,9 +726,10 @@ def callback(x):
)
aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
start_pos += y.size(0)
if i == -1:
logging.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
continue
jit_compile = (i == 0) and (
generator_args.compile or generator_args.compile_prefill
)
compilation_time = time.perf_counter() - t0
if hasattr(prof, "export_chrome_trace"):
if use_tp:
prof.export_chrome_trace(f"{profile}_rank_{rank}.json")
Expand All @@ -738,18 +739,28 @@ def callback(x):
t = time.perf_counter() - t0

print()
if start_pos >= max_seq_length:
print(f"[Max Sequence Length Reached. Ending Conversation.]")
print(f"---------------------------------------------------")

tokens_generated = y.size(0) - prompt_length
tokens_sec = tokens_generated / t
aggregate_metrics["tokens_per_sec"].append(tokens_sec)

if jit_compile:
print(f"JIT compilation time (incl runtime): {compilation_time:.2} seconds")
# Don't continue here.... because we need to report and reset
# continue

print(
f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec"
)
print(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")

if i == 0:
print(
f"*** This first iteration will include cold start effects for dynamic import, hardware caches{', JIT compilation' if jit_compile else ''}. ***"
)
if start_pos >= max_seq_length:
print(f"[Max Sequence Length Reached. Ending Conversation.]")
print(f"---------------------------------------------------")
if generator_args.chat_mode:
break

9 changes: 8 additions & 1 deletion install_requirements.sh
@@ -17,6 +17,13 @@ then
fi
fi

# Check python version. Expect 3.10.x or 3.11.x
printf "import sys\nif sys.version_info.major != 3 or sys.version_info.minor < 10 :\n\tprint('Please use Python >=3.10');sys.exit(1)\n" | python3
if [[ $? -ne 0 ]]
then
exit 1
fi

if [[ "$PYTHON_EXECUTABLE" == "python" ]];
then
PIP_EXECUTABLE=pip
@@ -44,7 +51,7 @@ NIGHTLY_VERSION=dev20240507
# The pip repository that hosts nightly torch packages. cpu by default.
# If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
# with cuda for faster execution on cuda GPUs.
if [[ -x "$(command -v nvidia-smi)" ]];
if [[ -x "$(command -v nvidia-smi)" ]];
then
TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121"
# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same