From 23610c74032f70596431a43fc855d2547e375da3 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Wed, 8 May 2024 11:16:55 -0700
Subject: [PATCH 1/4] off by one (#724)

---
 generate.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/generate.py b/generate.py
index 9824943ae..72dec587f 100644
--- a/generate.py
+++ b/generate.py
@@ -615,11 +615,7 @@ def _main(
     # arbitrarily large number as chat mode goes until max_seq length
     # or user exits
     num_samples = generator_args.num_samples if not generator_args.chat_mode else 100000
-    i = (
-        -1
-    )  # long loop and Im scared someone will add a continue in it, so start at -1 and increment at the start
-    while i < num_samples:
-        i += 1
+    for i in range(num_samples):
         device_sync(device=builder_args.device)
         if i >= 0 and generator_args.chat_mode:
             prompt = input("User: ")

From 4b699851c455331c7a7ad6f29d04f8ba912e924d Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 9 May 2024 00:59:49 -0700
Subject: [PATCH 2/4] Update desktop.json (#664)

No quantization until we move the PyTorch pin to include the new MPS
quantized kernels
---
 config/data/desktop.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/data/desktop.json b/config/data/desktop.json
index 2dbdc102f..f75e926ce 100644
--- a/config/data/desktop.json
+++ b/config/data/desktop.json
@@ -1,4 +1,4 @@
 {
     "executor": {"accelerator": "fast"},
-    "precision": {"dtype" : "fast16"},
+    "precision": {"dtype" : "fast16"}
 }

From 8a59fd37b998ab4b6f5af68c4a8eb6756eed1296 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Date: Thu, 9 May 2024 01:35:39 -0700
Subject: [PATCH 3/4] Update PyTorch pin and enable MPS qops (#725)

* Update PyTorch pin

And enable linear:int8 and linear:int4 acceleration on MPS

* Update run-readme-pr.yml

* Update install_requirements.sh
---
 .github/workflows/run-readme-pr.yml |  8 ++++----
 install_requirements.sh             |  4 +++-
 qops.py                             | 14 ++++++++++----
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml
index d650bf7d4..b53a79d87 100644
--- a/.github/workflows/run-readme-pr.yml
+++ b/.github/workflows/run-readme-pr.yml
@@ -20,10 +20,10 @@ jobs:
         uname -a
         echo "::endgroup::"
 
-        # echo "::group::Install newer objcopy that supports --set-section-alignment"
-        # yum install -y devtoolset-10-binutils
-        # export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
-        # echo "::endgroup::"
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
 
         echo "::group::Create script to run README"
         python3 scripts/updown.py --file README.md --replace 'llama3:stories15M,-l 3:-l 2,meta-llama/Meta-Llama-3-8B-Instruct:stories15M' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh

diff --git a/install_requirements.sh b/install_requirements.sh
index cfb2862fa..7ab32a287 100755
--- a/install_requirements.sh
+++ b/install_requirements.sh
@@ -39,7 +39,7 @@ $PIP_EXECUTABLE install -r requirements.txt --extra-index-url https://download.p
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240422
+NIGHTLY_VERSION=dev20240507
 
 # The pip repository that hosts nightly torch packages. cpu by default.
 # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
@@ -47,6 +47,8 @@ NIGHTLY_VERSION=dev20240422
 if [[ -x "$(command -v nvidia-smi)" ]];
 then
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121"
+  # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
+  $PIP_EXECUTABLE uninstall -y triton
 else
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu"
 fi

diff --git a/qops.py b/qops.py
index ab86250ff..b4f172163 100644
--- a/qops.py
+++ b/qops.py
@@ -15,7 +15,7 @@ def linear_int8_aoti(input, weight, scales):
     scales = scales.view(-1)
     if (
         torch.compiler.is_compiling()
-        or input.device.type != "cpu"
+        or input.device.type not in ["cpu", "mps"]
         or not hasattr(torch.ops.aten, "_weight_int8pack_mm")
     ):
         lin = F.linear(input, weight.to(dtype=input.dtype))
@@ -395,9 +395,15 @@ def _prepare_weight_and_scales_and_zeros(
         weight_int32, scales_and_zeros = group_quantize_tensor(
             weight_bf16, n_bit=4, groupsize=groupsize
         )
-        weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
-            weight_int32, inner_k_tiles
-        )
+        if weight_bf16.device.type == "mps":
+            # There are still no MPS-accelerated conversion OP
+            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
+                weight_int32.cpu(), inner_k_tiles
+            ).to("mps")
+        else:
+            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
+                weight_int32, inner_k_tiles
+            )
         return weight_int4pack, scales_and_zeros
 
     @classmethod

From a89913d2842aa07dcc934a7fc2bb5231331a9a1c Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 9 May 2024 10:43:31 -0700
Subject: [PATCH 4/4] handle device=mps when loading DSOs by issuing warning
 and loading cpu model (#731)

---
 build/builder.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/build/builder.py b/build/builder.py
index 84ffccabf..60c833361 100644
--- a/build/builder.py
+++ b/build/builder.py
@@ -381,6 +381,10 @@ def _initialize_model(
     print(f"Time to load model: {time.time() - t0:.02f} seconds")
 
     try:
+        if "mps" in builder_args.device:
+            print("Warning: MPS currently does not support DSO models. Trying to load for CPU.")
+            builder_args.device = "cpu"
+
         # Replace model forward with the AOT-compiled forward
         # This is a hacky way to quickly demo AOTI's capability.
         # model is still a Python object, and any mutation to its