diff --git a/build/builder.py b/build/builder.py
index 1523593e5..e094b4e7a 100644
--- a/build/builder.py
+++ b/build/builder.py
@@ -386,6 +386,7 @@ def _initialize_model(
                 "Cannot load specified DSO to MPS. Attempting to load model to CPU instead"
             )
             builder_args.device = "cpu"
+
         # Replace model forward with the AOT-compiled forward
         # This is a hacky way to quickly demo AOTI's capability.
         # model is still a Python object, and any mutation to its
diff --git a/config/data/desktop.json b/config/data/desktop.json
index 2dbdc102f..f75e926ce 100644
--- a/config/data/desktop.json
+++ b/config/data/desktop.json
@@ -1,4 +1,4 @@
 {
     "executor": {"accelerator": "fast"},
-    "precision": {"dtype" : "fast16"},
+    "precision": {"dtype" : "fast16"}
 }
diff --git a/generate.py b/generate.py
index 9824943ae..72dec587f 100644
--- a/generate.py
+++ b/generate.py
@@ -615,11 +615,7 @@ def _main(
     # arbitrarily large number as chat mode goes until max_seq length
     # or user exits
     num_samples = generator_args.num_samples if not generator_args.chat_mode else 100000
-    i = (
-        -1
-    )  # long loop and Im scared someone will add a continue in it, so start at -1 and increment at the start
-    while i < num_samples:
-        i += 1
+    for i in range(num_samples):
         device_sync(device=builder_args.device)
         if i >= 0 and generator_args.chat_mode:
             prompt = input("User: ")
diff --git a/install_requirements.sh b/install_requirements.sh
index cfb2862fa..7ab32a287 100755
--- a/install_requirements.sh
+++ b/install_requirements.sh
@@ -39,7 +39,7 @@ $PIP_EXECUTABLE install -r requirements.txt --extra-index-url https://download.p
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240422
+NIGHTLY_VERSION=dev20240507
 
 # The pip repository that hosts nightly torch packages. cpu by default.
 # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
@@ -47,6 +47,8 @@ NIGHTLY_VERSION=dev20240422
 if [[ -x "$(command -v nvidia-smi)" ]];
 then
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121"
+  # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
+  $PIP_EXECUTABLE uninstall -y triton
 else
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu"
 fi
diff --git a/qops.py b/qops.py
index ab86250ff..b4f172163 100644
--- a/qops.py
+++ b/qops.py
@@ -15,7 +15,7 @@ def linear_int8_aoti(input, weight, scales):
     scales = scales.view(-1)
     if (
         torch.compiler.is_compiling()
-        or input.device.type != "cpu"
+        or input.device.type not in ["cpu", "mps"]
         or not hasattr(torch.ops.aten, "_weight_int8pack_mm")
     ):
         lin = F.linear(input, weight.to(dtype=input.dtype))
@@ -395,9 +395,15 @@ def _prepare_weight_and_scales_and_zeros(
         weight_int32, scales_and_zeros = group_quantize_tensor(
             weight_bf16, n_bit=4, groupsize=groupsize
         )
-        weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
-            weight_int32, inner_k_tiles
-        )
+        if weight_bf16.device.type == "mps":
+            # There is no MPS-accelerated conversion op yet
+            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
+                weight_int32.cpu(), inner_k_tiles
+            ).to("mps")
+        else:
+            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
+                weight_int32, inner_k_tiles
+            )
         return weight_int4pack, scales_and_zeros
 
     @classmethod
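
Note on the qops.py change: torch.ops.aten._convert_weight_to_int4pack has no MPS kernel in the pinned nightlies, so the packing runs on CPU and the result is moved back to the original device. A minimal standalone sketch of that fallback pattern (the helper name below is illustrative, not part of qops.py, and it assumes the op accepts the int32 weight produced by the group quantizer, as in the hunk above):

    import torch

    def pack_int4_weight(weight_int32: torch.Tensor, inner_k_tiles: int) -> torch.Tensor:
        # The conversion op has no MPS kernel here, so for MPS tensors we
        # round-trip through CPU and move the packed result back afterwards.
        if weight_int32.device.type == "mps":
            packed = torch.ops.aten._convert_weight_to_int4pack(
                weight_int32.cpu(), inner_k_tiles
            )
            return packed.to("mps")
        return torch.ops.aten._convert_weight_to_int4pack(weight_int32, inner_k_tiles)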