diff --git a/llama31-1213/cpu_aoti_4.txt b/llama31-1213/cpu_aoti_4.txt
new file mode 100644
index 000000000..fb9109ec1
--- /dev/null
+++ b/llama31-1213/cpu_aoti_4.txt
@@ -0,0 +1,655 @@
+python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model34.so
+OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model34.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model34.so
+Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
+NumExpr defaulting to 16 threads.
+PyTorch version 2.6.0.dev20241213+cu124 available.
+linear: model.layers.0.attention.wq, in=4096, out=4096
+linear: model.layers.0.attention.wk, in=4096, out=1024
+linear: model.layers.0.attention.wv, in=4096, out=1024
+linear: model.layers.0.attention.wo, in=4096, out=4096
+linear: model.layers.0.feed_forward.w1, in=4096, out=14336
+linear: model.layers.0.feed_forward.w2, in=14336, out=4096
+linear: model.layers.0.feed_forward.w3, in=4096, out=14336
+linear: model.layers.1.attention.wq, in=4096, out=4096
+linear: model.layers.1.attention.wk, in=4096, out=1024
+linear: model.layers.1.attention.wv, in=4096, out=1024
+linear: model.layers.1.attention.wo, in=4096, out=4096
+linear: model.layers.1.feed_forward.w1, in=4096, out=14336
+linear: model.layers.1.feed_forward.w2, in=14336, out=4096
+linear: model.layers.1.feed_forward.w3, in=4096, out=14336
+linear: model.layers.2.attention.wq, in=4096, out=4096
+linear: model.layers.2.attention.wk, in=4096, out=1024
+linear: model.layers.2.attention.wv, in=4096, out=1024
+linear: model.layers.2.attention.wo, in=4096, out=4096
+linear: model.layers.2.feed_forward.w1, in=4096, out=14336
+linear: model.layers.2.feed_forward.w2, in=14336, out=4096
+linear: model.layers.2.feed_forward.w3, in=4096, out=14336
+linear: model.layers.3.attention.wq, in=4096, out=4096
+linear: model.layers.3.attention.wk, in=4096, out=1024
+linear: model.layers.3.attention.wv, in=4096, out=1024
+linear: model.layers.3.attention.wo, in=4096, out=4096
+linear: model.layers.3.feed_forward.w1, in=4096, out=14336
+linear: model.layers.3.feed_forward.w2, in=14336, out=4096
+linear: model.layers.3.feed_forward.w3, in=4096, out=14336
+linear: model.layers.4.attention.wq, in=4096, out=4096
+linear: model.layers.4.attention.wk, in=4096, out=1024
+linear: model.layers.4.attention.wv, in=4096, out=1024
+linear: model.layers.4.attention.wo, in=4096, out=4096
+linear: model.layers.4.feed_forward.w1, in=4096, out=14336
+linear: model.layers.4.feed_forward.w2, in=14336, out=4096
+linear: model.layers.4.feed_forward.w3, in=4096, out=14336
+linear: model.layers.5.attention.wq, in=4096, out=4096
+linear: model.layers.5.attention.wk, in=4096, out=1024
+linear: model.layers.5.attention.wv, in=4096, out=1024
+linear: model.layers.5.attention.wo, in=4096, out=4096
+linear: model.layers.5.feed_forward.w1, in=4096, out=14336
+linear: model.layers.5.feed_forward.w2, in=14336, out=4096
+linear: model.layers.5.feed_forward.w3, in=4096, out=14336
+linear: model.layers.6.attention.wq, in=4096, out=4096
+linear: model.layers.6.attention.wk, in=4096, out=1024
+linear: model.layers.6.attention.wv, in=4096, out=1024
+linear: model.layers.6.attention.wo, in=4096, out=4096
+linear: model.layers.6.feed_forward.w1, in=4096, out=14336
+linear: model.layers.6.feed_forward.w2, in=14336, out=4096
+linear: model.layers.6.feed_forward.w3, in=4096, out=14336
+linear: model.layers.7.attention.wq, in=4096, out=4096
+linear: model.layers.7.attention.wk, in=4096, out=1024
+linear: model.layers.7.attention.wv, in=4096, out=1024
+linear: model.layers.7.attention.wo, in=4096, out=4096
+linear: model.layers.7.feed_forward.w1, in=4096, out=14336
+linear: model.layers.7.feed_forward.w2, in=14336, out=4096
+linear: model.layers.7.feed_forward.w3, in=4096, out=14336
+linear: model.layers.8.attention.wq, in=4096, out=4096
+linear: model.layers.8.attention.wk, in=4096, out=1024
+linear: model.layers.8.attention.wv, in=4096, out=1024
+linear: model.layers.8.attention.wo, in=4096, out=4096
+linear: model.layers.8.feed_forward.w1, in=4096, out=14336
+linear: model.layers.8.feed_forward.w2, in=14336, out=4096
+linear: model.layers.8.feed_forward.w3, in=4096, out=14336
+linear: model.layers.9.attention.wq, in=4096, out=4096
+linear: model.layers.9.attention.wk, in=4096, out=1024
+linear: model.layers.9.attention.wv, in=4096, out=1024
+linear: model.layers.9.attention.wo, in=4096, out=4096
+linear: model.layers.9.feed_forward.w1, in=4096, out=14336
+linear: model.layers.9.feed_forward.w2, in=14336, out=4096
+linear: model.layers.9.feed_forward.w3, in=4096, out=14336
+linear: model.layers.10.attention.wq, in=4096, out=4096
+linear: model.layers.10.attention.wk, in=4096, out=1024
+linear: model.layers.10.attention.wv, in=4096, out=1024
+linear: model.layers.10.attention.wo, in=4096, out=4096
+linear: model.layers.10.feed_forward.w1, in=4096, out=14336
+linear: model.layers.10.feed_forward.w2, in=14336, out=4096
+linear: model.layers.10.feed_forward.w3, in=4096, out=14336
+linear: model.layers.11.attention.wq, in=4096, out=4096
+linear: model.layers.11.attention.wk, in=4096, out=1024
+linear: model.layers.11.attention.wv, in=4096, out=1024
+linear: model.layers.11.attention.wo, in=4096, out=4096
+linear: model.layers.11.feed_forward.w1, in=4096, out=14336
+linear: model.layers.11.feed_forward.w2, in=14336, out=4096
+linear: model.layers.11.feed_forward.w3, in=4096, out=14336
+linear: model.layers.12.attention.wq, in=4096, out=4096
+linear: model.layers.12.attention.wk, in=4096, out=1024
+linear: model.layers.12.attention.wv, in=4096, out=1024
+linear: model.layers.12.attention.wo, in=4096, out=4096
+linear: model.layers.12.feed_forward.w1, in=4096, out=14336
+linear: model.layers.12.feed_forward.w2, in=14336, out=4096
+linear: model.layers.12.feed_forward.w3, in=4096, out=14336
+linear: model.layers.13.attention.wq, in=4096, out=4096
+linear: model.layers.13.attention.wk, in=4096, out=1024
+linear: model.layers.13.attention.wv, in=4096, out=1024
+linear: model.layers.13.attention.wo, in=4096, out=4096
+linear: model.layers.13.feed_forward.w1, in=4096, out=14336
+linear: model.layers.13.feed_forward.w2, in=14336, out=4096
+linear: model.layers.13.feed_forward.w3, in=4096, out=14336
+linear: model.layers.14.attention.wq, in=4096, out=4096
+linear: model.layers.14.attention.wk, in=4096, out=1024
+linear: model.layers.14.attention.wv, in=4096, out=1024
+linear: model.layers.14.attention.wo, in=4096, out=4096
+linear: model.layers.14.feed_forward.w1, in=4096, out=14336
+linear: model.layers.14.feed_forward.w2, in=14336, out=4096
+linear: model.layers.14.feed_forward.w3, in=4096, out=14336
+linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: 
model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +W1217 20:17:50.017740 1053581 
site-packages/torch/_export/__init__.py:276] +============================+ +W1217 20:17:50.018219 1053581 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 20:17:50.018440 1053581 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 20:17:50.018627 1053581 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +W1217 20:19:14.752341 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:14.758008 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:14.759118 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.477747 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.556130 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.558589 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.592000 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.668948 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.712866 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.714414 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.907057 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.984346 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:15.985764 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.004584 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.093100 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.094341 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim 
implementation, using proxy executor as fallback +W1217 20:19:16.095189 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.306921 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.359418 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.361098 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.379564 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.444402 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.483523 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.484980 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.658927 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.727626 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.729055 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.748685 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.839107 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.840378 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:16.841265 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.050923 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.105028 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.106702 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.125577 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] 
aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.192155 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.231954 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.233426 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.408941 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.477298 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.478715 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.499262 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.586895 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.588148 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.589000 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.802783 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.859436 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.861092 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.879744 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.943822 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.982570 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:17.984050 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.162103 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.234384 1053581 
site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.235791 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.254976 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.346359 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.347687 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.348557 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.569108 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.628752 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.630521 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.651232 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.719131 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.763623 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.765239 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:18.955900 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.029084 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.030589 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.050292 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.146188 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.147502 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as 
fallback +W1217 20:19:19.148389 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.417901 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.470551 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.472659 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.490988 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.557101 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.598272 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.599764 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.772974 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.846216 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.847644 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.865986 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.959597 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.960884 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:19.961727 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.183543 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.242976 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.245069 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.266511 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim 
implementation, using proxy executor as fallback +W1217 20:19:20.331326 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.372670 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.374189 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.550678 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.621627 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.623005 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.640106 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.733005 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.734366 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.735251 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:20.947406 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.001752 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.003458 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.022732 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.088229 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.129429 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.130929 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.303266 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.374888 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] 
aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.376276 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.396691 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.493068 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.494328 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.495182 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.719799 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.775537 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.777211 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.795950 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.861169 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.900308 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:21.901897 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.081641 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.152060 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.153471 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.171422 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.261556 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.262854 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.263756 1053581 
site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.489795 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.550853 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.552608 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.571038 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.634724 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.674166 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.675602 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.850249 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.927492 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.928954 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:22.951629 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.056945 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.058847 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.060168 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.319425 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.382362 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.384152 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.405132 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as 
fallback +W1217 20:19:23.475934 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.518651 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.520238 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.706735 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.779945 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.781526 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.802152 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.896472 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.897809 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:23.898684 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.124679 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.184607 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.186349 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.206017 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.274349 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.314822 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.316271 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.533498 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.605424 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim 
implementation, using proxy executor as fallback +W1217 20:19:24.607002 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.626242 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.722759 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.724053 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.725191 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:24.957913 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.014861 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.016978 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.037420 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.113034 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.154092 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.155553 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.366088 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.443508 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.444969 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.464361 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.566542 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.567835 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.568686 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] 
aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.789354 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.843644 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.845608 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.865914 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.928810 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.970228 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:25.971792 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.165763 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.239942 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.241449 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.260365 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.351793 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.353021 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.353847 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.574204 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.632282 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.633995 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.653604 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.719658 1053581 
site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.760519 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.762005 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:26.950838 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.030345 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.032096 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.055326 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.159719 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.161078 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.161937 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.400845 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.460188 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.461902 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.482922 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.551649 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.592018 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.593478 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.780360 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.850134 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as 
fallback +W1217 20:19:27.851500 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.869624 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 20:19:27.917501 1053581 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_1(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:738:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 738 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_6(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:1274:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1274 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_10(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:1780:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1780 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_15(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:2292:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2292 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_19(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:2792:31: warning: variable ‘tmp_acc0_arr’ set but not 
used [-Wunused-but-set-variable] + 2792 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_24(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:3304:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3304 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_28(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:3804:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3804 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_33(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:4316:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4316 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:4816:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4816 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:5328:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5328 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_46(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:5828:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5828 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_51(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:6340:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6340 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_55(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:6840:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6840 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_60(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:7352:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7352 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_64(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:7852:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7852 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_69(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:8364:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8364 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_73(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, 
float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:8864:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8864 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_78(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:9376:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9376 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:9876:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9876 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:10388:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10388 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_91(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:10888:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10888 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_96(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:11400:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11400 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_100(const bfloat16*, const 
int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:11900:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11900 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_105(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:12412:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12412 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_109(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:12912:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12912 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_114(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:13424:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13424 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_118(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:13924:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13924 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_123(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:14436:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14436 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void 
cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:14936:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14936 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:15448:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15448 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_136(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:15948:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15948 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_141(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cacxvxtbss2letr77wffh2m4vz25ibkjzgvre3q6tcknepfmvpry.cpp:16460:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16460 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 48.25 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model34.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model34.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model34.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. 
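The generate run that follows aborts before producing any tokens: during export, aten._weight_int4pack_mm_for_cpu had no C-shim implementation, so AOT Inductor routed it through a proxy executor, and at load time the proxy executor's call-spec JSON is not found next to the bare /tmp/model34.so ("Unable to find a proxy executor to run custom ops"). Both the deprecation banner below and the error text point at the same remedy: package the compiled model as a .pt2 artifact instead of a raw DSO, i.e. export with --output-aoti-package-path (the flag named in the export warning above) and point generate at the resulting package (torchchat appears to expose this as --aoti-package-path; treat that flag name as an assumption to verify against your checkout). At the PyTorch API level, a minimal, self-contained sketch of the recommended flow on the 2.6 nightly used here, with a toy stand-in module rather than the quantized llama3.1 graph:

    import torch
    import torch.nn as nn

    class TinyModel(nn.Module):              # stand-in for the quantized model, not llama3.1
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(16, 16)
        def forward(self, x):
            return self.linear(x)

    model = TinyModel().eval()
    example_inputs = (torch.randn(2, 16),)

    # 1) capture the module as an ExportedProgram
    ep = torch.export.export(model, example_inputs)

    # 2) compile and package kernels, weights, and custom-op metadata into one .pt2 file
    pt2_path = torch._inductor.aoti_compile_and_package(ep, package_path="/tmp/tiny_model.pt2")

    # 3) load the package and run it; no separate .so / .json pair to keep in sync
    runner = torch._inductor.aoti_load_package(pt2_path)
    out = runner(*example_inputs)
    print(out.shape)

Because aoti_compile_and_package is meant to carry the proxy-executor metadata inside the .pt2 alongside the compiled kernels and weights, ops without a C-shim (such as the int4 CPU matmul above) can still be dispatched at load time, which is exactly what the bare-.so path fails to do in the run below.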
+W1217 20:21:12.363919 1077533 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 20:21:12.364454 1077533 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 20:21:12.364640 1077533 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 20:21:12.364830 1077533 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +[E1217 20:21:12.110468883 shim_common.cpp:1155] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. +[E1217 20:21:12.110509388 shim_common.cpp:1155] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. +[E1217 20:21:12.110517383 shim_common.cpp:1155] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. +[E1217 20:21:12.110976481 shim_common.cpp:246] Exception in aoti_torch: Cannot access data pointer of Tensor that doesn't have storage +Exception raised from throw_data_ptr_access_error at /pytorch/c10/core/TensorImpl.cpp:309 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f8670b6c1b6 in /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, char const*) + 0x68 (0x7f8670b15b3f in /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #2: c10::TensorImpl::throw_data_ptr_access_error() const + 0x34 (0x7f8670b448e4 in /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #3: aoti_torch_get_data_ptr + 0xd8 (0x7f86613468a8 in /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #4: torch::aot_inductor::AOTInductorModel::run_impl(AtenTensorOpaque**, AtenTensorOpaque**, void*, AOTIProxyExecutorOpaque*) + 0x46c8 (0x7f851de02e18 in /tmp/model34.so) +frame #5: torch::aot_inductor::AOTInductorModelContainer::run(AtenTensorOpaque**, AtenTensorOpaque**, void*, AOTIProxyExecutorOpaque*) + 0xd7 (0x7f851de5b0d7 in /tmp/model34.so) +frame #6: AOTInductorModelContainerRun + 0x6a (0x7f851de35dda in /tmp/model34.so) +frame #7: torch::inductor::AOTIModelContainerRunner::run(std::vector > const&, void*) + 0xb5 (0x7f8661337355 in /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #8: torch::inductor::AOTIModelContainerRunnerCpu::run(std::vector > const&, void*) + 0xa (0x7f86613388da in /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #9: + 0x9f1fb4 (0x7f86715f1fb4 in /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libtorch_python.so) +frame #10: + 0x518bd7 (0x7f8671118bd7 in 
/home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/lib/libtorch_python.so) + +frame #51: + 0x295d0 (0x7f86738295d0 in /lib64/libc.so.6) +frame #52: __libc_start_main + 0x80 (0x7f8673829680 in /lib64/libc.so.6) + +Error: aoti_torch_get_data_ptr(handle_.get(), &result) API call failed at /home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/include/torch/csrc/inductor/aoti_runtime/utils.h, line 117 +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.51 seconds +----------------------------------------------------------- +Traceback (most recent call last): + File "/home/jackkhuu/oss/torchchat/torchchat.py", line 96, in + generate_main(args) + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 1247, in main + for _ in gen.chat(generator_args): + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 1116, in chat + for token_tensor, metrics in generator_func: + File "/home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 36, in generator_context + response = gen.send(None) + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 647, in generate + next_token = self.prefill( + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 398, in prefill + logits = model(x_sliced, ip_sliced) # (x[:, i], input_pos[i])da + File "/home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/jackkhuu/.conda/envs/script/lib/python3.10/site-packages/torch/_export/__init__.py", line 387, in optimized + flat_outputs = runner.run(flat_inputs) # type: ignore[attr-defined] +RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles.size(), output_handles.data(), output_handles.size(), reinterpret_cast(stream_handle), proxy_executor_handle_) API call failed at /pytorch/torch/csrc/inductor/aoti_runner/model_container_runner.cpp, line 107 diff --git a/llama31-1213/cpu_aoti_8.txt b/llama31-1213/cpu_aoti_8.txt new file mode 100644 index 000000000..830cff24e --- /dev/null +++ b/llama31-1213/cpu_aoti_8.txt @@ -0,0 +1,223 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model8.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model8.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model8.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. 
+NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241213+cu124 available. +W1217 19:51:35.699521 755277 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 19:51:35.700027 755277 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 19:51:35.700248 755277 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 19:51:35.700433 755277 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ 
+/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const 
int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, 
bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:15950:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cpqnh5uhfu4as5ixqda4akm5lbr5qomk5ziwaslzxynceuqnsbsr.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 48.87 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model8.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model8.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model8.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +W1217 19:55:16.425022 780783 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 19:55:16.425612 780783 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 19:55:16.425914 780783 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 19:55:16.426213 780783 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... 
+Time to load model: 0.52 seconds +----------------------------------------------------------- +Once upon a time, there was a small village nestled in the rolling hills of a far-off land. The village was surrounded by vast fields of golden wheat, which swayed gently in the breeze like a chorus of dancing women. At the heart of the village stood a small cottage, its thatched roof a warm golden brown and its door a cheerful red. This was the home of a young woman named Ember. +Ember was a skilled weaver, known throughout the village for her beautiful fabrics and colorful patterns. She spent her days sitting at her loom, the soft clickety-clack of the shuttle as she wove her magic onto the threads. Her fabrics were sought after by the villagers, who used them to make clothes, blankets, and even tapestries to adorn their walls. +One day, a stranger arrived in the village. He was a tall, dark-haired man with piercing green eyes, and a mysterious air about him. Ember was immediately drawn to him, sensing that there was more to this man than met the eye. As she watched him walk through the village, she noticed that he seemed to be looking for something - or someone. +As the stranger approached the cottage, Ember felt a shiver run down her spine. She had a feeling that her life was about to +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 417.5378 sec total +Time to first token: 11.1746 sec with sequential prefill. + + Total throughput: 0.6131 tokens/sec, 1.6310 s/token +First token throughput: 0.0895 tokens/sec, 11.1746 s/token + Next token throughput: 0.6275 tokens/sec, 1.5936 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a world not so different from our own, there was a magical kingdom hidden away from the mundane world. Here, humans and mythical creatures lived in harmony, each contributing their unique talents and abilities to the rich tapestry of society. Among the inhabitants of this enchanted realm was a young girl named Lila, with a gift for communicating with animals. +Lila lived in a small village on the outskirts of the kingdom, where animals were her constant companions. She could speak their language, and they would often offer her wisdom and guidance. As she grew older, Lila began to notice that the animals were becoming increasingly restless. They would whisper warnings of a great darkness that threatened to consume their world. +Determined to uncover the source of the danger, Lila embarked on a perilous journey to the heart of the kingdom. Along the way, she encountered a wise old owl named Orion, a mischievous shapeshifter named Zephyr, and a gentle giant named Bramble. Together, they braved treacherous landscapes, battled ferocious creatures, and unraveled ancient secrets. +As they drew closer to the heart of the kingdom, Lila discovered that the darkness was a manifestation of a long-forgotten evil, one that had been awakened by human greed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 444.2335 sec total +Time to first token: 9.2487 sec with sequential prefill. 
+ + Total throughput: 0.5763 tokens/sec, 1.7353 s/token +First token throughput: 0.1081 tokens/sec, 9.2487 s/token + Next token throughput: 0.5862 tokens/sec, 1.7058 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, there was a young woman named Emily who had always been fascinated by the world of interior design. She spent hours flipping through design magazines and scrolling through Pinterest boards, dreaming of the day when she could create her own beautiful spaces. But for now, she was stuck in a small studio apartment, surrounded by thrift store furniture and hand-me-down decor. +One day, Emily stumbled upon an old, mysterious-looking book in the depths of her local library's archives. The cover was worn and faded, but the title – "The Art of Creating Serenity" – seemed to whisper to her, beckoning her to open the cover and begin reading. +As she delved into the book, Emily discovered that it was an ancient guide to creating peaceful and harmonious spaces. The author, a wise and renowned interior designer named Sophia, shared her secrets for selecting the perfect colors, textures, and patterns to cultivate a sense of calm and tranquility. +Intrigued, Emily decided to put Sophia's principles to the test. She began by clearing out her cluttered studio apartment, donating or discarding items that no longer brought her joy. Then, she set about creating a soothing color palette, using a combination of soft blues, creamy whites, and earthy tones to create a sense +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 430.0397 sec total +Time to first token: 12.1978 sec with sequential prefill. + + Total throughput: 0.5953 tokens/sec, 1.6798 s/token +First token throughput: 0.0820 tokens/sec, 12.1978 s/token + Next token throughput: 0.6103 tokens/sec, 1.6386 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.59 +Average tokens/sec (first token): 0.09 +Average tokens/sec (next tokens): 0.61 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_aoti_b16.txt b/llama31-1213/cpu_aoti_b16.txt new file mode 100644 index 000000000..86234aa79 --- /dev/null +++ b/llama31-1213/cpu_aoti_b16.txt @@ -0,0 +1,225 @@ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model16.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model16.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model16.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241213+cu124 available. +W1217 19:28:28.191114 483908 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 19:28:28.191601 483908 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! 
| +W1217 19:28:28.191834 483908 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 19:28:28.192016 483908 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, 
float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set 
but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ 
+/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:15950:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/csftbchxggh3ohrmozpttgodvhhfcv7cwczbgt7cy7nwn2eg5wmh.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model16.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model16.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model16.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +W1217 19:31:59.883835 503176 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 19:31:59.884361 503176 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 19:31:59.884550 503176 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 19:31:59.884739 503176 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.51 seconds +----------------------------------------------------------- +Once upon a time, there was a magical kingdom hidden deep within a mysterious forest. The kingdom was ruled by a wise and just king, who was loved by all his subjects. 
The king's castle was a magnificent structure made of gold and silver, with towers that reached for the sky and gardens that bloomed with the most beautiful flowers anyone had ever seen. (1) +One day, a beautiful princess named Sophia lived in the kingdom. She was known for her kindness, intelligence, and beauty. Sophia was loved by everyone in the kingdom, and she spent her days helping those in need and learning about the world beyond her kingdom. (2) +As Sophia grew older, she began to feel a sense of restlessness. She wanted to see the world beyond her kingdom and experience new things. She longed to learn about other cultures, meet new people, and discover new lands. (3) +The king, sensing Sophia's desire for adventure, called her to his chambers one day. He told her that he had heard of a magical realm beyond their kingdom, where the people were kind, the land was beautiful, and the magic was real. (4) +The king asked Sophia if she would like to travel to this new realm, to learn about its people and its magic. Sophia was over +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 374.7422 sec total +Time to first token: 10.2841 sec with sequential prefill. + + Total throughput: 0.6831 tokens/sec, 1.4638 s/token +First token throughput: 0.0972 tokens/sec, 10.2841 s/token + Next token throughput: 0.6997 tokens/sec, 1.4292 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, there was a beautiful and wealthy merchant named Zain. Zain owned a large and prosperous trading empire that stretched across many lands. He was known for his cunning business sense and his ability to make smart investments, which had allowed him to build a vast fortune over the years. +Despite his wealth, however, Zain was a lonely man. He had never been married and had no children of his own. He had many acquaintances and business associates, but he lacked true friends and a sense of family. +One day, Zain decided that he wanted to change his life. He wanted to find someone to share his wealth and his life with, someone who would love and care for him as he grew older. So, he announced to his family and friends that he was looking for a bride, and he would pay any woman who married him to relocate to his estate and care for him in his old age. +News of Zain's announcement spread quickly, and many women from all over the region came forward to compete for his hand. Zain was pleased to see so many interested women, but he was also quite picky. He had very high standards and would only consider marrying a woman who was intelligent, kind, and beautiful. +Among the many women who came to meet +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 392.8747 sec total +Time to first token: 10.0622 sec with sequential prefill. + + Total throughput: 0.6516 tokens/sec, 1.5347 s/token +First token throughput: 0.0994 tokens/sec, 10.0622 s/token + Next token throughput: 0.6661 tokens/sec, 1.5012 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, on the Japanese island of Hokkaido, there lived a young girl named Emiko. She was a shy and reserved child, with a passion for nature and a love for the simple things in life. Emiko spent her days exploring the rugged coastline, collecting seashells and watching the waves roll in. 
She felt most alive when she was surrounded by the sights and sounds of the sea. +One day, while wandering along the beach, Emiko stumbled upon a beautiful, shimmering shell. It was unlike any she had ever seen before, with intricate patterns and colors that seemed to dance in the light. As she picked it up, she felt a strange sensation, as if the shell was alive and vibrating with energy. +Without thinking, Emiko decided to take the shell home with her, hoping to learn more about its mysterious properties. She placed it on a shelf in her small cottage, surrounded by her other treasures and trinkets. +As the days passed, Emiko began to notice strange and wondrous things happening around her. The shell seemed to be emitting a gentle, pulsating glow, and she could feel its energy resonating deep within her own body. She started to sense that the shell was trying to communicate with her, to share secrets and stories +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 346.1213 sec total +Time to first token: 6.2491 sec with sequential prefill. + + Total throughput: 0.7396 tokens/sec, 1.3520 s/token +First token throughput: 0.1600 tokens/sec, 6.2491 s/token + Next token throughput: 0.7503 tokens/sec, 1.3328 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.69 +Average tokens/sec (first token): 0.12 +Average tokens/sec (next tokens): 0.71 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_aoti_pt2_4.txt b/llama31-1213/cpu_aoti_pt2_4.txt new file mode 100644 index 000000000..b0911bdd4 --- /dev/null +++ b/llama31-1213/cpu_aoti_pt2_4.txt @@ -0,0 +1,668 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model34.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model34.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model34.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241213+cu124 available. 
+linear: model.layers.0.attention.wq, in=4096, out=4096 +linear: model.layers.0.attention.wk, in=4096, out=1024 +linear: model.layers.0.attention.wv, in=4096, out=1024 +linear: model.layers.0.attention.wo, in=4096, out=4096 +linear: model.layers.0.feed_forward.w1, in=4096, out=14336 +linear: model.layers.0.feed_forward.w2, in=14336, out=4096 +linear: model.layers.0.feed_forward.w3, in=4096, out=14336 +linear: model.layers.1.attention.wq, in=4096, out=4096 +linear: model.layers.1.attention.wk, in=4096, out=1024 +linear: model.layers.1.attention.wv, in=4096, out=1024 +linear: model.layers.1.attention.wo, in=4096, out=4096 +linear: model.layers.1.feed_forward.w1, in=4096, out=14336 +linear: model.layers.1.feed_forward.w2, in=14336, out=4096 +linear: model.layers.1.feed_forward.w3, in=4096, out=14336 +linear: model.layers.2.attention.wq, in=4096, out=4096 +linear: model.layers.2.attention.wk, in=4096, out=1024 +linear: model.layers.2.attention.wv, in=4096, out=1024 +linear: model.layers.2.attention.wo, in=4096, out=4096 +linear: model.layers.2.feed_forward.w1, in=4096, out=14336 +linear: model.layers.2.feed_forward.w2, in=14336, out=4096 +linear: model.layers.2.feed_forward.w3, in=4096, out=14336 +linear: model.layers.3.attention.wq, in=4096, out=4096 +linear: model.layers.3.attention.wk, in=4096, out=1024 +linear: model.layers.3.attention.wv, in=4096, out=1024 +linear: model.layers.3.attention.wo, in=4096, out=4096 +linear: model.layers.3.feed_forward.w1, in=4096, out=14336 +linear: model.layers.3.feed_forward.w2, in=14336, out=4096 +linear: model.layers.3.feed_forward.w3, in=4096, out=14336 +linear: model.layers.4.attention.wq, in=4096, out=4096 +linear: model.layers.4.attention.wk, in=4096, out=1024 +linear: model.layers.4.attention.wv, in=4096, out=1024 +linear: model.layers.4.attention.wo, in=4096, out=4096 +linear: model.layers.4.feed_forward.w1, in=4096, out=14336 +linear: model.layers.4.feed_forward.w2, in=14336, out=4096 +linear: model.layers.4.feed_forward.w3, in=4096, out=14336 +linear: model.layers.5.attention.wq, in=4096, out=4096 +linear: model.layers.5.attention.wk, in=4096, out=1024 +linear: model.layers.5.attention.wv, in=4096, out=1024 +linear: model.layers.5.attention.wo, in=4096, out=4096 +linear: model.layers.5.feed_forward.w1, in=4096, out=14336 +linear: model.layers.5.feed_forward.w2, in=14336, out=4096 +linear: model.layers.5.feed_forward.w3, in=4096, out=14336 +linear: model.layers.6.attention.wq, in=4096, out=4096 +linear: model.layers.6.attention.wk, in=4096, out=1024 +linear: model.layers.6.attention.wv, in=4096, out=1024 +linear: model.layers.6.attention.wo, in=4096, out=4096 +linear: model.layers.6.feed_forward.w1, in=4096, out=14336 +linear: model.layers.6.feed_forward.w2, in=14336, out=4096 +linear: model.layers.6.feed_forward.w3, in=4096, out=14336 +linear: model.layers.7.attention.wq, in=4096, out=4096 +linear: model.layers.7.attention.wk, in=4096, out=1024 +linear: model.layers.7.attention.wv, in=4096, out=1024 +linear: model.layers.7.attention.wo, in=4096, out=4096 +linear: model.layers.7.feed_forward.w1, in=4096, out=14336 +linear: model.layers.7.feed_forward.w2, in=14336, out=4096 +linear: model.layers.7.feed_forward.w3, in=4096, out=14336 +linear: model.layers.8.attention.wq, in=4096, out=4096 +linear: model.layers.8.attention.wk, in=4096, out=1024 +linear: model.layers.8.attention.wv, in=4096, out=1024 +linear: model.layers.8.attention.wo, in=4096, out=4096 +linear: model.layers.8.feed_forward.w1, in=4096, out=14336 +linear: 
model.layers.8.feed_forward.w2, in=14336, out=4096 +linear: model.layers.8.feed_forward.w3, in=4096, out=14336 +linear: model.layers.9.attention.wq, in=4096, out=4096 +linear: model.layers.9.attention.wk, in=4096, out=1024 +linear: model.layers.9.attention.wv, in=4096, out=1024 +linear: model.layers.9.attention.wo, in=4096, out=4096 +linear: model.layers.9.feed_forward.w1, in=4096, out=14336 +linear: model.layers.9.feed_forward.w2, in=14336, out=4096 +linear: model.layers.9.feed_forward.w3, in=4096, out=14336 +linear: model.layers.10.attention.wq, in=4096, out=4096 +linear: model.layers.10.attention.wk, in=4096, out=1024 +linear: model.layers.10.attention.wv, in=4096, out=1024 +linear: model.layers.10.attention.wo, in=4096, out=4096 +linear: model.layers.10.feed_forward.w1, in=4096, out=14336 +linear: model.layers.10.feed_forward.w2, in=14336, out=4096 +linear: model.layers.10.feed_forward.w3, in=4096, out=14336 +linear: model.layers.11.attention.wq, in=4096, out=4096 +linear: model.layers.11.attention.wk, in=4096, out=1024 +linear: model.layers.11.attention.wv, in=4096, out=1024 +linear: model.layers.11.attention.wo, in=4096, out=4096 +linear: model.layers.11.feed_forward.w1, in=4096, out=14336 +linear: model.layers.11.feed_forward.w2, in=14336, out=4096 +linear: model.layers.11.feed_forward.w3, in=4096, out=14336 +linear: model.layers.12.attention.wq, in=4096, out=4096 +linear: model.layers.12.attention.wk, in=4096, out=1024 +linear: model.layers.12.attention.wv, in=4096, out=1024 +linear: model.layers.12.attention.wo, in=4096, out=4096 +linear: model.layers.12.feed_forward.w1, in=4096, out=14336 +linear: model.layers.12.feed_forward.w2, in=14336, out=4096 +linear: model.layers.12.feed_forward.w3, in=4096, out=14336 +linear: model.layers.13.attention.wq, in=4096, out=4096 +linear: model.layers.13.attention.wk, in=4096, out=1024 +linear: model.layers.13.attention.wv, in=4096, out=1024 +linear: model.layers.13.attention.wo, in=4096, out=4096 +linear: model.layers.13.feed_forward.w1, in=4096, out=14336 +linear: model.layers.13.feed_forward.w2, in=14336, out=4096 +linear: model.layers.13.feed_forward.w3, in=4096, out=14336 +linear: model.layers.14.attention.wq, in=4096, out=4096 +linear: model.layers.14.attention.wk, in=4096, out=1024 +linear: model.layers.14.attention.wv, in=4096, out=1024 +linear: model.layers.14.attention.wo, in=4096, out=4096 +linear: model.layers.14.feed_forward.w1, in=4096, out=14336 +linear: model.layers.14.feed_forward.w2, in=14336, out=4096 +linear: model.layers.14.feed_forward.w3, in=4096, out=14336 +linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, 
out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: 
model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +W1217 21:24:14.026019 1726928 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 21:24:14.026474 1726928 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 21:24:14.026665 1726928 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 21:24:14.026849 1726928 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
+W1217 21:25:37.451656 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:37.456909 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:37.458432 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.104502 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.172109 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.173750 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.193953 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.266832 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.302381 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.303751 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.475950 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.545322 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.546611 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.563827 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.650704 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.651912 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.652755 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.863618 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.921232 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim 
implementation, using proxy executor as fallback +W1217 21:25:38.922786 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:38.940482 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.001620 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.042980 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.044364 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.216453 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.281834 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.283155 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.300397 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.386302 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.387504 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.388350 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.590519 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.643626 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.645353 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.663479 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.730579 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.770129 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.771540 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] 
aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:39.952252 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.022150 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.023521 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.044204 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.136110 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.137324 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.138154 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.344583 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.398035 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.399542 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.416440 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.478395 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.515625 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.517014 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.688990 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.762023 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.763446 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.782799 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.876059 1726928 
site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.877358 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:40.878373 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.090682 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.143335 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.144849 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.162729 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.223985 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.260059 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.261281 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.422600 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.486187 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.487586 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.505558 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.595883 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.597129 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.597984 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.799895 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.854097 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as 
fallback +W1217 21:25:41.855931 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.874098 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.936044 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.973008 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:41.974292 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.144366 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.210350 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.211694 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.228747 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.318347 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.319550 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.320440 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.521208 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.570340 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.571983 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.590258 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.648636 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.685271 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.686547 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim 
implementation, using proxy executor as fallback +W1217 21:25:42.855123 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.923517 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.924846 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:42.941554 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.027200 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.028374 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.029178 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.231433 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.282865 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.284325 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.301345 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.359391 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.394111 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.395272 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.559552 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.628100 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.629398 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.647248 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.734637 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] 
aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.735950 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.736795 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.940444 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.991449 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:43.992994 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.010978 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.076490 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.115470 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.116978 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.293128 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.363740 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.365111 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.383504 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.472227 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.473408 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.474239 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.676057 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.728065 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.729614 1726928 
site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.747195 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.809276 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.847719 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:44.849182 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.015431 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.083039 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.084431 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.104194 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.188927 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.190133 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.190978 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.386772 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.439281 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.440768 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.457834 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.514329 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.550947 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.552241 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as 
fallback +W1217 21:25:45.721035 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.791931 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.793284 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.810905 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.897783 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.899013 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:45.899891 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.110556 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.169698 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.171474 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.191062 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.254716 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.293792 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.295235 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.476298 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.548164 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.549569 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.569285 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.666378 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim 
implementation, using proxy executor as fallback +W1217 21:25:46.667587 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.668604 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.894730 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.947788 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.949657 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:46.968731 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.038142 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.079296 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.080721 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.268423 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.343195 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.344607 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.363332 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.459643 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.460909 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.461772 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.680416 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.739931 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.741855 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] 
aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.762774 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.830028 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.871154 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:47.872575 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.041100 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.113689 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.115032 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.133671 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.225839 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.227135 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.228057 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.436536 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.489276 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.490823 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.508600 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.568186 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.605806 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.607059 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.783024 1726928 
site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.854924 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.856274 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.874853 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.965942 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.967214 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:48.968102 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.182939 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.235876 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.237373 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.256235 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.324054 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.361468 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.362848 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.534660 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.603754 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.605154 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.623737 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1217 21:25:49.671895 1726928 site-packages/torch/_inductor/ir.py:6509] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as 
fallback +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_1(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:738:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 738 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_6(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:1274:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1274 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_10(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:1780:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1780 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_15(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:2292:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2292 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_19(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:2792:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2792 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_24(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:3304:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3304 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_28(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:3804:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3804 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_33(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:4316:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4316 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:4816:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4816 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void 
cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:5328:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5328 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_46(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:5828:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5828 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_51(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:6340:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6340 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_55(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:6840:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6840 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_60(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:7352:31: warning: variable ‘tmp_acc0_arr’ set but not used 
[-Wunused-but-set-variable] + 7352 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_64(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:7852:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7852 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_69(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:8364:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8364 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_73(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:8864:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8864 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_78(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:9376:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9376 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, 
float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:9876:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9876 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:10388:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10388 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_91(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:10888:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10888 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_96(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:11400:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11400 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_100(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:11900:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11900 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function 
‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_105(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:12412:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12412 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_109(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:12912:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12912 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_114(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:13424:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13424 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_118(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:13924:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13924 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_123(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:14436:31: warning: variable ‘tmp_acc0_arr’ 
set but not used [-Wunused-but-set-variable] +14436 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:14936:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14936 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:15448:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15448 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_136(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:15948:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15948 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_141(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cjdf6otzt7dk47njelcjyqyz6vvnp6x6pvkv2zntoxs2l2v4bgio.cpp:16460:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16460 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... 
+Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 53.20 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model34.pt2 +The generated packaged model can be found at: /tmp/model34.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model34.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.48 seconds +----------------------------------------------------------- +Once upon a time, in a small village surrounded by vast green fields and dense forests, there lived a poor and sad boy named Kaito. Kaito's parents were too poor to provide him with any comfort or leisure, and as a result, he spent most of his days working hard from dawn till dusk, helping his father with the daily chores. +One day, while out on a walk, Kaito stumbled upon a small pond. The sunlight dancing on the water's surface caught his eye, and he felt a sudden urge to sit and look at it. The boy sat down on a nearby rock and began to follow the pond's ripples as they reflected the light. He found the sight so soothing that he forgot all about his hunger and the fatigue that had been building up inside him. +Kaito realized that he had been so busy working that he hadn't stopped to appreciate the simple joys in life, including the beauty of nature. He sat there for hours, mesmerized by the pond's serene beauty. The water's calm surface and the sunlight's gentle dance across it were so captivating that Kaito forgot to eat or do anything else. +As the sun began to set, Kaito reluctantly got up from his rock and headed back to his home. He +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 274.8049 sec total +Time to first token: 11.4800 sec with sequential prefill. + + Total throughput: 0.9316 tokens/sec, 1.0735 s/token +First token throughput: 0.0871 tokens/sec, 11.4800 s/token + Next token throughput: 0.9684 tokens/sec, 1.0326 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, when the world was still, and the moon was full, a young adventurer named Max stumbled upon an ancient temple deep in the heart of a mystical forest. The temple was hidden behind a waterfall, and the air was filled with the sweet scent of blooming flowers. As Max approached the temple, he felt an otherworldly energy emanating from its ancient stones. The energy was both alluring and unsettling, and Max felt his heart pounding with excitement and trepidation. + +Max had always been drawn to the mysterious and unknown. 
As a child, he would spend hours poring over dusty tomes and listening to whispered tales of wonder and magic. His parents had been archaeologists, and he had grown up with a deep respect for the ancient civilizations that had left behind their secrets in the form of crumbling ruins and mysterious artifacts. + +As he pushed open the massive stone doors, Max felt a rush of cool air wafting out into the night. The interior of the temple was dimly lit, with only a few flickering candles to illuminate the space. The air was thick with the scent of incense, and the sound of dripping water echoed through the halls. Max's eyes adjusted slowly, and he saw that the temple was filled with row upon row of ancient car +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 375.2484 sec total +Time to first token: 8.3081 sec with sequential prefill. + + Total throughput: 0.6822 tokens/sec, 1.4658 s/token +First token throughput: 0.1204 tokens/sec, 8.3081 s/token + Next token throughput: 0.6949 tokens/sec, 1.4390 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, in a small village surrounded by vast fields of gold wheat, there lived a young shepherd named Jack. Jack was a hardworking boy who spent his days tending to his family's sheep and exploring the endless wheat fields. One day, while wandering through the wheat, Jack stumbled upon an old, mysterious-looking watch. The watch was made of a strange, silvery metal that sparkled in the sunlight, and it had intricate engravings of animals on its face. +As soon as Jack picked up the watch, he felt an odd tingling sensation in his fingers. It was as if the watch had awakened a deep connection within him. Without any explanation, Jack felt compelled to take the watch back to his village. As he walked, he noticed that the wheat around him seemed to be growing taller and the air was filling with a sweet, honey-like scent. The village, once ordinary and familiar, now seemed welcoming and mysterious. +Upon arriving at the village, Jack showed the watch to the village elder, a wise and respected woman named Granny May. Granny May was amazed by the watch and recognized it as a long-lost treasure from their village's history. She shared a legend with Jack about a special watch that had been crafted centuries ago by their ancestors. This watch +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 270.6182 sec total +Time to first token: 8.3546 sec with sequential prefill. 
+ + Total throughput: 0.9460 tokens/sec, 1.0571 s/token +First token throughput: 0.1197 tokens/sec, 8.3546 s/token + Next token throughput: 0.9723 tokens/sec, 1.0285 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.85 +Average tokens/sec (first token): 0.11 +Average tokens/sec (next tokens): 0.88 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_aoti_pt2_8.txt b/llama31-1213/cpu_aoti_pt2_8.txt new file mode 100644 index 000000000..e83451fb4 --- /dev/null +++ b/llama31-1213/cpu_aoti_pt2_8.txt @@ -0,0 +1,236 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model8.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model8.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model8.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241213+cu124 available. +W1217 20:51:40.934506 1391160 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 20:51:40.936002 1391160 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 20:51:40.936321 1391160 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 20:51:40.936525 1391160 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
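For reference, the deprecation warning above recommends the packaged AOTInductor flow in place of torch._export.aot_compile()/aot_load(). The following is a minimal sketch of that replacement API only; the module, input shape, and output path are illustrative placeholders, not the torchchat llama3.1 export from this log.

# Sketch of the torch.export + AOTInductor packaging flow named in the
# deprecation warning; TinyModel and /tmp/tiny_model.pt2 are placeholders.
import torch

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)

example_inputs = (torch.randn(2, 8),)

# Export the module, then compile and package it into a .pt2 artifact.
exported = torch.export.export(TinyModel(), example_inputs)
package_path = torch._inductor.aoti_compile_and_package(
    exported, package_path="/tmp/tiny_model.pt2"
)

# Load the packaged artifact and run it (replaces torch._export.aot_load()).
runner = torch._inductor.aoti_load_package(package_path)
output = runner(*example_inputs)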
+/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used 
[-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, 
bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const 
bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ 
+/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:15950:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/coqx5t3xg3mwdv4mepirm6vycnfxqanf6fs447wrpj7jtflo3wwr.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 53.11 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model8.pt2 +The generated packaged model can be found at: /tmp/model8.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model8.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. 
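[Editor's note] The export step above writes an AOT Inductor package (/tmp/model8.pt2) that the generate command then loads via --aoti-package-path. For readers who want to poke at such a package outside torchchat, a minimal sketch follows. It assumes the package can be opened with the torch._inductor.aoti_load_package() API that this log's own deprecation notice recommends; the input tensors are illustrative guesses, not torchchat's actual exported calling convention.

    import torch
    import torch._inductor

    # Load the AOTI package produced by `torchchat.py export ... --output-aoti-package-path`.
    # (API named in the deprecation warning emitted during export in this log.)
    runner = torch._inductor.aoti_load_package("/tmp/model8.pt2")

    # Hypothetical inputs: token ids plus decode position. The real signature is defined
    # by torchchat's exported forward() and is not visible in this log.
    tokens = torch.zeros(1, 1, dtype=torch.int32)
    input_pos = torch.tensor([0], dtype=torch.int32)

    with torch.no_grad():
        logits = runner(tokens, input_pos)

In torchchat itself this loading is handled internally when --aoti-package-path is passed, as in the generate command shown above.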
+Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.49 seconds +----------------------------------------------------------- +Once upon a time, a small town in the heart of the American Midwest was faced with a crisis. A massive tornado had ravaged the town, leaving behind a trail of destruction and debris. The residents were left to pick up the pieces and rebuild their lives. +As the townspeople began to survey the damage, they noticed something peculiar. Amidst the rubble, a small, delicate flower had managed to grow. It was a tiny, yet resilient bloom, that seemed to defy the odds and thrive in the midst of devastation. +The flower quickly became a symbol of hope and resilience for the town. It reminded them that even in the darkest of times, there is always the possibility for growth and renewal. +As the townspeople began to rebuild, they made a decision to create a memorial garden in honor of the flower. They designed a beautiful garden, filled with a variety of flowers, trees, and shrubs, that would serve as a reminder of the power of resilience and hope. +The garden became a beacon of inspiration for the community, a place where people could come to reflect and find solace. It was a testament to the human spirit, and the ability to overcome even the most daunting challenges. +Years went by, and the garden continued to flourish. It became a popular destination for tourists and +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 966.5080 sec total +Time to first token: 25.7696 sec with sequential prefill. + + Total throughput: 0.2649 tokens/sec, 3.7754 s/token +First token throughput: 0.0388 tokens/sec, 25.7696 s/token + Next token throughput: 0.2711 tokens/sec, 3.6892 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, there was a little boy named Max who sat on a couch and went on a journey. +Another boy named Samuel sat on the same couch and had the same journey. +A third boy named Thomas also sat on the same couch and went on the same journey. +On this journey, they met a dragon who breathed fire. +They met a fairy who wore wings. +They met a king who wore a crown. +Each boy, Max and Samuel and Thomas, saw and experienced the same things on this journey. +Each boy, however, saw and experienced it in a different way. +Max saw the dragon as big and scary. +Samuel saw the dragon as small and friendly. +Thomas saw the dragon as a reflection of himself. +The fairy was a reminder to Samuel of his mother. +The king was a symbol of power for Max. +The king was a reminder to Thomas of his father. +In this story, each boy has a unique perspective and experience despite the fact that they are in the same place at the same time. +This story illustrates that truth is not absolute, but rather subjective. +It is dependent on the individual's unique perspective and experiences. 
+Our experiences shape our perspectives and our perspectives shape our understanding of the world. +This story also illustrates the concept of perception and how it influences our understanding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 402.3418 sec total +Time to first token: 29.8217 sec with sequential prefill. + + Total throughput: 0.6363 tokens/sec, 1.5716 s/token +First token throughput: 0.0335 tokens/sec, 29.8217 s/token + Next token throughput: 0.6845 tokens/sec, 1.4609 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, there was a tiny village nestled in the rolling hills of the countryside. The villagers lived simple lives, working the land and tending to their animals. But one day, a wise old man came to the village, carrying a small wooden box. He announced that he had within it a magical stone, one that would grant the village prosperity and good fortune. +The villagers were skeptical at first, but the old man showed them the stone, which glowed with a soft, pulsing light. He told them that the stone had the power to make their crops grow bigger and stronger, their animals healthier, and their lives more abundant. +The villagers were amazed, and they begged the old man to give them the stone. But he refused, saying that the stone's power was not to be taken lightly and that they would have to prove themselves worthy of its gift. +The villagers were determined to prove themselves worthy, and they set to work, tilling the soil, planting seeds, and tending to their animals. They worked tirelessly, day and night, and slowly but surely, their efforts began to pay off. Their crops began to grow, their animals grew stronger and healthier, and the village began to flourish. +But as the villagers' success grew, so did their greed. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 268.2427 sec total +Time to first token: 2.1756 sec with sequential prefill. + + Total throughput: 0.9544 tokens/sec, 1.0478 s/token +First token throughput: 0.4597 tokens/sec, 2.1756 s/token + Next token throughput: 0.9584 tokens/sec, 1.0434 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.62 +Average tokens/sec (first token): 0.18 +Average tokens/sec (next tokens): 0.64 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_aoti_pt2_b16.txt b/llama31-1213/cpu_aoti_pt2_b16.txt new file mode 100644 index 000000000..ad7f0a5f0 --- /dev/null +++ b/llama31-1213/cpu_aoti_pt2_b16.txt @@ -0,0 +1,223 @@ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model16.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model16.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model16.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241213+cu124 available. 
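[Editor's note, before the next export log] The throughput lines in the generate summaries above are consistent with straightforward token-count arithmetic. The sketch below reproduces the sample-1 figures of the /tmp/model8.pt2 run and the 0.62 tokens/sec average; it mirrors the arithmetic implied by the log, not torchchat's actual implementation.

    # Sample 1 of the /tmp/model8.pt2 run, numbers copied from the log above.
    generated = 255          # "Generated 255 tokens"
    total_s   = 966.5080     # "Time for inference 1: ... sec total"
    first_s   = 25.7696      # "Time to first token: ... sec"

    total_tps = (generated + 1) / total_s        # ~0.2649 tokens/sec (first token included)
    first_tps = 1.0 / first_s                    # ~0.0388 tokens/sec
    next_tps  = generated / (total_s - first_s)  # ~0.2711 tokens/sec

    # "Average tokens/sec (total)" appears to be the mean of the per-sample totals:
    samples   = [0.2649, 0.6363, 0.9544]
    avg_total = sum(samples) / len(samples)      # ~0.62

The same arithmetic reproduces the first-token (~0.18) and next-token (~0.64) averages from their per-sample values.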
+W1217 20:21:25.080884 1080214 site-packages/torch/_export/__init__.py:276] +============================+ +W1217 20:21:25.081428 1080214 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1217 20:21:25.081624 1080214 site-packages/torch/_export/__init__.py:278] +============================+ +W1217 20:21:25.081813 1080214 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ 
+/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used 
[-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, 
bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const 
bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:15950:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/clylwvqqub47oveth6rx7lztlybikfvog7ve73ot2ljilscbzrus.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... 
+Time to load model: 0.13 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model16.pt2 +The generated packaged model can be found at: /tmp/model16.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model16.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.38 seconds +----------------------------------------------------------- +Once upon a time, there was a tiny toad who lived in a small pond surrounded by tall trees and a meandering stream. One day, a wise old owl perched on a branch above the pond called out to the toad, "Little one, what is your greatest desire?" +The toad thought for a moment and then replied, "Oh wise owl, I have always dreamed of being the largest toad in the forest, feared by all and respected by none. I want to be the bulkiest, the most robust, and the most powerful toad that has ever croaked!" +The owl looked at the toad with wise eyes and said, "Ah, little toad, your desire is not unique. Many creatures have similar ambitions. But to achieve your dream, you must first understand that becoming the largest toad in the forest is not just about physical size, but also about the size of your heart, your courage, and your compassion." +The toad looked puzzled and asked, "What do you mean, wise owl?" +The owl explained, "Think of it this way: if you are the largest toad, but you are also the largest ego, would you be respected or admired? Or would you be seen as a bully, a tyrant, +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 514.3121 sec total +Time to first token: 21.3696 sec with sequential prefill. + + Total throughput: 0.4978 tokens/sec, 2.0090 s/token +First token throughput: 0.0468 tokens/sec, 21.3696 s/token + Next token throughput: 0.5173 tokens/sec, 1.9331 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Sophia. Sophia was a curious and adventurous child, with a heart full of wonder and a mind full of questions. She loved to explore the world around her, to discover new things, and to learn about the magic that lay just beyond the edge of everyday life. +One day, while wandering through the village, Sophia stumbled upon a small, mysterious shop tucked away in a quiet alley. 
The sign above the door read "Curios and Wonders," and the windows were filled with a dazzling array of strange and exotic objects: glowing crystals, shimmering fabrics, and curious contraptions that seemed to defy explanation. +Sophia's eyes grew wide with wonder as she pushed open the door and stepped inside. The shop was dimly lit, with soft music playing in the background, and the air was thick with the scent of incense and spices. A kindly old man with a long white beard and twinkling eyes looked up from behind the counter, and a warm smile spread across his face. + +"Welcome, young one," he said, beckoning Sophia to come closer. "I have just the thing for a curious and adventurous soul like yours." + +Sophia wandered the aisles +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 522.4509 sec total +Time to first token: 9.3618 sec with sequential prefill. + + Total throughput: 0.4900 tokens/sec, 2.0408 s/token +First token throughput: 0.1068 tokens/sec, 9.3618 s/token + Next token throughput: 0.4970 tokens/sec, 2.0121 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, there was a beautiful, plush forest, nestled in a valley surrounded by towering mountains. The forest was home to a variety of creatures, large and small, from majestic deer to tiny rabbits, and from wise old owls to mischievous squirrels. +At the heart of the forest stood an ancient, gnarled tree, its branches twisted and tangled in a way that seemed almost magical. The tree was known as the "Heart Tree," and it was said to possess the wisdom and secrets of the forest. +One day, a young rabbit named Luna wandered into the forest, searching for a place to call home. As she explored the woods, she stumbled upon the Heart Tree, and was drawn to its mystical energy. +Luna felt an inexplicable connection to the Heart Tree, and as she approached, she noticed that the tree seemed to be glowing with a soft, pulsing light. She reached out a paw to touch the trunk, and as she did, she was flooded with visions and knowledge from the Heart Tree. +The tree revealed to Luna the secrets of the forest, showing her the hidden paths and hidden streams, and the ancient stories of the creatures that lived there. Luna was amazed and delighted by the knowledge she gained, and she knew that she had found +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 440.9076 sec total +Time to first token: 9.7547 sec with sequential prefill. 
+ + Total throughput: 0.5806 tokens/sec, 1.7223 s/token +First token throughput: 0.1025 tokens/sec, 9.7547 s/token + Next token throughput: 0.5914 tokens/sec, 1.6908 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.52 +Average tokens/sec (first token): 0.09 +Average tokens/sec (next tokens): 0.54 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_compile_4.txt b/llama31-1213/cpu_compile_4.txt new file mode 100644 index 000000000..ffebd5652 --- /dev/null +++ b/llama31-1213/cpu_compile_4.txt @@ -0,0 +1,299 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +linear: model.layers.0.attention.wq, in=4096, out=4096 +linear: model.layers.0.attention.wk, in=4096, out=1024 +linear: model.layers.0.attention.wv, in=4096, out=1024 +linear: model.layers.0.attention.wo, in=4096, out=4096 +linear: model.layers.0.feed_forward.w1, in=4096, out=14336 +linear: model.layers.0.feed_forward.w2, in=14336, out=4096 +linear: model.layers.0.feed_forward.w3, in=4096, out=14336 +linear: model.layers.1.attention.wq, in=4096, out=4096 +linear: model.layers.1.attention.wk, in=4096, out=1024 +linear: model.layers.1.attention.wv, in=4096, out=1024 +linear: model.layers.1.attention.wo, in=4096, out=4096 +linear: model.layers.1.feed_forward.w1, in=4096, out=14336 +linear: model.layers.1.feed_forward.w2, in=14336, out=4096 +linear: model.layers.1.feed_forward.w3, in=4096, out=14336 +linear: model.layers.2.attention.wq, in=4096, out=4096 +linear: model.layers.2.attention.wk, in=4096, out=1024 +linear: model.layers.2.attention.wv, in=4096, out=1024 +linear: model.layers.2.attention.wo, in=4096, out=4096 +linear: model.layers.2.feed_forward.w1, in=4096, out=14336 +linear: model.layers.2.feed_forward.w2, in=14336, out=4096 +linear: model.layers.2.feed_forward.w3, in=4096, out=14336 +linear: model.layers.3.attention.wq, in=4096, out=4096 +linear: model.layers.3.attention.wk, in=4096, out=1024 +linear: model.layers.3.attention.wv, in=4096, out=1024 +linear: model.layers.3.attention.wo, in=4096, out=4096 +linear: model.layers.3.feed_forward.w1, in=4096, out=14336 +linear: model.layers.3.feed_forward.w2, in=14336, out=4096 +linear: model.layers.3.feed_forward.w3, in=4096, out=14336 +linear: model.layers.4.attention.wq, in=4096, out=4096 +linear: model.layers.4.attention.wk, in=4096, out=1024 +linear: model.layers.4.attention.wv, in=4096, out=1024 +linear: model.layers.4.attention.wo, in=4096, out=4096 +linear: model.layers.4.feed_forward.w1, in=4096, out=14336 +linear: model.layers.4.feed_forward.w2, in=14336, out=4096 +linear: model.layers.4.feed_forward.w3, in=4096, out=14336 +linear: model.layers.5.attention.wq, in=4096, out=4096 +linear: model.layers.5.attention.wk, in=4096, out=1024 +linear: model.layers.5.attention.wv, in=4096, out=1024 +linear: model.layers.5.attention.wo, in=4096, 
out=4096 +linear: model.layers.5.feed_forward.w1, in=4096, out=14336 +linear: model.layers.5.feed_forward.w2, in=14336, out=4096 +linear: model.layers.5.feed_forward.w3, in=4096, out=14336 +linear: model.layers.6.attention.wq, in=4096, out=4096 +linear: model.layers.6.attention.wk, in=4096, out=1024 +linear: model.layers.6.attention.wv, in=4096, out=1024 +linear: model.layers.6.attention.wo, in=4096, out=4096 +linear: model.layers.6.feed_forward.w1, in=4096, out=14336 +linear: model.layers.6.feed_forward.w2, in=14336, out=4096 +linear: model.layers.6.feed_forward.w3, in=4096, out=14336 +linear: model.layers.7.attention.wq, in=4096, out=4096 +linear: model.layers.7.attention.wk, in=4096, out=1024 +linear: model.layers.7.attention.wv, in=4096, out=1024 +linear: model.layers.7.attention.wo, in=4096, out=4096 +linear: model.layers.7.feed_forward.w1, in=4096, out=14336 +linear: model.layers.7.feed_forward.w2, in=14336, out=4096 +linear: model.layers.7.feed_forward.w3, in=4096, out=14336 +linear: model.layers.8.attention.wq, in=4096, out=4096 +linear: model.layers.8.attention.wk, in=4096, out=1024 +linear: model.layers.8.attention.wv, in=4096, out=1024 +linear: model.layers.8.attention.wo, in=4096, out=4096 +linear: model.layers.8.feed_forward.w1, in=4096, out=14336 +linear: model.layers.8.feed_forward.w2, in=14336, out=4096 +linear: model.layers.8.feed_forward.w3, in=4096, out=14336 +linear: model.layers.9.attention.wq, in=4096, out=4096 +linear: model.layers.9.attention.wk, in=4096, out=1024 +linear: model.layers.9.attention.wv, in=4096, out=1024 +linear: model.layers.9.attention.wo, in=4096, out=4096 +linear: model.layers.9.feed_forward.w1, in=4096, out=14336 +linear: model.layers.9.feed_forward.w2, in=14336, out=4096 +linear: model.layers.9.feed_forward.w3, in=4096, out=14336 +linear: model.layers.10.attention.wq, in=4096, out=4096 +linear: model.layers.10.attention.wk, in=4096, out=1024 +linear: model.layers.10.attention.wv, in=4096, out=1024 +linear: model.layers.10.attention.wo, in=4096, out=4096 +linear: model.layers.10.feed_forward.w1, in=4096, out=14336 +linear: model.layers.10.feed_forward.w2, in=14336, out=4096 +linear: model.layers.10.feed_forward.w3, in=4096, out=14336 +linear: model.layers.11.attention.wq, in=4096, out=4096 +linear: model.layers.11.attention.wk, in=4096, out=1024 +linear: model.layers.11.attention.wv, in=4096, out=1024 +linear: model.layers.11.attention.wo, in=4096, out=4096 +linear: model.layers.11.feed_forward.w1, in=4096, out=14336 +linear: model.layers.11.feed_forward.w2, in=14336, out=4096 +linear: model.layers.11.feed_forward.w3, in=4096, out=14336 +linear: model.layers.12.attention.wq, in=4096, out=4096 +linear: model.layers.12.attention.wk, in=4096, out=1024 +linear: model.layers.12.attention.wv, in=4096, out=1024 +linear: model.layers.12.attention.wo, in=4096, out=4096 +linear: model.layers.12.feed_forward.w1, in=4096, out=14336 +linear: model.layers.12.feed_forward.w2, in=14336, out=4096 +linear: model.layers.12.feed_forward.w3, in=4096, out=14336 +linear: model.layers.13.attention.wq, in=4096, out=4096 +linear: model.layers.13.attention.wk, in=4096, out=1024 +linear: model.layers.13.attention.wv, in=4096, out=1024 +linear: model.layers.13.attention.wo, in=4096, out=4096 +linear: model.layers.13.feed_forward.w1, in=4096, out=14336 +linear: model.layers.13.feed_forward.w2, in=14336, out=4096 +linear: model.layers.13.feed_forward.w3, in=4096, out=14336 +linear: model.layers.14.attention.wq, in=4096, out=4096 +linear: model.layers.14.attention.wk, in=4096, 
out=1024 +linear: model.layers.14.attention.wv, in=4096, out=1024 +linear: model.layers.14.attention.wo, in=4096, out=4096 +linear: model.layers.14.feed_forward.w1, in=4096, out=14336 +linear: model.layers.14.feed_forward.w2, in=14336, out=4096 +linear: model.layers.14.feed_forward.w3, in=4096, out=14336 +linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: 
model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, 
in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.12 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 22.19 seconds +----------------------------------------------------------- +Once upon a time, there was a young boy named Tom who loved playing outside in the warm sunshine. One day, while he was playing in his backyard, he stumbled upon a small clearing surrounded by tall trees and colorful flowers. In the center of the clearing stood an enormous tree with branches that seemed to stretch up to the sky. +Tom had never seen a tree like this before, and he felt drawn to it as if by magic. He approached the tree cautiously, feeling the soft bark beneath his fingers, and looked up at its towering height. Suddenly, a gentle voice spoke to him, echoing in his mind. +“Welcome, little one,” said the voice. “I have been waiting for you. My name is Oakley, and I am an enchanted tree with a special gift for those who approach me with kindness and respect.” +Tom was amazed and a bit scared, but he managed to stammer out a greeting. Oakley told him that he had been watching Tom for a while and was impressed by his kind heart and curious nature. The tree explained that he had the power to grant wishes to those who treated him with love and respect. +Overjoyed, Tom asked Oakley how he could use this gift. Oakley responded that he would give him three wishes, but warned him +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 60.0837 sec total +Time to first token: 0.3077 sec with parallel prefill. + + Total throughput: 4.2607 tokens/sec, 0.2347 s/token +First token throughput: 3.2504 tokens/sec, 0.3077 s/token + Next token throughput: 4.2659 tokens/sec, 0.2344 s/token + +Bandwidth achieved: 20.97 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 6e+01 seconds + +======================================== + +Once upon a time, in a small village surrounded by rolling green hills and sparkling crystal streams, there lived a young boy named Leo. He was a curious and adventurous boy, always eager to explore the world around him. Leo loved nothing more than spending his days outdoors, whether it was climbing trees, chasing after butterflies, or simply lying in the sun and watching the clouds drift lazily by. +One day, as Leo was wandering through the village, he stumbled upon an old, mysterious-looking shop. The sign above the door read "Curios and Wonders," and the windows were filled with strange and exotic treasures. Leo's curiosity was piqued, and he pushed open the door to venture inside. +The shop was dimly lit, the air thick with the scent of old books and dust. Shelves lined the walls, packed tightly with all manner of strange and wonderful objects. Leo wandered the aisles, his eyes widening as he took in the endless wonders on display. 
There were antique clocks, shimmering gemstones, and even a taxidermied eagle perched atop a shelf, its glassy eyes seeming to stare right through him. +As Leo explored the shop, he stumbled upon a small, leather-bound book with a strange symbol etched into the cover. The symbol seemed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 25.8322 sec total +Time to first token: 0.3133 sec with parallel prefill. + + Total throughput: 9.9101 tokens/sec, 0.1009 s/token +First token throughput: 3.1921 tokens/sec, 0.3133 s/token + Next token throughput: 9.9926 tokens/sec, 0.1001 s/token + +Bandwidth achieved: 48.77 GB/s + +======================================== + +Once upon a time, there was a small village surrounded by vast fields of wheat, and a winding river that flowed gently through the heart of the village. The villagers lived simple lives, working hard from dawn till dusk, and enjoying the beauty of nature around them. +One day, a wise old man named Ravi came to the village. He was known for his wisdom and knowledge, and the people of the village looked up to him with great respect. Ravi had a special gift – he could see into the future. +The villagers would often gather around him, asking for his guidance on various matters. Ravi would listen carefully, and then offer words of wisdom that would help them make informed decisions. +One day, a young village girl named Rohini came to Ravi with a problem. She was in love with a boy named Bhargav, but Bhargav’s family was very poor, and Rohini’s parents wanted her to marry a wealthy man named Vikram. Rohini was torn between her love for Bhargav and her obligations to her family. +Ravi listened to Rohini’s story and then looked into the future. He saw a picture of Bhargav, a brilliant scientist, working on a new invention that would change the world. He saw Rohini, a +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 23.7908 sec total +Time to first token: 0.3015 sec with parallel prefill. + + Total throughput: 10.7605 tokens/sec, 0.0929 s/token +First token throughput: 3.3173 tokens/sec, 0.3015 s/token + Next token throughput: 10.8560 tokens/sec, 0.0921 s/token + +Bandwidth achieved: 52.95 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 10.34 +Average tokens/sec (first token): 3.25 +Average tokens/sec (next tokens): 10.42 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_compile_8.txt b/llama31-1213/cpu_compile_8.txt new file mode 100644 index 000000000..d06bdce5b --- /dev/null +++ b/llama31-1213/cpu_compile_8.txt @@ -0,0 +1,81 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. 
+Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 29.28 seconds +----------------------------------------------------------- +Once upon a time, I was sitting in a coffee shop, surrounded by the bustling sounds of city living. As I sipped on my coffee, I noticed a peculiar-looking man walking in, wearing a stylish three-piece suit. He was an older gentleman, with a kind face and wispy hair. + +The man approached the counter and ordered his coffee, engaging in a warm conversation with the barista. When he received his drink, he sat down in a nearby seat, his eyes scanning the room as if searching for someone. Our eyes met, and he smiled kindly, nodding at me. + +Intrigued, I decided to strike up a conversation. "Excuse me," I said, "but you seem like a man with a great story. Would you like to hear one of mine?" + +He chuckled, his eyes lighting up with interest. "That sounds intriguing," he replied. "I'm always eager to hear a good tale." + +I launched into a story about a peculiar occurrence in my childhood, about a dream that I had experienced as a young boy. The man listened attentively, his expression growing more and more intrigued. + +As I finished my story, the man leaned in, his voice barely above a whisper. "I'm glad you shared that with me," he said +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 167.6127 sec total +Time to first token: 0.3494 sec with parallel prefill. + + Total throughput: 1.5273 tokens/sec, 0.6547 s/token +First token throughput: 2.8620 tokens/sec, 0.3494 s/token + Next token throughput: 1.5245 tokens/sec, 0.6559 s/token + +Bandwidth achieved: 13.07 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 1.7e+02 seconds + +======================================== + +Once upon a time, in a far-off kingdom, there was a beautiful and kind-hearted princess named Sophia. She was loved by all who knew her, and her smile could light up the darkest of rooms. +One day, a wise old wizard named Zephyr came to the kingdom, seeking refuge from the dangers of the outside world. He was a master of magic, and the princess was immediately drawn to his wisdom and kindness. +As the days passed, Sophia and Zephyr grew closer, and they began to talk about their dreams and aspirations. Sophia confided in Zephyr about her desire to help those in need, to make a difference in the world, and to bring joy and happiness to all whom she met. +Zephyr, seeing the good in Sophia's heart, decided to share a secret with her. He told her that he had a magical amulet that would grant her wishes, but warned her that it came with a great responsibility. +The amulet, Zephyr explained, was a powerful tool that could change the course of history, and it was up to Sophia to use it wisely. He told her that with this amulet, she could bring peace and prosperity to the kingdom, but she must be careful not to abuse its power. 
+Sophia was both +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 38.5396 sec total +Time to first token: 0.3094 sec with parallel prefill. + + Total throughput: 6.6425 tokens/sec, 0.1505 s/token +First token throughput: 3.2318 tokens/sec, 0.3094 s/token + Next token throughput: 6.6701 tokens/sec, 0.1499 s/token + +Bandwidth achieved: 56.85 GB/s + +======================================== + +Once upon a time, we were young, fun-loving, and carefree. We had high school sweethearts, prom dates, and graduation ceremonies. But, as time went by, life took its toll. We got older, got married, got kids, and got busy. The romance faded, and the excitement began to wane. But, that was before we discovered a secret to rekindle the flame. +One day, while browsing through an online blog, we stumbled upon an article about a magical way to bring back the spark. It mentioned a simple yet powerful technique: couples’ date nights. That's right! Regularly scheduled dates with your partner can reignite the flames of passion, intimacy, and love. +At first, it sounded too simple, too cheesy, or too old-fashioned. But, we were desperate to revive our relationship, so we decided to give it a try. We started with a weekly dinner date, just the two of us. We would pick a restaurant, plan a fun activity, and make it a priority to spend quality time together. +As we began to incorporate date nights into our busy schedules, we noticed a significant change. We started to look forward to our alone time together, and the excitement began to build again. We would laugh, reminis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 35.3439 sec total +Time to first token: 0.3349 sec with parallel prefill. + + Total throughput: 7.2431 tokens/sec, 0.1381 s/token +First token throughput: 2.9861 tokens/sec, 0.3349 s/token + Next token throughput: 7.2838 tokens/sec, 0.1373 s/token + +Bandwidth achieved: 61.99 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 6.94 +Average tokens/sec (first token): 3.11 +Average tokens/sec (next tokens): 6.98 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_compile_b16.txt b/llama31-1213/cpu_compile_b16.txt new file mode 100644 index 000000000..65a261bac --- /dev/null +++ b/llama31-1213/cpu_compile_b16.txt @@ -0,0 +1,73 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... 
+Time to load model: 0.12 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Sophia. Sophia was a curious and adventurous child who loved to explore the world around her. She spent most of her days playing outside, chasing after butterflies, and watching the clouds roll by. +One day, while wandering through the village, Sophia stumbled upon a small, mysterious shop tucked away on a quiet street. The sign above the door read "Curios and Wonders," and the windows were filled with all sorts of strange and fascinating objects. Sophia's curiosity was piqued, and she pushed open the door to venture inside. +The shop was dimly lit, and the air was thick with the scent of old books and dust. Sophia's eyes adjusted slowly to the darkness, and she saw rows upon rows of shelves stacked high with peculiar items. There were strange artifacts, rare minerals, and even a few taxidermied animals peeking out from behind a velvet curtain. +Sophia wandered through the shop, running her fingers over the various objects on display. She picked up a delicate crystal pendant, a vintage locket, and a small, leather-bound book. As she touched each item, she felt a strange sensation, as if the object was imbuing her with +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 196.0270 sec total +Time to first token: 0.6336 sec with parallel prefill. + + Total throughput: 1.3059 tokens/sec, 0.7657 s/token +First token throughput: 1.5783 tokens/sec, 0.6336 s/token + Next token throughput: 1.3051 tokens/sec, 0.7662 s/token + +Bandwidth achieved: 20.97 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 2e+02 seconds + +======================================== + +Once upon a time, there was a small, rural town nestled in the heart of a vast and mysterious forest. The town was called Ravenswood, and it was a place where time seemed to stand still. The residents of Ravenswood lived simple lives, relying on the land for their livelihood and honoring the traditions of their ancestors. +In the center of Ravenswood stood an ancient, gnarled tree, its branches twisted and knotted with age. The townspeople believed that this tree held mystical powers, and they would often gather around its base to pray, tell stories, and seek guidance. +One day, a young girl named Aria wandered into the forest, searching for a rare herb for her mother's healing potions. As she walked deeper into the woods, the trees seemed to grow taller, and the shadows grew darker. Aria felt a strange, tingling sensation in her fingers, as if the tree was calling to her. +She approached the ancient tree, feeling a sense of wonder and awe wash over her. As she reached out to touch the trunk, a low, rumbling voice spoke to her, echoing in her mind. +"Aria, child of Ravenswood, I have been waiting for you. You have come to seek the secrets of the forest, and I shall grant them +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 74.2544 sec total +Time to first token: 0.4688 sec with parallel prefill. 
+ + Total throughput: 3.4476 tokens/sec, 0.2901 s/token +First token throughput: 2.1333 tokens/sec, 0.4688 s/token + Next token throughput: 3.4560 tokens/sec, 0.2894 s/token + +Bandwidth achieved: 55.37 GB/s + +======================================== + +Once upon a time, in a world not so different from our own, there lived a young girl named Sophia. Sophia loved to dream big and chase her passions. She was a curious and ambitious individual who always sought to learn more and push beyond her boundaries. One day, Sophia stumbled upon an old, mysterious-looking book hidden in the attic of her family's old mansion. The book was bound in a strange, glowing material that seemed to pulse with an otherworldly energy. +As Sophia opened the book, she discovered that it contained ancient knowledge and secrets that had been hidden for centuries. The book spoke of magical realms, hidden dimensions, and mystical creatures that existed beyond the veil of the mundane world. Sophia was both fascinated and terrified by the secrets revealed within the book's pages. +As she delved deeper into the book, Sophia began to notice strange occurrences happening around her. Objects would move on their own, and she would hear whispers in the dead of night. It was as if the book was trying to communicate with her, drawing her into a world of wonder and magic. Sophia's heart pounded with excitement as she realized that she had stumbled upon something much bigger than herself. +With every passing day, Sophia became more and more enthralled by the mystical world within the book. She would +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 75.9722 sec total +Time to first token: 0.3684 sec with parallel prefill. + + Total throughput: 3.3697 tokens/sec, 0.2968 s/token +First token throughput: 2.7142 tokens/sec, 0.3684 s/token + Next token throughput: 3.3728 tokens/sec, 0.2965 s/token + +Bandwidth achieved: 54.12 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 3.41 +Average tokens/sec (first token): 2.42 +Average tokens/sec (next tokens): 3.41 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_eager_4.txt b/llama31-1213/cpu_eager_4.txt new file mode 100644 index 000000000..ed72e6e1a --- /dev/null +++ b/llama31-1213/cpu_eager_4.txt @@ -0,0 +1,298 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. 
+linear: model.layers.0.attention.wq, in=4096, out=4096 +linear: model.layers.0.attention.wk, in=4096, out=1024 +linear: model.layers.0.attention.wv, in=4096, out=1024 +linear: model.layers.0.attention.wo, in=4096, out=4096 +linear: model.layers.0.feed_forward.w1, in=4096, out=14336 +linear: model.layers.0.feed_forward.w2, in=14336, out=4096 +linear: model.layers.0.feed_forward.w3, in=4096, out=14336 +linear: model.layers.1.attention.wq, in=4096, out=4096 +linear: model.layers.1.attention.wk, in=4096, out=1024 +linear: model.layers.1.attention.wv, in=4096, out=1024 +linear: model.layers.1.attention.wo, in=4096, out=4096 +linear: model.layers.1.feed_forward.w1, in=4096, out=14336 +linear: model.layers.1.feed_forward.w2, in=14336, out=4096 +linear: model.layers.1.feed_forward.w3, in=4096, out=14336 +linear: model.layers.2.attention.wq, in=4096, out=4096 +linear: model.layers.2.attention.wk, in=4096, out=1024 +linear: model.layers.2.attention.wv, in=4096, out=1024 +linear: model.layers.2.attention.wo, in=4096, out=4096 +linear: model.layers.2.feed_forward.w1, in=4096, out=14336 +linear: model.layers.2.feed_forward.w2, in=14336, out=4096 +linear: model.layers.2.feed_forward.w3, in=4096, out=14336 +linear: model.layers.3.attention.wq, in=4096, out=4096 +linear: model.layers.3.attention.wk, in=4096, out=1024 +linear: model.layers.3.attention.wv, in=4096, out=1024 +linear: model.layers.3.attention.wo, in=4096, out=4096 +linear: model.layers.3.feed_forward.w1, in=4096, out=14336 +linear: model.layers.3.feed_forward.w2, in=14336, out=4096 +linear: model.layers.3.feed_forward.w3, in=4096, out=14336 +linear: model.layers.4.attention.wq, in=4096, out=4096 +linear: model.layers.4.attention.wk, in=4096, out=1024 +linear: model.layers.4.attention.wv, in=4096, out=1024 +linear: model.layers.4.attention.wo, in=4096, out=4096 +linear: model.layers.4.feed_forward.w1, in=4096, out=14336 +linear: model.layers.4.feed_forward.w2, in=14336, out=4096 +linear: model.layers.4.feed_forward.w3, in=4096, out=14336 +linear: model.layers.5.attention.wq, in=4096, out=4096 +linear: model.layers.5.attention.wk, in=4096, out=1024 +linear: model.layers.5.attention.wv, in=4096, out=1024 +linear: model.layers.5.attention.wo, in=4096, out=4096 +linear: model.layers.5.feed_forward.w1, in=4096, out=14336 +linear: model.layers.5.feed_forward.w2, in=14336, out=4096 +linear: model.layers.5.feed_forward.w3, in=4096, out=14336 +linear: model.layers.6.attention.wq, in=4096, out=4096 +linear: model.layers.6.attention.wk, in=4096, out=1024 +linear: model.layers.6.attention.wv, in=4096, out=1024 +linear: model.layers.6.attention.wo, in=4096, out=4096 +linear: model.layers.6.feed_forward.w1, in=4096, out=14336 +linear: model.layers.6.feed_forward.w2, in=14336, out=4096 +linear: model.layers.6.feed_forward.w3, in=4096, out=14336 +linear: model.layers.7.attention.wq, in=4096, out=4096 +linear: model.layers.7.attention.wk, in=4096, out=1024 +linear: model.layers.7.attention.wv, in=4096, out=1024 +linear: model.layers.7.attention.wo, in=4096, out=4096 +linear: model.layers.7.feed_forward.w1, in=4096, out=14336 +linear: model.layers.7.feed_forward.w2, in=14336, out=4096 +linear: model.layers.7.feed_forward.w3, in=4096, out=14336 +linear: model.layers.8.attention.wq, in=4096, out=4096 +linear: model.layers.8.attention.wk, in=4096, out=1024 +linear: model.layers.8.attention.wv, in=4096, out=1024 +linear: model.layers.8.attention.wo, in=4096, out=4096 +linear: model.layers.8.feed_forward.w1, in=4096, out=14336 +linear: 
model.layers.8.feed_forward.w2, in=14336, out=4096 +linear: model.layers.8.feed_forward.w3, in=4096, out=14336 +linear: model.layers.9.attention.wq, in=4096, out=4096 +linear: model.layers.9.attention.wk, in=4096, out=1024 +linear: model.layers.9.attention.wv, in=4096, out=1024 +linear: model.layers.9.attention.wo, in=4096, out=4096 +linear: model.layers.9.feed_forward.w1, in=4096, out=14336 +linear: model.layers.9.feed_forward.w2, in=14336, out=4096 +linear: model.layers.9.feed_forward.w3, in=4096, out=14336 +linear: model.layers.10.attention.wq, in=4096, out=4096 +linear: model.layers.10.attention.wk, in=4096, out=1024 +linear: model.layers.10.attention.wv, in=4096, out=1024 +linear: model.layers.10.attention.wo, in=4096, out=4096 +linear: model.layers.10.feed_forward.w1, in=4096, out=14336 +linear: model.layers.10.feed_forward.w2, in=14336, out=4096 +linear: model.layers.10.feed_forward.w3, in=4096, out=14336 +linear: model.layers.11.attention.wq, in=4096, out=4096 +linear: model.layers.11.attention.wk, in=4096, out=1024 +linear: model.layers.11.attention.wv, in=4096, out=1024 +linear: model.layers.11.attention.wo, in=4096, out=4096 +linear: model.layers.11.feed_forward.w1, in=4096, out=14336 +linear: model.layers.11.feed_forward.w2, in=14336, out=4096 +linear: model.layers.11.feed_forward.w3, in=4096, out=14336 +linear: model.layers.12.attention.wq, in=4096, out=4096 +linear: model.layers.12.attention.wk, in=4096, out=1024 +linear: model.layers.12.attention.wv, in=4096, out=1024 +linear: model.layers.12.attention.wo, in=4096, out=4096 +linear: model.layers.12.feed_forward.w1, in=4096, out=14336 +linear: model.layers.12.feed_forward.w2, in=14336, out=4096 +linear: model.layers.12.feed_forward.w3, in=4096, out=14336 +linear: model.layers.13.attention.wq, in=4096, out=4096 +linear: model.layers.13.attention.wk, in=4096, out=1024 +linear: model.layers.13.attention.wv, in=4096, out=1024 +linear: model.layers.13.attention.wo, in=4096, out=4096 +linear: model.layers.13.feed_forward.w1, in=4096, out=14336 +linear: model.layers.13.feed_forward.w2, in=14336, out=4096 +linear: model.layers.13.feed_forward.w3, in=4096, out=14336 +linear: model.layers.14.attention.wq, in=4096, out=4096 +linear: model.layers.14.attention.wk, in=4096, out=1024 +linear: model.layers.14.attention.wv, in=4096, out=1024 +linear: model.layers.14.attention.wo, in=4096, out=4096 +linear: model.layers.14.feed_forward.w1, in=4096, out=14336 +linear: model.layers.14.feed_forward.w2, in=14336, out=4096 +linear: model.layers.14.feed_forward.w3, in=4096, out=14336 +linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, 
out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: 
model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.12 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 16.83 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the heart of a mystical forest, there lived a young girl named Lily. Lily was a gentle soul with a heart full of love for all living things. She spent most of her days exploring the forest, learning about the creatures that lived there, and helping those in need. +One day, while out on a walk, Lily stumbled upon a small, delicate fairy who was struggling to free herself from a spider's web. 
Without hesitation, Lily gently touched the fairy and whispered words of encouragement, releasing the fairy from the web. +The fairy, grateful for Lily's kindness, thanked her and flew off into the trees. But before she disappeared, she promised Lily that she would return and repay her kindness in a most unexpected way. +Days turned into weeks, and weeks turned into months. Lily continued to explore the forest, helping those in need and spreading joy wherever she went. And then, one evening, as the sun began to set, the fairy reappeared. +This time, she was not alone. A magnificent dragon, with scales that shone like gold and eyes that burned with kindness, stood beside her. The fairy introduced the dragon as her friend and protector, and told Lily that he had been watching her from afar, impressed by her self +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 38.3536 sec total +Time to first token: 0.2855 sec with parallel prefill. + + Total throughput: 6.6747 tokens/sec, 0.1498 s/token +First token throughput: 3.5024 tokens/sec, 0.2855 s/token + Next token throughput: 6.6985 tokens/sec, 0.1493 s/token + +Bandwidth achieved: 32.84 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a far-off kingdom, there was a beautiful princess named Sophia. Sophia had long, flowing hair, sparkling blue eyes, and a heart as kind as the morning sun. She lived in a magnificent castle surrounded by rolling green hills and sparkling rivers. +Princess Sophia was loved by all, and everyone in the kingdom adored her. She spent her days playing with the castle's little animals, gardening in the castle's beautiful gardens, and helping the poor people of the kingdom. She loved nothing more than bringing joy to others and spreading happiness around her. +One day, a wicked sorcerer cast a cruel spell on Princess Sophia, changing her into a beautiful swan. The sorcerer laughed with glee, thinking that he had defeated the kind princess. But, he did not know that being a swan was not the end of Princess Sophia's kindness and courage. The swan flew high up in the sky, feeling the wind beneath her wings. +The people of the kingdom were heartbroken when they saw the princess changed into a swan. They cried, "Oh, poor Princess Sophia! What have we done to deserve this cruel fate?" +However, Princess Sophia, the swan, saw it differently. She was free to fly wherever she wanted and explore the kingdom +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 36.5991 sec total +Time to first token: 0.2562 sec with parallel prefill. + + Total throughput: 6.9947 tokens/sec, 0.1430 s/token +First token throughput: 3.9026 tokens/sec, 0.2562 s/token + Next token throughput: 7.0165 tokens/sec, 0.1425 s/token + +Bandwidth achieved: 34.42 GB/s + +======================================== + +Once upon a time, when King Pravardhan was the ruler of Ayodhya, a Brahmin named Manu was living a simple life. His wife, Jayanti, was a woman of great faith and devotion. Manu was the only son of his parents, who had passed away at a very young age. His grandmother, Renuka, had taken care of him since then. +The king, Pravardhan, had a great desire to conquer the surrounding kingdoms and expand his empire. He had sent a messenger to the king of Videha, who was his rival, asking him to hand over his kingdom or face the consequences. 
+The king of Videha, who was a just and wise ruler, was not willing to give up his kingdom. He thought that the king of Ayodhya was not fit for the task of expansion. +The king of Videha sent a reply to the messenger, saying that he would not give up his kingdom, no matter what King Pravardhan threatened. Manu, who was at home listening to the conversation between his grandmother, Renuka, and a visiting priest, overheard the conversation about King Pravardhan's ambitions. At the time, King Pravardhan had put a condition in front of the king +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 35.4345 sec total +Time to first token: 0.2590 sec with parallel prefill. + + Total throughput: 7.2246 tokens/sec, 0.1384 s/token +First token throughput: 3.8609 tokens/sec, 0.2590 s/token + Next token throughput: 7.2494 tokens/sec, 0.1379 s/token + +Bandwidth achieved: 35.55 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 6.96 +Average tokens/sec (first token): 3.76 +Average tokens/sec (next tokens): 6.99 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_eager_8.txt b/llama31-1213/cpu_eager_8.txt new file mode 100644 index 000000000..d72687f17 --- /dev/null +++ b/llama31-1213/cpu_eager_8.txt @@ -0,0 +1,74 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.12 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 28.01 seconds +----------------------------------------------------------- +Once upon a time, there was a young man named Max. Max was an avid hiker. He loved nothing more than setting out on a new trail, exploring the great outdoors, and taking in the breathtaking views. One day, Max decided to tackle the infamous "Devil's Backbone" trail, a notorious path known for its steep incline and treacherous terrain. +As he made his way up the mountain, Max felt a sense of accomplishment and pride in his physical abilities. However, as the sun began to set, casting a golden glow over the landscape, Max realized he had underestimated the trail's difficulty. The terrain grew increasingly treacherous, with loose rocks and steep drop-offs making every step a challenging one. +Despite his growing fatigue, Max persisted, determined to reach the summit. As he climbed higher, the air grew thinner, and Max began to feel lightheaded. He knew he had to slow down and pace himself, but his desire to reach the top was too strong. 
+ +Just as Max was starting to flag, he spotted a group of hikers huddled together, taking a break and sipping water from their canteens. Max made his way over to them, grateful for the rest and refreshment. As he sat down alongside the group, one +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 60.2183 sec total +Time to first token: 0.4795 sec with parallel prefill. + + Total throughput: 4.2512 tokens/sec, 0.2352 s/token +First token throughput: 2.0856 tokens/sec, 0.4795 s/token + Next token throughput: 4.2686 tokens/sec, 0.2343 s/token + +Bandwidth achieved: 36.39 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, there was a young woman named Hannah who lived in a small village surrounded by a dense forest. She was a skilled baker and ran a small bakery in the village, where she spent her days baking delicious bread, pastries, and cakes. +One day, a mysterious stranger arrived in the village, carrying a large, ornate box. He introduced himself as a traveling merchant and said he had come to the village to sell his wares. However, Hannah noticed that the stranger seemed out of place, and she couldn’t shake the feeling that he was hiding something. +As the days passed, strange occurrences began to happen in the village. People would report seeing the merchant lurking around their homes at night, and some even claimed to have seen him talking to the spirits of the forest. The villagers grew increasingly uneasy, and they began to whisper among themselves about the merchant’s true intentions. +Hannah, being the curious and brave soul that she was, decided to investigate the merchant further. She started by asking him questions, trying to get to the bottom of the strange happenings in the village. However, the merchant seemed evasive and secretive, and Hannah couldn’t get any concrete answers out of him. +Determined to uncover the truth, Hannah decided to sneak into the merchant’s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 54.9780 sec total +Time to first token: 0.3048 sec with parallel prefill. + + Total throughput: 4.6564 tokens/sec, 0.2148 s/token +First token throughput: 3.2805 tokens/sec, 0.3048 s/token + Next token throughput: 4.6641 tokens/sec, 0.2144 s/token + +Bandwidth achieved: 39.85 GB/s + +======================================== + +Once upon a time, a small village nestled in the heart of a vast and mysterious forest. The villagers lived simple lives, relying on the forest for their food, shelter, and livelihood. They knew the forest like the back of their hand, but they also knew that it held many secrets and dangers. +One day, a young boy named Kaito wandered into the forest, searching for a rare herb to help his ailing mother. As he ventured deeper into the woods, the trees seemed to grow taller and the shadows darker. Kaito felt a sense of unease, but he pressed on, his determination driving him forward. +Suddenly, a faint glow appeared in the distance. Kaito's curiosity got the better of him, and he approached the light cautiously. As he drew closer, he saw a beautiful fairy perched on a toadstool, surrounded by a halo of soft, ethereal light. +The fairy's wings sparkled like diamonds, and her hair shone with a soft, golden glow. She gazed at Kaito with kind, ancient eyes and spoke in a voice that was both melodious and mysterious. +"Greetings, young one," she said. 
"I have been waiting for you. My name is Lyra, and I am the guardian +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 48.1160 sec total +Time to first token: 0.3134 sec with parallel prefill. + + Total throughput: 5.3205 tokens/sec, 0.1880 s/token +First token throughput: 3.1909 tokens/sec, 0.3134 s/token + Next token throughput: 5.3344 tokens/sec, 0.1875 s/token + +Bandwidth achieved: 45.54 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 4.74 +Average tokens/sec (first token): 2.85 +Average tokens/sec (next tokens): 4.76 + +Memory used: 0.00 GB diff --git a/llama31-1213/cpu_eager_b16.txt b/llama31-1213/cpu_eager_b16.txt new file mode 100644 index 000000000..8e0240578 --- /dev/null +++ b/llama31-1213/cpu_eager_b16.txt @@ -0,0 +1,75 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.10 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, there was a beautiful, lively and prosperous city that attracted tourists and business travelers from all over the world. Known for its modern architecture, bustling streets and spectacular landmarks, it was indeed a must-visit destination for anyone who sought to immerse themselves in the excitement of urban life. +However, one day, a powerful tornado swept through the city, causing widespread destruction, rendering several buildings un inhabitable, and disrupting the lives of its inhabitants. The once-thriving metropolis turned into a chaotic, dirty and polluted environment, posing a significant threat to the health, safety and well-being of its residents. +Despite the efforts of local and national authorities to provide assistance, many people were still left without access to basic necessities, including clean water, sanitation facilities, and healthcare services. As the city struggled to recover, its residents felt a sense of hopelessness and despair, wondering if they would ever be able to rebuild their homes, their lives and their community. +Here, we will explore how a sustainable city, which combines social, economic and environmental considerations, can be designed and developed to mitigate the impact of natural disasters such as the tornado, while also ensuring the well-being and satisfaction of its inhabitants. 
+ +### Designing a Sustainable City +To create a resilient and sustainable city, several +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 106.8963 sec total +Time to first token: 0.6166 sec with parallel prefill. + + Total throughput: 2.3948 tokens/sec, 0.4176 s/token +First token throughput: 1.6219 tokens/sec, 0.6166 s/token + Next token throughput: 2.3993 tokens/sec, 0.4168 s/token + +Bandwidth achieved: 38.46 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a small village nestled in the rolling hills of Provence, there lived a young girl named Sophie. Sophie was a delightful child with a heart full of love and a spirit that shone brightly like the sun on a warm summer day. She lived with her parents in a cozy little house surrounded by a garden filled with the sweet scent of lavender and the soft songs of birds. +Sophie loved to spend her days exploring the countryside, playing with the animals, and helping her mother in the garden. She was a curious child, always asking questions and seeking answers about the world around her. Her parents, though sometimes tired from their daily chores, encouraged Sophie's curiosity and nurtured her love for learning. +One day, as Sophie was wandering through the garden, she stumbled upon a small, hidden path she had never seen before. The path was overgrown with weeds and vines, and it seemed to lead to a secret place that was hidden from the rest of the world. Sophie's curiosity was piqued, and she felt an inexplicable pull to follow the path. +She pushed aside the tangled growth and began to make her way down the winding path. The air grew sweeter, and the scent of lavender grew stronger as she walked. The path led her to a +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 111.0549 sec total +Time to first token: 0.3540 sec with parallel prefill. + + Total throughput: 2.3052 tokens/sec, 0.4338 s/token +First token throughput: 2.8249 tokens/sec, 0.3540 s/token + Next token throughput: 2.3035 tokens/sec, 0.4341 s/token + +Bandwidth achieved: 37.02 GB/s + +======================================== + +Once upon a time, there were three companies that specialized in producing high end audio equipment. The three companies were Sennheiser, Beyerdynamic, and Audio-Technica. +Sennheiser, founded by Fritz Sennheiser in 1945, had been known for producing high-quality microphones and headphones for decades. Their products were widely used by professionals in the music industry and were known for their durability and sound quality. They were particularly famous for their iconic HD 280 and HD 380 headphones. +Beyerdynamic, established in 1924 by Josef Beyer, had also been a prominent name in the world of high-end audio. They were known for their high-quality microphones and headphones that were used by many famous musicians, including The Beatles. Their DT 770 and DT 990 headphones were particularly popular among audiophiles. +Audio-Technica, founded by Hideo Matsushita in 1962, was a Japanese company that had been producing high-quality audio equipment for over 50 years. They were known for their high-end headphones, such as the ATH-M50x and ATH-M40x, which were widely used by audio engineers and musicians. +One day, the three companies decided to compete with each other in a product development competition. 
The +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 107.0137 sec total +Time to first token: 0.3876 sec with parallel prefill. + + Total throughput: 2.3922 tokens/sec, 0.4180 s/token +First token throughput: 2.5797 tokens/sec, 0.3876 s/token + Next token throughput: 2.3915 tokens/sec, 0.4181 s/token + +Bandwidth achieved: 38.42 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 2.36 +Average tokens/sec (first token): 2.34 +Average tokens/sec (next tokens): 2.36 + +Memory used: 0.00 GB diff --git a/llama31-1213/cuda_compile_4.txt b/llama31-1213/cuda_compile_4.txt new file mode 100644 index 000000000..93fabb62c --- /dev/null +++ b/llama31-1213/cuda_compile_4.txt @@ -0,0 +1,79 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 6.17 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.57 seconds +----------------------------------------------------------- +Once upon a time, there was a small village surrounded by beautiful mountains and valleys. The villagers lived a peaceful life, surrounded by nature's splendor. They spent their days farming, hunting, and fishing, and they cherished every moment of their simple yet fulfilling lives. + +One day, a wise old man came to the village. He was a traveler who had walked for many days, and he looked tired but content. The villagers were curious about this stranger, and they welcomed him with open arms. + +The old man told the villagers that he had come to share a special gift with them. He said that he had learned from his travels how to harness the power of the wind and the water to make their lives easier and more efficient. + +The villagers were skeptical at first, but the old man's words were persuasive. He showed them how to build simple windmills that could power their homes, and how to dig canals that could bring fresh water from the mountains to their fields. + +The villagers were amazed by the old +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 256.3477 sec total +Time to first token: 0.2921 sec with parallel prefill. + + Total throughput: 0.7802 tokens/sec, 1.2817 s/token +First token throughput: 3.4232 tokens/sec, 0.2921 s/token + Next token throughput: 0.7772 tokens/sec, 1.2867 s/token + +Bandwidth achieved: 12.53 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. 
*** +just-in-time compilation time (incl run time): 2.6e+02 seconds + +======================================== + +Once upon a time, a lot of people thought that the world was flat. +They thought that if you sailed far enough on a ship, you would eventually fall off the edge of the world. +If you were standing on the edge of a cliff, you could literally see over the edge of the world! +If you walked too far away from a tree, you could see that the world stopped. +A very long time ago, people did believe that the Earth was flat. However, people who sailed across the oceans, like Christopher Columbus, knew that the Earth was a sphere. A large number of people in the world in the 15th century believed that the Earth was a sphere, but the Catholic Church did not agree. They believed the only way for a ship to reach the East Indies was on the back of a giant sea turtle or a huge, invisible elephant that carried a magic carpet! Can you believe this? But in 1531, German astronomer Andreas Osiander convinced the Catholic Church that the Earth +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 4.0837 sec total +Time to first token: 0.0905 sec with parallel prefill. + + Total throughput: 48.9753 tokens/sec, 0.0204 s/token +First token throughput: 11.0490 tokens/sec, 0.0905 s/token + Next token throughput: 49.8349 tokens/sec, 0.0201 s/token + +Bandwidth achieved: 786.57 GB/s + +======================================== + +Once upon a time, there was a vast and mysterious forest, full of magic and wonder. At the heart of this forest, there was a magnificent castle, a symbol of power and prestige. The kingdom that ruled from this castle had been known for its wisdom and kindness, loved by all who lived in the surrounding lands. +The ruler of the kingdom was the wise and just king, who loved his people with all his heart and ruled with compassion and empathy. He had an unbroken policy of peace and as such, his kingdom had become a beacon of hope and peace. +In his kingdom, there lived a young girl named Sophia. She was a beautiful and kind-hearted girl, full of laughter and smile. She was a happy-go-lucky person, loved by everyone who knew her. Her beautiful brown eyes sparkled with joy, and her soft golden hair added to her beauty. She was well-known for her sweet nature, always willing to lend a helping hand to those who needed it. + +Sophia lived in the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 2.8753 sec total +Time to first token: 0.0876 sec with parallel prefill. 
+ + Total throughput: 69.5580 tokens/sec, 0.0144 s/token +First token throughput: 11.4143 tokens/sec, 0.0876 s/token + Next token throughput: 71.3852 tokens/sec, 0.0140 s/token + +Bandwidth achieved: 1117.14 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 59.27 +Average tokens/sec (first token): 11.23 +Average tokens/sec (next tokens): 60.61 + +Memory used: 24.75 GB diff --git a/llama31-1213/cuda_compile_8.txt b/llama31-1213/cuda_compile_8.txt new file mode 100644 index 000000000..1cade3d08 --- /dev/null +++ b/llama31-1213/cuda_compile_8.txt @@ -0,0 +1,72 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 6.04 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.41 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of Tuscany, there lived a young woman named Giulia. Giulia was a talented chef and baker, known throughout the village for her delicious pastries and breads. She had a reputation for using only the freshest ingredients and traditional techniques to create her creations. +One day, a wealthy merchant from Florence arrived in the village, seeking the best baker in the land. He had heard of Giulia's exceptional skills and had come to sample her wares. Giulia, eager to impress, spent hours preparing a special batch of her famous sweet bread, infused with the fragrance of rose petals and the sweetness of honey. +As the merchant waited in her small shop, the aroma of freshly baked bread wafted through the air, enticing the villagers to come and sample Giulia's creations. The merchant, impressed by the variety of breads and pastries on display, asked Giulia to prepare a special dessert for him. Giulia, +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 228.1775 sec total +Time to first token: 0.6230 sec with parallel prefill. + + Total throughput: 0.8765 tokens/sec, 1.1409 s/token +First token throughput: 1.6051 tokens/sec, 0.6230 s/token + Next token throughput: 0.8745 tokens/sec, 1.1435 s/token + +Bandwidth achieved: 7.50 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 2.3e+02 seconds + +======================================== + +Once upon a time, in a small coastal town, there was a bustling fish market. 
Every day, fishermen would bring in their catch of the day, and the market would be filled with the lively smell of fresh seafood. The locals would gather to buy their daily supplies, and the market would buzz with conversation. +Among the fishermen was a young boy named Jack. Jack had grown up on the water and was fascinated by the sea and its creatures. He would often help his father, a skilled fisherman, on their vessels, learning the ways of the ocean and the art of fishing. +One day, while Jack was helping his father, they came across a rare and mysterious sea creature. It was a giant squid, unlike any they had ever seen before. The squid was unlike any other they had seen before, with its distinctive shape and the striking patterns on its body. +The fisherman were amazed by the creature's sheer size and its elusive nature. They carefully examined the squid's body, taking note of its unique +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 4.4041 sec total +Time to first token: 0.0858 sec with parallel prefill. + + Total throughput: 45.4117 tokens/sec, 0.0220 s/token +First token throughput: 11.6542 tokens/sec, 0.0858 s/token + Next token throughput: 46.0825 tokens/sec, 0.0217 s/token + +Bandwidth achieved: 388.67 GB/s + +======================================== + +Once upon a time, in a village nestled between two great mountains, there lived a young girl named Ava. Ava loved to explore the world around her, and one day, she stumbled upon a hidden pathway she had never seen before. The path was overgrown with vines and shrubs, but Ava was curious, and her heart was full of wonder. She pushed aside the foliage and stepped onto the winding road. + +As she walked, Ava noticed that the air grew thick with the scent of blooming flowers. The path began to wind its way up the mountain, and Ava found herself surrounded by an array of vibrant colors. She walked in awe, feeling the gentle breeze caress her face and the warmth of the sun on her skin. + +The path began to narrow, and Ava found herself walking alongside a sparkling stream. The sound of the water was like music to her ears, and she couldn't help but skip along the path, feeling carefree and alive. As she walked, the trees grew taller, and the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 2.2439 sec total +Time to first token: 0.0816 sec with parallel prefill. 
+ + Total throughput: 89.1296 tokens/sec, 0.0112 s/token +First token throughput: 12.2534 tokens/sec, 0.0816 s/token + Next token throughput: 92.0310 tokens/sec, 0.0109 s/token + +Bandwidth achieved: 762.85 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 67.27 +Average tokens/sec (first token): 11.95 +Average tokens/sec (next tokens): 69.06 + +Memory used: 28.81 GB diff --git a/llama31-1213/cuda_compile_b16.txt b/llama31-1213/cuda_compile_b16.txt new file mode 100644 index 000000000..c37f04a5e --- /dev/null +++ b/llama31-1213/cuda_compile_b16.txt @@ -0,0 +1,74 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 5.81 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, in the bustling city of New York, there lived a young artist named Emma. Emma was a painter, and her passion was to capture the beauty of the city through her art. She spent most of her days wandering the streets of Manhattan, observing the people, the architecture, and the ever-changing scenery. +One day, while walking through Central Park, Emma stumbled upon a small, quaint shop tucked away in a quiet corner. The sign above the door read "Moonlit Dreams," and the windows were filled with an assortment of curious objects that seemed to spark Emma's curiosity. +She pushed open the door, and a bell above it rang out, announcing her arrival. The shop was dimly lit, with soft music playing in the background, creating an enchanting atmosphere. Emma's eyes wandered around the shop, taking in the eclectic assortment of items on display. +There were vintage clothing, antique furniture, and an assortment of peculiar collectibles that seemed to be from another era. Emma's eyes landed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 222.5732 sec total +Time to first token: 0.6159 sec with parallel prefill. + + Total throughput: 0.8986 tokens/sec, 1.1129 s/token +First token throughput: 1.6237 tokens/sec, 0.6159 s/token + Next token throughput: 0.8966 tokens/sec, 1.1154 s/token + +Bandwidth achieved: 14.43 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 2.2e+02 seconds + +======================================== + +Once upon a time, in a small village, there lived a young baker named Sarah. Sarah was known throughout the village for her delicious pastries and bread. 
She would often wake up before dawn to mix and knead the dough, and by the time the sun was rising, the aroma of freshly baked goods wafted through the village, teasing everyone's taste buds. + +One day, a wealthy merchant arrived in the village, seeking the finest pastries for his important guests. He visited Sarah's bakery, and as he sampled her creations, he was impressed by the flavors and textures. He offered Sarah a handsome sum of money to bake a special cake for his guests, but only if she could make it to his exact specifications. + +Sarah, eager to earn the money and showcase her skills, accepted the challenge. She spent hours perfecting the recipe, experimenting with different ingredients and techniques to meet the merchant's demands. As the deadline approached, Sarah became increasingly anxious, fearing that she might not be able to deliver the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 4.7174 sec total +Time to first token: 0.0634 sec with parallel prefill. + + Total throughput: 42.3962 tokens/sec, 0.0236 s/token +First token throughput: 15.7646 tokens/sec, 0.0634 s/token + Next token throughput: 42.7592 tokens/sec, 0.0234 s/token + +Bandwidth achieved: 680.91 GB/s + +======================================== + +Once upon a time, in a tropical paradise, there was a magnificent tree named Thimbu. Thimbu was the largest tree in the forest, with a trunk as wide as a house and branches that stretched up to the sky like giant arms. Its leaves were a vibrant green and its flowers were a brilliant shade of pink. Thimbu was a beloved member of the forest community, providing shade for the animals and shelter from the storms. + +However, Thimbu had a secret: it was dying. A disease had taken hold of the tree, slowly eating away at its insides. Thimbu tried to hide its condition from the other trees and animals, but they began to notice its gradual decline. The once-strong trunk was now weakened, and the branches were starting to wither and fall. + +One day, a wise old owl named Haku came to visit Thimbu. Haku was known for his understanding of the natural world and his ability to heal even the most grievous wounds. He +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 2.6676 sec total +Time to first token: 0.0646 sec with parallel prefill. 
+ + Total throughput: 74.9739 tokens/sec, 0.0133 s/token +First token throughput: 15.4896 tokens/sec, 0.0646 s/token + Next token throughput: 76.4492 tokens/sec, 0.0131 s/token + +Bandwidth achieved: 1204.12 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 58.69 +Average tokens/sec (first token): 15.63 +Average tokens/sec (next tokens): 59.60 + +Memory used: 16.91 GB diff --git a/llama31-1213/cuda_eager_4.txt b/llama31-1213/cuda_eager_4.txt new file mode 100644 index 000000000..65eb13477 --- /dev/null +++ b/llama31-1213/cuda_eager_4.txt @@ -0,0 +1,73 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 5.77 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.58 seconds +----------------------------------------------------------- +Once upon a time, there was a young man who wanted to learn how to play the guitar. He had always been fascinated by the sounds and rhythms of the instrument and wanted to be able to play it himself. So, he went out and bought himself a guitar and started practicing every day. +At first, it was tough. His fingers would hurt and his hands would cramp. He would get frustrated when he couldn't get the chords right or when the strings would buzz. But he didn't give up. He kept practicing, and slowly but surely, he started to get better. +As he got more confident, he started to play with other people, joining a band or playing at open mic nights. He even started to write his own music, and people began to take notice of his talent. +He became known as a skilled guitarist, and people would come from all over to hear him play. He would play at weddings and parties, and even landed a record deal with a big music label. +The once young +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 19.6881 sec total +Time to first token: 0.3371 sec with parallel prefill. + + Total throughput: 10.1584 tokens/sec, 0.0984 s/token +First token throughput: 2.9664 tokens/sec, 0.3371 s/token + Next token throughput: 10.2837 tokens/sec, 0.0972 s/token + +Bandwidth achieved: 163.15 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a small village surrounded by rolling hills and lush green forests, there lived a young girl named Lily. She was a kind and gentle soul, with a heart full of love for all living things. 
Lily lived with her grandmother, who was a wise and respected woman in the village. Together, they tended to a beautiful garden filled with vibrant flowers, sweet-smelling herbs, and juicy fruits. + +One day, while Lily was out collecting wildflowers, she stumbled upon a tiny, furry creature with big brown eyes. The creature, who introduced himself as Finn, was a lost and scared little rabbit. Lily immediately took pity on him and decided to take him home, where her grandmother welcomed him with open arms. + +As the days went by, Lily and her grandmother took care of Finn, feeding him nutritious food and giving him a warm and cozy place to sleep. They soon discovered that Finn had a remarkable talent – he could sense and detect even the slightest changes in his surroundings, from the sweetest +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 19.7267 sec total +Time to first token: 0.0860 sec with parallel prefill. + + Total throughput: 10.1386 tokens/sec, 0.0986 s/token +First token throughput: 11.6323 tokens/sec, 0.0860 s/token + Next token throughput: 10.1320 tokens/sec, 0.0987 s/token + +Bandwidth achieved: 162.83 GB/s + +======================================== + +Once upon a time, in a small village surrounded by vast fields of wheat and barley, there lived a young girl named Sophia. Sophia was known throughout the village for her incredible beauty and her extraordinary talent as a singer. She had a voice that could calm the savage beast and lift the spirits of even the most downtrodden person. +One day, a grand procession came to the village, led by the King himself, who was known for his fairness and kindness. The villagers were overjoyed and gathered in large numbers to catch a glimpse of their monarch. +As the procession marched through the village, Sophia stood on the edge of the crowd, her eyes fixed on the regal figure of the King. She was awestruck by his grandeur and majesty. The King, too, noticed her and was struck by her beauty and poise. He ordered his guards to bring the girl closer to him. +As Sophia approached the King, she bowed low and greeted him with the traditional village customs. The King, +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 19.5912 sec total +Time to first token: 0.0945 sec with parallel prefill. 
+ + Total throughput: 10.2087 tokens/sec, 0.0980 s/token +First token throughput: 10.5812 tokens/sec, 0.0945 s/token + Next token throughput: 10.2069 tokens/sec, 0.0980 s/token + +Bandwidth achieved: 163.96 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 10.17 +Average tokens/sec (first token): 8.39 +Average tokens/sec (next tokens): 10.21 + +Memory used: 24.80 GB diff --git a/llama31-1213/cuda_eager_8.txt b/llama31-1213/cuda_eager_8.txt new file mode 100644 index 000000000..49f68c7ca --- /dev/null +++ b/llama31-1213/cuda_eager_8.txt @@ -0,0 +1,70 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 5.77 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.40 seconds +----------------------------------------------------------- +Once upon a time, in the northern region of Peru, there was a young boy named César. César was a curious and adventurous boy who loved exploring the Andes mountains with his family. One day, while they were out on a hike, César stumbled upon an ancient Inca gold nugget. As soon as he touched it, he felt an strange sensation, like a sudden rush of excitement and energy. + +As he picked up the nugget, he noticed that it was glowing with a soft, golden light. César felt drawn to the nugget, as if it were calling to him. He couldn't resist the urge to hold it closer and examine it more closely. As he did, he felt a sudden jolt of electricity run through his body, and everything around him began to shimmer and shine. + +Suddenly, César found himself transported through time and space to the era of the Inca Empire. He found himself standing in the middle of a bustling marketplace, surrounded +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 17.0737 sec total +Time to first token: 0.6103 sec with parallel prefill. + + Total throughput: 11.7139 tokens/sec, 0.0854 s/token +First token throughput: 1.6385 tokens/sec, 0.6103 s/token + Next token throughput: 12.0875 tokens/sec, 0.0827 s/token + +Bandwidth achieved: 100.26 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a distant land, there was a mystical realm called the kingdom of Eldoria. Eldoria was a land of breathtaking beauty, where towering spires of crystal pierced the sky and lush forests stretched as far as the eye could see. 
The kingdom was home to a variety of magical creatures, each with their own unique abilities and characteristics. +In the heart of Eldoria, there lived a young apprentice named Eira. Eira was a skilled weaver of magical threads, able to craft powerful spells and enchantments that could manipulate the very fabric of reality. She was a novice in the art of magic, but her innate talent and dedication had already earned her a reputation as one of the most promising young mages in the kingdom. +Eira's mentor, the wise and venerable mage named Lyra, had taken her under her wing and was teaching her the intricacies of magic. Lyra was a master weaver of threads, able to create complex spells that could bend time and space to +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 16.7426 sec total +Time to first token: 0.0711 sec with parallel prefill. + + Total throughput: 11.9456 tokens/sec, 0.0837 s/token +First token throughput: 14.0621 tokens/sec, 0.0711 s/token + Next token throughput: 11.9365 tokens/sec, 0.0838 s/token + +Bandwidth achieved: 102.24 GB/s + +======================================== + +Once upon a time, as the last remnants of a dying sun set over a western horizon, two adventurers embarked upon a journey through a land of mystery and wonder. They were a duo of unlikely friends, a grizzled old trapper named Jack and a young and ambitious scientist named Alex. Jack had spent his life roaming the wilderness, tracking and trapping the creatures that lived within it, while Alex had studied the ancient lore and mythologies of the land. Their paths had crossed by chance, and now they found themselves bound together on a quest to uncover the secrets of a long-lost civilization. +As they journeyed deeper into the heart of the unknown, they encountered strange and wondrous sights: towering crystal spires that pierced the sky like shards of glass, and ancient ruins that seemed to hold the whispers of the past within their crumbling stones. They spoke with the spirits of the land, who shared with them the secrets of the long-lost civilization that had once thrived here. +Their journey took them through tre +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 16.5982 sec total +Time to first token: 0.0735 sec with parallel prefill. 
+ + Total throughput: 12.0495 tokens/sec, 0.0830 s/token +First token throughput: 13.6109 tokens/sec, 0.0735 s/token + Next token throughput: 12.0426 tokens/sec, 0.0830 s/token + +Bandwidth achieved: 103.13 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 11.90 +Average tokens/sec (first token): 9.77 +Average tokens/sec (next tokens): 12.02 + +Memory used: 28.86 GB diff --git a/llama31-1213/cuda_eager_b16.txt b/llama31-1213/cuda_eager_b16.txt new file mode 100644 index 000000000..a51a25e91 --- /dev/null +++ b/llama31-1213/cuda_eager_b16.txt @@ -0,0 +1,72 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +PyTorch version 2.6.0.dev20241213+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 6.09 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, in a galaxy far, far away, there existed a magical place called the Forest of Dreams. This enchanted land was home to a magnificent, ancient tree known as the Tree of Wisdom. +As the sun set, and the stars began to twinkle, the Tree of Wisdom would transform into a magnificent sight to behold. Its trunk would shimmer with a soft, ethereal glow, and its branches would stretch out like a canopy of sparkling diamonds, reaching for the stars. +Within the forest, there lived a young adventurer named Luna. Luna was a curious and brave soul, with a heart full of wonder and a mind full of questions. She was determined to find the Tree of Wisdom, to unlock its secrets, and to gain the wisdom that it held. +One evening, Luna set out on her journey, following the winding paths and glistening streams of the Forest of Dreams. As she wandered deeper into the forest, the shadows grew darker, and the silence grew thicker, like a heavy mist. +L +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 14.0259 sec total +Time to first token: 0.6336 sec with parallel prefill. + + Total throughput: 14.2593 tokens/sec, 0.0701 s/token +First token throughput: 1.5783 tokens/sec, 0.6336 s/token + Next token throughput: 14.8593 tokens/sec, 0.0673 s/token + +Bandwidth achieved: 229.01 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, at the end of the last Ice Age, a strange object fell from the sky to the earth in the modern-day state of Nebraska, USA. The object was a metallic meteorite, known as the Sutter's Mill meteorite, which weighed over 1,000 kilograms (2,200 pounds) and was heavily shaped by its passage through the atmosphere. 
+The meteorite was one of the most significant meteorites discovered in history, containing a large quantity of extraterrestrial material, including tiny fragments of the sun's photosphere, the surface of the sun and other celestial objects. +Although the meteorite itself was not radioactive, it contained a small amount of noble gas such as Krypton, which was trapped inside the rocky fragments of the meteorite. Scientists were able to analyze the Krypton to learn more about the age of the meteorite, which turned out to be approximately 6.2 million years old. +The analysis of the meteorite revealed that it was a fragment of the largest +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 12.6627 sec total +Time to first token: 0.0586 sec with parallel prefill. + + Total throughput: 15.7945 tokens/sec, 0.0633 s/token +First token throughput: 17.0624 tokens/sec, 0.0586 s/token + Next token throughput: 15.7886 tokens/sec, 0.0633 s/token + +Bandwidth achieved: 253.67 GB/s + +======================================== + +Once upon a time, a young girl named Sophia lived in a small village surrounded by vast fields and dense forests. She was a curious and adventurous child, always eager to explore the world around her. One day, while wandering through the forest, Sophia stumbled upon a mysterious and magical garden. The garden was filled with beautiful flowers, sparkling fountains, and towering trees that seemed to stretch up to the sky. +As Sophia explored the garden, she came across a talking tree named Oakley. Oakley was an ancient and wise tree, with a deep understanding of the natural world. He welcomed Sophia to the garden and offered to show her the secrets and wonders that lay within. +Together, Sophia and Oakley explored the garden, discovering hidden paths, sparkling waterfalls, and secret glades. Along the way, Oakley taught Sophia about the magic of the natural world, from the way the trees communicated with each other through their roots, to the way the flowers bloomed in response to the changing seasons. +As the sun +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 12.2806 sec total +Time to first token: 0.0539 sec with parallel prefill. 
+ + Total throughput: 16.2858 tokens/sec, 0.0614 s/token +First token throughput: 18.5675 tokens/sec, 0.0539 s/token + Next token throughput: 16.2758 tokens/sec, 0.0614 s/token + +Bandwidth achieved: 261.56 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 15.45 +Average tokens/sec (first token): 12.40 +Average tokens/sec (next tokens): 15.64 + +Memory used: 16.68 GB diff --git a/llama31-1218/cpu_aoti_4.txt b/llama31-1218/cpu_aoti_4.txt new file mode 100644 index 000000000..1fd4518ca --- /dev/null +++ b/llama31-1218/cpu_aoti_4.txt @@ -0,0 +1,655 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model34.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model34.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model34.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +linear: model.layers.0.attention.wq, in=4096, out=4096 +linear: model.layers.0.attention.wk, in=4096, out=1024 +linear: model.layers.0.attention.wv, in=4096, out=1024 +linear: model.layers.0.attention.wo, in=4096, out=4096 +linear: model.layers.0.feed_forward.w1, in=4096, out=14336 +linear: model.layers.0.feed_forward.w2, in=14336, out=4096 +linear: model.layers.0.feed_forward.w3, in=4096, out=14336 +linear: model.layers.1.attention.wq, in=4096, out=4096 +linear: model.layers.1.attention.wk, in=4096, out=1024 +linear: model.layers.1.attention.wv, in=4096, out=1024 +linear: model.layers.1.attention.wo, in=4096, out=4096 +linear: model.layers.1.feed_forward.w1, in=4096, out=14336 +linear: model.layers.1.feed_forward.w2, in=14336, out=4096 +linear: model.layers.1.feed_forward.w3, in=4096, out=14336 +linear: model.layers.2.attention.wq, in=4096, out=4096 +linear: model.layers.2.attention.wk, in=4096, out=1024 +linear: model.layers.2.attention.wv, in=4096, out=1024 +linear: model.layers.2.attention.wo, in=4096, out=4096 +linear: model.layers.2.feed_forward.w1, in=4096, out=14336 +linear: model.layers.2.feed_forward.w2, in=14336, out=4096 +linear: model.layers.2.feed_forward.w3, in=4096, out=14336 +linear: model.layers.3.attention.wq, in=4096, out=4096 +linear: model.layers.3.attention.wk, in=4096, out=1024 +linear: model.layers.3.attention.wv, in=4096, out=1024 +linear: model.layers.3.attention.wo, in=4096, out=4096 +linear: model.layers.3.feed_forward.w1, in=4096, out=14336 +linear: model.layers.3.feed_forward.w2, in=14336, out=4096 +linear: model.layers.3.feed_forward.w3, in=4096, out=14336 +linear: model.layers.4.attention.wq, in=4096, out=4096 +linear: model.layers.4.attention.wk, in=4096, out=1024 +linear: model.layers.4.attention.wv, in=4096, out=1024 +linear: model.layers.4.attention.wo, in=4096, out=4096 +linear: model.layers.4.feed_forward.w1, in=4096, out=14336 +linear: model.layers.4.feed_forward.w2, in=14336, out=4096 +linear: model.layers.4.feed_forward.w3, in=4096, out=14336 +linear: model.layers.5.attention.wq, in=4096, out=4096 +linear: 
model.layers.5.attention.wk, in=4096, out=1024 +linear: model.layers.5.attention.wv, in=4096, out=1024 +linear: model.layers.5.attention.wo, in=4096, out=4096 +linear: model.layers.5.feed_forward.w1, in=4096, out=14336 +linear: model.layers.5.feed_forward.w2, in=14336, out=4096 +linear: model.layers.5.feed_forward.w3, in=4096, out=14336 +linear: model.layers.6.attention.wq, in=4096, out=4096 +linear: model.layers.6.attention.wk, in=4096, out=1024 +linear: model.layers.6.attention.wv, in=4096, out=1024 +linear: model.layers.6.attention.wo, in=4096, out=4096 +linear: model.layers.6.feed_forward.w1, in=4096, out=14336 +linear: model.layers.6.feed_forward.w2, in=14336, out=4096 +linear: model.layers.6.feed_forward.w3, in=4096, out=14336 +linear: model.layers.7.attention.wq, in=4096, out=4096 +linear: model.layers.7.attention.wk, in=4096, out=1024 +linear: model.layers.7.attention.wv, in=4096, out=1024 +linear: model.layers.7.attention.wo, in=4096, out=4096 +linear: model.layers.7.feed_forward.w1, in=4096, out=14336 +linear: model.layers.7.feed_forward.w2, in=14336, out=4096 +linear: model.layers.7.feed_forward.w3, in=4096, out=14336 +linear: model.layers.8.attention.wq, in=4096, out=4096 +linear: model.layers.8.attention.wk, in=4096, out=1024 +linear: model.layers.8.attention.wv, in=4096, out=1024 +linear: model.layers.8.attention.wo, in=4096, out=4096 +linear: model.layers.8.feed_forward.w1, in=4096, out=14336 +linear: model.layers.8.feed_forward.w2, in=14336, out=4096 +linear: model.layers.8.feed_forward.w3, in=4096, out=14336 +linear: model.layers.9.attention.wq, in=4096, out=4096 +linear: model.layers.9.attention.wk, in=4096, out=1024 +linear: model.layers.9.attention.wv, in=4096, out=1024 +linear: model.layers.9.attention.wo, in=4096, out=4096 +linear: model.layers.9.feed_forward.w1, in=4096, out=14336 +linear: model.layers.9.feed_forward.w2, in=14336, out=4096 +linear: model.layers.9.feed_forward.w3, in=4096, out=14336 +linear: model.layers.10.attention.wq, in=4096, out=4096 +linear: model.layers.10.attention.wk, in=4096, out=1024 +linear: model.layers.10.attention.wv, in=4096, out=1024 +linear: model.layers.10.attention.wo, in=4096, out=4096 +linear: model.layers.10.feed_forward.w1, in=4096, out=14336 +linear: model.layers.10.feed_forward.w2, in=14336, out=4096 +linear: model.layers.10.feed_forward.w3, in=4096, out=14336 +linear: model.layers.11.attention.wq, in=4096, out=4096 +linear: model.layers.11.attention.wk, in=4096, out=1024 +linear: model.layers.11.attention.wv, in=4096, out=1024 +linear: model.layers.11.attention.wo, in=4096, out=4096 +linear: model.layers.11.feed_forward.w1, in=4096, out=14336 +linear: model.layers.11.feed_forward.w2, in=14336, out=4096 +linear: model.layers.11.feed_forward.w3, in=4096, out=14336 +linear: model.layers.12.attention.wq, in=4096, out=4096 +linear: model.layers.12.attention.wk, in=4096, out=1024 +linear: model.layers.12.attention.wv, in=4096, out=1024 +linear: model.layers.12.attention.wo, in=4096, out=4096 +linear: model.layers.12.feed_forward.w1, in=4096, out=14336 +linear: model.layers.12.feed_forward.w2, in=14336, out=4096 +linear: model.layers.12.feed_forward.w3, in=4096, out=14336 +linear: model.layers.13.attention.wq, in=4096, out=4096 +linear: model.layers.13.attention.wk, in=4096, out=1024 +linear: model.layers.13.attention.wv, in=4096, out=1024 +linear: model.layers.13.attention.wo, in=4096, out=4096 +linear: model.layers.13.feed_forward.w1, in=4096, out=14336 +linear: model.layers.13.feed_forward.w2, in=14336, out=4096 +linear: 
model.layers.13.feed_forward.w3, in=4096, out=14336 +linear: model.layers.14.attention.wq, in=4096, out=4096 +linear: model.layers.14.attention.wk, in=4096, out=1024 +linear: model.layers.14.attention.wv, in=4096, out=1024 +linear: model.layers.14.attention.wo, in=4096, out=4096 +linear: model.layers.14.feed_forward.w1, in=4096, out=14336 +linear: model.layers.14.feed_forward.w2, in=14336, out=4096 +linear: model.layers.14.feed_forward.w3, in=4096, out=14336 +linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, 
in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: 
model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +W1218 21:23:53.576256 2238824 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 21:23:53.576678 2238824 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 21:23:53.576880 2238824 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 21:23:53.577066 2238824 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +W1218 21:25:20.233896 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:20.239318 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:20.240385 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:20.883318 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:20.953763 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:20.955421 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:20.977681 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:21.053072 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:21.092159 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:21.093502 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:21.271579 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:21.344665 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:21.346022 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:21.364932 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] 
aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback
[... the same warning is repeated many more times, with timestamps running from 21:25:21 through 21:25:33 ...]
+W1218 21:25:33.315602 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim
implementation, using proxy executor as fallback +W1218 21:25:33.540070 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:33.628262 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:33.629751 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:33.652234 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +W1218 21:25:33.710010 2238824 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_1(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:738:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 738 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_6(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:1274:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1274 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_10(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:1780:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1780 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_15(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:2292:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2292 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void 
cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_19(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:2792:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2792 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_24(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:3304:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3304 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_28(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:3804:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3804 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_33(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:4316:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4316 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:4816:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4816 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:5328:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5328 | float 
tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_46(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:5828:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5828 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_51(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:6340:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6340 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_55(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:6840:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6840 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_60(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:7352:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7352 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_64(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:7852:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7852 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_69(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:8364:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8364 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_73(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:8864:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8864 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_78(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:9376:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9376 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:9876:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9876 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:10388:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10388 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_91(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:10888:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10888 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_96(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const 
bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:11400:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11400 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_100(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:11900:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11900 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_105(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:12412:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12412 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_109(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:12912:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12912 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_114(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:13424:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13424 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_118(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:13924:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13924 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void 
cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_123(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:14436:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14436 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:14936:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14936 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:15448:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15448 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_136(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:15948:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15948 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_141(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:16460:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16460 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... 
+Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 51.89 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model34.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model34.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model34.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 21:27:17.084127 2263993 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 21:27:17.084672 2263993 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 21:27:17.084886 2263993 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 21:27:17.085091 2263993 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +[E1218 21:27:17.839127290 shim_common.cpp:1177] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. +[E1218 21:27:17.839186881 shim_common.cpp:1177] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. +[E1218 21:27:17.839195423 shim_common.cpp:1177] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. 
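This generate run fails: the int4 packed-weight op was lowered through the proxy-executor fallback at export time (the repeated "missing a c-shim implementation" warnings above), and the bare /tmp/model34.so produced with --output-dso-path apparently does not ship the proxy-executor JSON that the AOTI runtime is looking for next to the shared object. Both the export-side warning ("Please use --output-aoti-package-path to create a .pt2 artifact instead") and the load-time error text point at packaging into a PT2 artifact. A possible retry, not exercised in this log, would be to export a .pt2 and load that on the generate side; note that only --output-aoti-package-path is confirmed by the warning above, while the --aoti-package-path flag on generate is an assumption:

python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model34.pt2
OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model34.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3  # --aoti-package-path assumed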
+[E1218 21:27:17.840321301 shim_common.cpp:246] Exception in aoti_torch: Cannot access data pointer of Tensor that doesn't have storage +Exception raised from throw_data_ptr_access_error at /pytorch/c10/core/TensorImpl.cpp:309 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x7fd20c4cc788 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, char const*) + 0x6a (0x7fd20c475fbc in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #2: c10::TensorImpl::throw_data_ptr_access_error() const + 0x34 (0x7fd20c4a4f64 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #3: aoti_torch_get_data_ptr + 0xd0 (0x7fd1fbc970e0 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #4: torch::aot_inductor::AOTInductorModel::run_impl(AtenTensorOpaque**, AtenTensorOpaque**, void*, AOTIProxyExecutorOpaque*) + 0x46ad (0x7fd0b96aebad in /tmp/model34.so) +frame #5: torch::aot_inductor::AOTInductorModelContainer::run(AtenTensorOpaque**, AtenTensorOpaque**, void*, AOTIProxyExecutorOpaque*) + 0xe1 (0x7fd0b9707281 in /tmp/model34.so) +frame #6: AOTInductorModelContainerRun + 0x6d (0x7fd0b96e1acd in /tmp/model34.so) +frame #7: torch::inductor::AOTIModelContainerRunner::run(std::vector > const&, void*) + 0x104 (0x7fd1fbc88c14 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #8: torch::inductor::AOTIModelContainerRunnerCpu::run(std::vector > const&, void*) + 0xa (0x7fd1fbc8945a in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #9: + 0x7f2026 (0x7fd20b5f2026 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_python.so) +frame #10: + 0x37fe0e (0x7fd20b17fe0e in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_python.so) + +frame #51: + 0x295d0 (0x7fd20d8295d0 in /lib64/libc.so.6) +frame #52: __libc_start_main + 0x80 (0x7fd20d829680 in /lib64/libc.so.6) + +Error: aoti_torch_get_data_ptr(handle_.get(), &result) API call failed at /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/include/torch/csrc/inductor/aoti_runtime/utils.h, line 117 +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... 
+Time to load model: 0.52 seconds +----------------------------------------------------------- +Traceback (most recent call last): + File "/home/jackkhuu/oss/torchchat/torchchat.py", line 96, in + generate_main(args) + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 1247, in main + for _ in gen.chat(generator_args): + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 1116, in chat + for token_tensor, metrics in generator_func: + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 36, in generator_context + response = gen.send(None) + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 647, in generate + next_token = self.prefill( + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 398, in prefill + logits = model(x_sliced, ip_sliced) # (x[:, i], input_pos[i])da + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/_export/__init__.py", line 387, in optimized + flat_outputs = runner.run(flat_inputs) # type: ignore[attr-defined] +RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles.size(), output_handles.data(), output_handles.size(), reinterpret_cast(stream_handle), proxy_executor_handle_) API call failed at /pytorch/torch/csrc/inductor/aoti_runner/model_container_runner.cpp, line 107 diff --git a/llama31-1218/cpu_aoti_8.txt b/llama31-1218/cpu_aoti_8.txt new file mode 100644 index 000000000..89d5b8893 --- /dev/null +++ b/llama31-1218/cpu_aoti_8.txt @@ -0,0 +1,232 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model8.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model8.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model8.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 21:03:43.869282 1979201 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 21:03:43.869786 1979201 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 21:03:43.870040 1979201 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 21:03:43.870216 1979201 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
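Aside: the deprecation warning above and the proxy-executor failure in the int4 DSO run both point at packaging into a .pt2 artifact instead of a bare .so. A minimal sketch of that flow, assuming the torch.export / AOTInductor entry points named in the warning (model, example_inputs, and the output path below are placeholders, not taken from torchchat):

    import torch

    # Export the eager model to an ExportedProgram; example_inputs is a tuple of sample args.
    ep = torch.export.export(model, example_inputs)

    # Compile with AOTInductor and package everything into a single .pt2 artifact,
    # so the metadata needed to run custom ops ships alongside the compiled code
    # (this is what the "Unable to find a proxy executor" message suggests).
    pt2_path = torch._inductor.aoti_compile_and_package(ep, package_path="/tmp/model.pt2")

    # Load the package and call it like a regular module.
    runner = torch._inductor.aoti_load_package(pt2_path)
    out = runner(*example_inputs)

In torchchat terms this appears to correspond to exporting with --output-aoti-package-path and generating with --aoti-package-path (see the cpu_aoti_pt2_4.txt run below), which is the route the error message recommends for the int4 custom ops.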
+/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const 
int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, 
bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:15950:31: warning: variable 
‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 33.35 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model8.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model8.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model8.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 21:07:13.590931 2000314 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 21:07:13.591434 2000314 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 21:07:13.591635 2000314 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 21:07:13.591855 2000314 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.49 seconds +----------------------------------------------------------- +Once upon a time, there was a little girl who lived in a quaint village surrounded by rolling hills and green forests. She had a big dream – to become a great artist and paint the world with vibrant colors. +The little girl spent every spare moment practicing drawing and painting. She watched the way the sunlight danced through the leaves of the trees, and the way the colors of the sunset transformed the sky. +One day, she decided to take her passion to the next level and enter a local art competition. 
She poured her heart and soul into her painting, using every skill she had learned to create a beautiful piece of artwork. +When the day of the competition arrived, the little girl was nervous but excited. She set up her painting carefully and waited for the judges to make their way around the exhibit. +The judges were a panel of esteemed artists and art critics who had seen countless works of art in their careers. But when they saw the little girl’s painting, they were shocked. +How could such a young girl have created such a stunning piece of artwork? The painting was alive with color and energy, just like the little girl had described. +The judges asked the little girl to explain her inspiration and technique. She talked about the way the light reflected off the leaves, and the way the colors blended together +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 337.0732 sec total +Time to first token: 8.9997 sec with sequential prefill. + + Total throughput: 0.7595 tokens/sec, 1.3167 s/token +First token throughput: 0.1111 tokens/sec, 8.9997 s/token + Next token throughput: 0.7773 tokens/sec, 1.2866 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a small village nestled in the rolling hills of Tuscany, there lived a young girl named Sophia. Sophia was a curious and adventurous child, with a heart full of love for the natural world. She spent her days exploring the fields and forests surrounding her village, collecting wildflowers and learning about the secrets of the earth from her wise and gentle grandmother. +One day, while wandering through the woods, Sophia stumbled upon a hidden glade. In the center of the clearing stood an ancient tree, its gnarled branches twisted and tangled in a way that seemed almost magical. Sophia felt drawn to the tree, as if it was calling to her. +As she approached the tree, Sophia noticed a small door carved into its trunk. The door was slightly ajar, and a faint light was emanating from within. Sophia's curiosity was piqued, and she pushed the door open, revealing a narrow staircase that descended deep into the earth. +Without hesitation, Sophia began to climb the stairs, her heart pounding with excitement. At the bottom of the stairs, she found herself in a cozy, candlelit room filled with rows of ancient bookshelves and strange, glowing orbs. A warm, golden light filled the space, and Sophia felt a sense of peace and belonging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 250.9440 sec total +Time to first token: 5.1558 sec with sequential prefill. + + Total throughput: 1.0201 tokens/sec, 0.9803 s/token +First token throughput: 0.1940 tokens/sec, 5.1558 s/token + Next token throughput: 1.0375 tokens/sec, 0.9639 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, the only way to learn a new language was to enroll in a language course or rely on textbooks and language learning apps. However, with the advent of the internet and technology, numerous language learning platforms have emerged, offering a variety of methods and tools to improve your language skills. +In this article, we'll explore the benefits and drawbacks of these platforms, discuss the different types of platforms, and provide some recommendations for language learners. 
+Benefits of online language learning platforms: +1. **Convenience**: Online language learning platforms are accessible from anywhere with an internet connection, making it easy to learn a language at any time and from any location. +2. **Flexibility**: Many platforms offer flexible scheduling and lesson options, allowing learners to choose the pace and style that suits them best. +3. **Cost-effective**: Online language learning platforms can be more affordable than traditional language courses, and some platforms even offer free resources and materials. +4. **Interactive**: Online platforms often incorporate interactive features, such as games, quizzes, and exercises, which can make language learning more engaging and enjoyable. +5. **Access to native speakers**: Some platforms connect learners with native speakers, either through video chats or language exchange programs, which can help learners improve their speaking skills and get feedback on their pronunciation. + +Draw +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 345.1208 sec total +Time to first token: 7.9283 sec with sequential prefill. + + Total throughput: 0.7418 tokens/sec, 1.3481 s/token +First token throughput: 0.1261 tokens/sec, 7.9283 s/token + Next token throughput: 0.7562 tokens/sec, 1.3223 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.84 +Average tokens/sec (first token): 0.14 +Average tokens/sec (next tokens): 0.86 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_aoti_b16.txt b/llama31-1218/cpu_aoti_b16.txt new file mode 100644 index 000000000..fcdd01525 --- /dev/null +++ b/llama31-1218/cpu_aoti_b16.txt @@ -0,0 +1,228 @@ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model16.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model16.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-dso-path /tmp/model16.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:48:48.914671 1745987 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:48:48.915196 1745987 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:48:48.915405 1745987 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:48:48.915583 1745987 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
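Aside on the per-sample throughput summaries printed in these logs (e.g. the first int8 sample above: 255 generated tokens, 337.0732 s total, 8.9997 s to first token): the reported figures are consistent with the decomposition sketched below. This is inferred from the numbers, not taken from torchchat's code; in particular counting the prefill/first token in the total (the +1) is an assumption.

    # Figures from "Time for inference 1" of the int8 run above.
    total_s, ttft_s, new_tokens = 337.0732, 8.9997, 255

    total_tps = (new_tokens + 1) / total_s        # ~0.7595 tokens/sec
    first_tps = 1.0 / ttft_s                      # ~0.1111 tokens/sec
    next_tps  = new_tokens / (total_s - ttft_s)   # ~0.7773 tokens/sec

These match the reported 0.7595 / 0.1111 / 0.7773 tokens/sec for that sample, and the same decomposition reproduces the other samples' figures.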
+/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const 
int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, 
bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:15950:31: warning: variable 
‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model16.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model16.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model16.so --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:52:03.983966 1768041 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:52:03.984513 1768041 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:52:03.984724 1768041 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:52:03.984975 1768041 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.52 seconds +----------------------------------------------------------- +Once upon a time, in a small village, there lived a young man named Kaito. Kaito was an extraordinary individual, blessed with a unique gift – he could see and communicate with the spirits of the dead. This gift allowed him to connect with the souls of those who had passed away, offering them peace and closure as they transitioned to the afterlife. + +As Kaito's abilities developed, he began to realize the weight of his responsibility. With great power comes great burden, and Kaito felt the crushing pressure of carrying the secrets and stories of the dead. 
The villagers, unaware of Kaito's gift, would often seek his help in times of crisis, and he found solace in being able to provide solace to those in need. + +One fateful evening, a mysterious woman appeared in the village. Her presence was shrouded in an aura of sorrow, and Kaito sensed that she was not like the others. She introduced herself as Akane, a spirit guide who had been tasked with searching for a specific soul. Akane had been tracking the spirit of a young girl who had died under tragic circumstances, and she believed that Kaito possessed the ability to help her locate the girl's spirit. + +Kaito, intrigued by Ak +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 315.7137 sec total +Time to first token: 10.6364 sec with sequential prefill. + + Total throughput: 0.8109 tokens/sec, 1.2333 s/token +First token throughput: 0.0940 tokens/sec, 10.6364 s/token + Next token throughput: 0.8359 tokens/sec, 1.1964 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a far-off place, there lived a large group of animals who were all different in shape, size, and color. They lived together in a beautiful forest called Forestville. +In Forestville, there was a big, tall tree that stood out among all the other trees. It was the tallest tree in the forest, and its trunk was thick and wide. The animals called this tree "Tree-mendous." +Tree-mendous was a very proud tree. He loved to stand tall and feel the wind blowing through his leaves. He thought he was the best tree in the forest, and he looked down on all the other trees. +One day, a wise old owl named Professor Hootenanny flew into Forestville. Professor Hootenanny was known for being very wise and kind. He looked at Tree-mendous with his big, round eyes and said, "Tree-mendous, you are a wonderful tree, but have you ever noticed that the other trees in the forest are special too?" +Tree-mendous looked at the other trees and saw that they were all different. There were trees with thin trunks, trees with wide branches, and trees with leaves that shimmered in the sunlight. He thought to himself, "I'm not +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 146.4761 sec total +Time to first token: 3.1811 sec with sequential prefill. + + Total throughput: 1.7477 tokens/sec, 0.5722 s/token +First token throughput: 0.3144 tokens/sec, 3.1811 s/token + Next token throughput: 1.7795 tokens/sec, 0.5619 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, in a land far, far away, there was a beautiful island called Zanzibar. The island had lush green forests, stunning beaches, and crystal-clear waters lapped gently against the shore. It was a paradise on earth. +On this enchanted island, there lived a young girl named Layla. Layla lived in a small village surrounded by tall palm trees and buzzing insects. She loved exploring the island, discovering hidden coves and secret waterfalls. She was fascinated by the sea and spent most of her days swimming, snorkeling, or simply watching the waves. +One day, while Layla was out on a swim, she noticed a peculiar object floating towards her. As she got closer, she realized it was a beautiful wooden trunk, adorned with intricate carvings. 
The trunk was made of a type of wood she had never seen before – a deep, rich brown that seemed almost otherworldly. +Layla was intrigued and decided to bring the trunk to the shore. As she lifted it out of the water, she felt an unusual energy emanating from it. She couldn't explain it, but she sensed that the trunk was special – that it might hold secrets or treasures beyond her wildest dreams. +Back in the village, Layla showed the trunk to +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 189.9574 sec total +Time to first token: 3.5927 sec with sequential prefill. + + Total throughput: 1.3477 tokens/sec, 0.7420 s/token +First token throughput: 0.2783 tokens/sec, 3.5927 s/token + Next token throughput: 1.3683 tokens/sec, 0.7308 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 1.30 +Average tokens/sec (first token): 0.23 +Average tokens/sec (next tokens): 1.33 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_aoti_pt2_4.txt b/llama31-1218/cpu_aoti_pt2_4.txt new file mode 100644 index 000000000..2c62cf059 --- /dev/null +++ b/llama31-1218/cpu_aoti_pt2_4.txt @@ -0,0 +1,675 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model34.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model34.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model34.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. 
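The per-layer shapes dumped below follow directly from the published Llama 3.1 8B configuration (hidden size 4096, 32 query heads, 8 key/value heads of dimension 128 for grouped-query attention, feed-forward hidden size 14336, vocabulary 128256); none of these numbers is specific to this run. A minimal sketch, assuming those standard config values rather than anything read from the checkpoint, of where each in/out pair comes from:

# Standard Llama 3.1 8B config values (assumed here, not read from the checkpoint).
dim = 4096                     # model/hidden dimension
n_heads = 32                   # query heads
n_kv_heads = 8                 # key/value heads (grouped-query attention)
head_dim = dim // n_heads      # 128
ffn_hidden = 14336             # feed-forward inner dimension
vocab_size = 128256

shapes = {
    "attention.wq":    (dim, n_heads * head_dim),     # 4096 -> 4096
    "attention.wk":    (dim, n_kv_heads * head_dim),  # 4096 -> 1024
    "attention.wv":    (dim, n_kv_heads * head_dim),  # 4096 -> 1024
    "attention.wo":    (n_heads * head_dim, dim),     # 4096 -> 4096
    "feed_forward.w1": (dim, ffn_hidden),             # 4096 -> 14336
    "feed_forward.w2": (ffn_hidden, dim),             # 14336 -> 4096
    "feed_forward.w3": (dim, ffn_hidden),             # 4096 -> 14336
    "output":          (dim, vocab_size),             # 4096 -> 128256
}
for name, (n_in, n_out) in shapes.items():
    print(f"linear: {name}, in={n_in}, out={n_out}")

With the "linear:int4" groupsize-256 setting, each of these weight matrices is stored as 4-bit values with a per-group scale/zero-point for every 256 weights, which is why the compile step later routes every linear through aten._weight_int4pack_mm_for_cpu (see the c-shim warnings further down).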
+linear: model.layers.0.attention.wq, in=4096, out=4096 +linear: model.layers.0.attention.wk, in=4096, out=1024 +linear: model.layers.0.attention.wv, in=4096, out=1024 +linear: model.layers.0.attention.wo, in=4096, out=4096 +linear: model.layers.0.feed_forward.w1, in=4096, out=14336 +linear: model.layers.0.feed_forward.w2, in=14336, out=4096 +linear: model.layers.0.feed_forward.w3, in=4096, out=14336 +linear: model.layers.1.attention.wq, in=4096, out=4096 +linear: model.layers.1.attention.wk, in=4096, out=1024 +linear: model.layers.1.attention.wv, in=4096, out=1024 +linear: model.layers.1.attention.wo, in=4096, out=4096 +linear: model.layers.1.feed_forward.w1, in=4096, out=14336 +linear: model.layers.1.feed_forward.w2, in=14336, out=4096 +linear: model.layers.1.feed_forward.w3, in=4096, out=14336 +linear: model.layers.2.attention.wq, in=4096, out=4096 +linear: model.layers.2.attention.wk, in=4096, out=1024 +linear: model.layers.2.attention.wv, in=4096, out=1024 +linear: model.layers.2.attention.wo, in=4096, out=4096 +linear: model.layers.2.feed_forward.w1, in=4096, out=14336 +linear: model.layers.2.feed_forward.w2, in=14336, out=4096 +linear: model.layers.2.feed_forward.w3, in=4096, out=14336 +linear: model.layers.3.attention.wq, in=4096, out=4096 +linear: model.layers.3.attention.wk, in=4096, out=1024 +linear: model.layers.3.attention.wv, in=4096, out=1024 +linear: model.layers.3.attention.wo, in=4096, out=4096 +linear: model.layers.3.feed_forward.w1, in=4096, out=14336 +linear: model.layers.3.feed_forward.w2, in=14336, out=4096 +linear: model.layers.3.feed_forward.w3, in=4096, out=14336 +linear: model.layers.4.attention.wq, in=4096, out=4096 +linear: model.layers.4.attention.wk, in=4096, out=1024 +linear: model.layers.4.attention.wv, in=4096, out=1024 +linear: model.layers.4.attention.wo, in=4096, out=4096 +linear: model.layers.4.feed_forward.w1, in=4096, out=14336 +linear: model.layers.4.feed_forward.w2, in=14336, out=4096 +linear: model.layers.4.feed_forward.w3, in=4096, out=14336 +linear: model.layers.5.attention.wq, in=4096, out=4096 +linear: model.layers.5.attention.wk, in=4096, out=1024 +linear: model.layers.5.attention.wv, in=4096, out=1024 +linear: model.layers.5.attention.wo, in=4096, out=4096 +linear: model.layers.5.feed_forward.w1, in=4096, out=14336 +linear: model.layers.5.feed_forward.w2, in=14336, out=4096 +linear: model.layers.5.feed_forward.w3, in=4096, out=14336 +linear: model.layers.6.attention.wq, in=4096, out=4096 +linear: model.layers.6.attention.wk, in=4096, out=1024 +linear: model.layers.6.attention.wv, in=4096, out=1024 +linear: model.layers.6.attention.wo, in=4096, out=4096 +linear: model.layers.6.feed_forward.w1, in=4096, out=14336 +linear: model.layers.6.feed_forward.w2, in=14336, out=4096 +linear: model.layers.6.feed_forward.w3, in=4096, out=14336 +linear: model.layers.7.attention.wq, in=4096, out=4096 +linear: model.layers.7.attention.wk, in=4096, out=1024 +linear: model.layers.7.attention.wv, in=4096, out=1024 +linear: model.layers.7.attention.wo, in=4096, out=4096 +linear: model.layers.7.feed_forward.w1, in=4096, out=14336 +linear: model.layers.7.feed_forward.w2, in=14336, out=4096 +linear: model.layers.7.feed_forward.w3, in=4096, out=14336 +linear: model.layers.8.attention.wq, in=4096, out=4096 +linear: model.layers.8.attention.wk, in=4096, out=1024 +linear: model.layers.8.attention.wv, in=4096, out=1024 +linear: model.layers.8.attention.wo, in=4096, out=4096 +linear: model.layers.8.feed_forward.w1, in=4096, out=14336 +linear: 
model.layers.8.feed_forward.w2, in=14336, out=4096 +linear: model.layers.8.feed_forward.w3, in=4096, out=14336 +linear: model.layers.9.attention.wq, in=4096, out=4096 +linear: model.layers.9.attention.wk, in=4096, out=1024 +linear: model.layers.9.attention.wv, in=4096, out=1024 +linear: model.layers.9.attention.wo, in=4096, out=4096 +linear: model.layers.9.feed_forward.w1, in=4096, out=14336 +linear: model.layers.9.feed_forward.w2, in=14336, out=4096 +linear: model.layers.9.feed_forward.w3, in=4096, out=14336 +linear: model.layers.10.attention.wq, in=4096, out=4096 +linear: model.layers.10.attention.wk, in=4096, out=1024 +linear: model.layers.10.attention.wv, in=4096, out=1024 +linear: model.layers.10.attention.wo, in=4096, out=4096 +linear: model.layers.10.feed_forward.w1, in=4096, out=14336 +linear: model.layers.10.feed_forward.w2, in=14336, out=4096 +linear: model.layers.10.feed_forward.w3, in=4096, out=14336 +linear: model.layers.11.attention.wq, in=4096, out=4096 +linear: model.layers.11.attention.wk, in=4096, out=1024 +linear: model.layers.11.attention.wv, in=4096, out=1024 +linear: model.layers.11.attention.wo, in=4096, out=4096 +linear: model.layers.11.feed_forward.w1, in=4096, out=14336 +linear: model.layers.11.feed_forward.w2, in=14336, out=4096 +linear: model.layers.11.feed_forward.w3, in=4096, out=14336 +linear: model.layers.12.attention.wq, in=4096, out=4096 +linear: model.layers.12.attention.wk, in=4096, out=1024 +linear: model.layers.12.attention.wv, in=4096, out=1024 +linear: model.layers.12.attention.wo, in=4096, out=4096 +linear: model.layers.12.feed_forward.w1, in=4096, out=14336 +linear: model.layers.12.feed_forward.w2, in=14336, out=4096 +linear: model.layers.12.feed_forward.w3, in=4096, out=14336 +linear: model.layers.13.attention.wq, in=4096, out=4096 +linear: model.layers.13.attention.wk, in=4096, out=1024 +linear: model.layers.13.attention.wv, in=4096, out=1024 +linear: model.layers.13.attention.wo, in=4096, out=4096 +linear: model.layers.13.feed_forward.w1, in=4096, out=14336 +linear: model.layers.13.feed_forward.w2, in=14336, out=4096 +linear: model.layers.13.feed_forward.w3, in=4096, out=14336 +linear: model.layers.14.attention.wq, in=4096, out=4096 +linear: model.layers.14.attention.wk, in=4096, out=1024 +linear: model.layers.14.attention.wv, in=4096, out=1024 +linear: model.layers.14.attention.wo, in=4096, out=4096 +linear: model.layers.14.feed_forward.w1, in=4096, out=14336 +linear: model.layers.14.feed_forward.w2, in=14336, out=4096 +linear: model.layers.14.feed_forward.w3, in=4096, out=14336 +linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, 
out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: 
model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +W1218 22:11:23.372608 2802828 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 22:11:23.373078 2802828 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 22:11:23.373262 2802828 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 22:11:23.373430 2802828 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
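For reference, the non-deprecated flow that the warning above recommends looks roughly like the sketch below. This is a minimal standalone example with a toy module; TinyModel and the /tmp/tiny_model.pt2 path are made up for illustration, and this is not torchchat's actual export code.

import torch
import torch._inductor

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 4)

    def forward(self, x):
        return self.linear(x)

example_inputs = (torch.randn(2, 16),)

# Export to an ExportedProgram, then AOT-compile it into a single .pt2
# package (replaces the deprecated torch._export.aot_compile()).
ep = torch.export.export(TinyModel(), example_inputs)
pt2_path = torch._inductor.aoti_compile_and_package(
    ep, package_path="/tmp/tiny_model.pt2"
)

# Load the packaged artifact and run it (replaces torch._export.aot_load()).
runner = torch._inductor.aoti_load_package(pt2_path)
print(runner(*example_inputs))

The .pt2 package is self-contained, so aoti_load_package can be called later in a fresh process; the sketch only illustrates the API named in the warning.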
+W1218 22:12:46.461726 2802828 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as fallback
+[... the same c-shim fallback warning is repeated many more times between 22:12:46 and 22:13:00; the duplicate occurrences are omitted here ...]
+W1218 22:13:00.127331 2802828 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm_for_cpu.default is missing a c-shim implementation, using proxy executor as
fallback +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_1(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:738:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 738 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_6(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:1274:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1274 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_10(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:1780:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1780 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_15(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:2292:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2292 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_19(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:2792:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2792 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_24(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:3304:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3304 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_28(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:3804:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3804 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_33(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:4316:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4316 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:4816:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4816 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void 
cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:5328:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5328 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_46(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:5828:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5828 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_51(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:6340:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6340 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_55(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:6840:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6840 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_60(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:7352:31: warning: variable ‘tmp_acc0_arr’ set but not used 
[-Wunused-but-set-variable] + 7352 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_64(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:7852:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7852 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_69(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:8364:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8364 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_73(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:8864:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8864 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_78(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:9376:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9376 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, 
float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:9876:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9876 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:10388:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10388 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_91(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:10888:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10888 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_96(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:11400:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11400 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_100(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:11900:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11900 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function 
‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_105(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:12412:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12412 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_109(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:12912:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12912 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_114(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:13424:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13424 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_118(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:13924:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13924 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_123(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:14436:31: warning: variable ‘tmp_acc0_arr’ 
set but not used [-Wunused-but-set-variable] +14436 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:14936:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14936 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:15448:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15448 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_136(const bfloat16*, const int32_t*, const bfloat16*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:15948:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15948 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp: In function ‘void cpp_fused__safe_softmax__weight_int4pack_mm_for_cpu_add_bmm_index_index_put_scalar_tensor_stack_where_141(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/c5o2v2ioswxwskfczjnkmp3pd2krfij5mcm77ewxglwjn3bvjwlp/cehyjtgmju2nn6npfywx2xkxsjwkj6rb45gv6xespkiyuceqg6xn.cpp:16460:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16460 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... 
+Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 44.51 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model34.pt2 +The generated packaged model can be found at: /tmp/model34.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model34.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.50 seconds +----------------------------------------------------------- +Once upon a time, deep in a dense forest there lived a group of animals who were known as the Forest Friends. They were a diverse group of animals, each with their unique skills and abilities. There was Jack, the brave and strong bear who loved to climb trees and swim in the river. There was also Lily, the gentle and kind rabbit who loved to pick berries and flowers for all to enjoy. Next was Sammy, the quick-witted and clever squirrel who loved to climb high up in the trees and whisper messages to the other animals using a complex system of chirps and squeaks. Last but not least, there was Benny, the loyal and honest beaver who loved to build and repair the homes of the other animals. Among all these animals, there was one special animal named Max, the brave and adventurous tiger who loved to explore and discover new things. + +One day, a big and scary storm rolled into the forest. The wind was howling, the thunder was booming, and the rain was pouring down. The animals of the Forest Friends were all scared and unsure of what to do. Jack was worried about the river rising, Lily was worried about the storm damaging her beautiful garden, Sammy was worried about the branches breaking, and Benny was worried about the homes being destroyed. + +Max +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 290.2572 sec total +Time to first token: 9.0272 sec with sequential prefill. + + Total throughput: 0.8820 tokens/sec, 1.1338 s/token +First token throughput: 0.1108 tokens/sec, 9.0272 s/token + Next token throughput: 0.9067 tokens/sec, 1.1029 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in the tiny village of Vakkalapalle, there lived an 80-year-old woman named Kumari. Kumari was a cheerful and energetic lady, who lived a life full of independence and joy. She was always willing to take on whatever task that came her way, and no one could ever convince her that she was too old for anything. She got up early every morning, took a dip in the nearby pond, and then began her day’s activities with a twinkle in her mind. 
+ +Kumari’s days revolved around her three main priorities: earning money, cooking food, and interacting with people. She worked as a laborer for a local farmer, cutting grass and gathering firewood. She also made an effort to cook delicious meals for her neighbors, especially the children, and her evenings were filled with playing badminton or cards with the young folk or going out on walks with her friends. + +Kumari’s most precious value in life was the ability to earn money. She believed that money was the key to independence. Moreover, for her, independence meant the ability to make her own choices, travel, choose her own food, wear her favorite clothes, and live life on her own terms. + +As soon as her husband passed away, Kum +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 425.3548 sec total +Time to first token: 9.2799 sec with sequential prefill. + + Total throughput: 0.6019 tokens/sec, 1.6615 s/token +First token throughput: 0.1078 tokens/sec, 9.2799 s/token + Next token throughput: 0.6129 tokens/sec, 1.6317 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, not too long ago, in a small village nestled in a beautiful valley, there lived a young girl named Sophie. Sophie was a curious and adventurous child, always eager to explore the world around her. She loved nothing more than to climb trees, chase after butterflies, and play in the nearby stream that ran through the heart of the village. + +One day, while out on one of her many adventures, Sophie stumbled upon an old, mysterious-looking wooden box hidden away in a thicket of bushes near the stream. The box was old and worn, with intricate carvings of leaves and vines etched into its surface. It looked as though it had been buried beneath the earth for many years, and Sophie felt a thrill of excitement as she carefully opened the lid. + +As she lifted the lid, a faint misty mist emerged from within, carrying with it the scent of damp earth and moss. The air was filled with an otherworldly energy, and Sophie felt a sudden sense of calm wash over her. It was as though she had stumbled into a hidden world within the heart of the earth itself. + +Deep within the box, Sophie found a small, leather-bound book. The cover was worn and cracked, but the pages within were filled with beautiful, handwritten illustrations of the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 265.8577 sec total +Time to first token: 8.9230 sec with sequential prefill. 
+
+ Total throughput: 0.9629 tokens/sec, 1.0385 s/token
+First token throughput: 0.1121 tokens/sec, 8.9230 s/token
+ Next token throughput: 0.9925 tokens/sec, 1.0076 s/token
+
+Bandwidth achieved: 0.00 GB/s
+
+========================================
+
+
+Warning: Excluding compile in calculations
+ Average tokens/sec (total): 0.82
+Average tokens/sec (first token): 0.11
+Average tokens/sec (next tokens): 0.84
+
+Memory used: 0.00 GB
diff --git a/llama31-1218/cpu_aoti_pt2_8.txt b/llama31-1218/cpu_aoti_pt2_8.txt
new file mode 100644
index 000000000..57a261e59
--- /dev/null
+++ b/llama31-1218/cpu_aoti_pt2_8.txt
@@ -0,0 +1,228 @@
+python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model8.pt2
+OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model8.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model8.pt2
+Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
+NumExpr defaulting to 16 threads.
+PyTorch version 2.6.0.dev20241218+cu124 available.
+W1218 21:51:28.251947 2539546 site-packages/torch/_export/__init__.py:276] +============================+
+W1218 21:51:28.252467 2539546 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! |
+W1218 21:51:28.252670 2539546 site-packages/torch/_export/__init__.py:278] +============================+
+W1218 21:51:28.252855 2539546 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead.
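The deprecation warning above names the packaging-based AOTI flow it wants callers to move to. A minimal sketch of that flow follows, using a stand-in toy module rather than torchchat's own export path; the module, input shapes, and the /tmp package path are illustrative placeholders, and exact keyword arguments of these APIs can vary between nightlies.

import torch
import torch._inductor

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)

    def forward(self, x):
        return torch.relu(self.linear(x))

model = ToyModel().eval()
example_inputs = (torch.randn(2, 16),)

# Capture the model graph with torch.export (the explicit step that the
# deprecated torch._export.aot_compile() used to do internally).
exported = torch.export.export(model, example_inputs)

# Compile ahead of time with Inductor and write a self-contained .pt2 package.
package_path = torch._inductor.aoti_compile_and_package(
    exported, package_path="/tmp/toy_model.pt2"
)

# Load the package back and run it in-process (replaces torch._export.aot_load()).
runner = torch._inductor.aoti_load_package(package_path)
print(runner(*example_inputs).shape)

The /tmp/model8.pt2 written by the torchchat export command above is a package in this same .pt2 format, which is why the generate command can reload it via --aoti-package-path.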
+/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used 
[-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, 
bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const 
bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ 
+/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:15950:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/clwlk6qr5j5pkuourbuvzm7ziqe63ybjruymcu3j2i7zqv4pikg6/cstsuiqkksmkx2ung6x6grqrenfveg57z7enn5z2fxjxw25yycxa.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 47.34 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model8.pt2 +The generated packaged model can be found at: /tmp/model8.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model8.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. 
+Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.39 seconds +----------------------------------------------------------- +Once upon a time, there was a small village called Puddle town. Puddle town was surrounded by beautiful rolling hills and a meandering river, with a gentle river-side path that wound its way through the village, past gardens and cottages, to the foot of the hills. +The villagers were simple folk, living off the land and the river. They grew their own fruits and vegetables, fish in the river, and kept a few animals for meat and milk. The villagers were friendly and welcoming to outsiders, but they were also fiercely protective of their village and their way of life. +One day, a stranger came to Puddle town. The stranger was a traveler, who had been walking for days and was carrying a large pack on his back. He was tired and hungry, and he asked the villagers for food and shelter. +The villagers were wary of the stranger, but they were also kind and generous. They invited him to stay in the village for a while, and offered him food and a place to rest. +As the stranger settled in, the villagers began to notice that he was different from them. He was city-born and bred, and he had a strange, stiff way of talking and walking. He seemed out of place in the simple, rustic surroundings of Puddle town. +But +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 285.2112 sec total +Time to first token: 8.2256 sec with sequential prefill. + + Total throughput: 0.8976 tokens/sec, 1.1141 s/token +First token throughput: 0.1216 tokens/sec, 8.2256 s/token + Next token throughput: 0.9206 tokens/sec, 1.0862 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in the early days of the internet, you had to be extremely careful about the web sites you visited, as they could potentially be sites that contained malware or viruses. Now, it seems like just about everyone has a web site, or a blog, or a social media presence. It's a brave new world. +The reality is that the internet is a lot safer now than it was back in the day, thanks to advances in technology and a greater awareness of online safety. However, that doesn't mean we're completely free from worries. While you're browsing online, your personal data could be compromised, if you're not careful. +Here are some online safety tips to keep your data safe and secure: +1. Use a reputable antivirus software: Your antivirus software is your best defense against malware and viruses. Make sure you're using a reputable brand and keep it up to date. +2. Use strong passwords: This should go without saying, but make sure you're using strong and unique passwords for all online accounts. +3. Use two-factor authentication: Two-factor authentication adds an extra layer of security to your online accounts. 
It requires a second form of verification, such as a code sent to your phone, in addition to your password. +4. Keep your browser and operating system +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 269.5341 sec total +Time to first token: 5.2441 sec with sequential prefill. + + Total throughput: 0.9498 tokens/sec, 1.0529 s/token +First token throughput: 0.1907 tokens/sec, 5.2441 s/token + Next token throughput: 0.9648 tokens/sec, 1.0364 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, there was a very kind and caring girl named Sophia. She lived in a small village surrounded by rolling hills and dense forests. Sophia loved to explore the outdoors and was always eager to help those in need. + +One day, while out on a walk, Sophia came across a small, lost puppy. The puppy was shivering with fear and didn't know where its home was. Sophia immediately scooped up the puppy and cradled it in her arms, trying to comfort it. As she looked into the puppy's big, brown eyes, she knew she had to find its home. + +Sophia set out on a mission to find the puppy's family. She asked the villagers if they knew anyone who was missing a puppy, but no one seemed to recognize the little ball of fluff. Just when Sophia was about to give up, she remembered a wise old woman who lived on the outskirts of the village. The old woman was known for her knowledge of the local animals and their habitats. + +Sophia visited the old woman, who listened carefully as Sophia described the puppy. The old woman nodded thoughtfully and said, "I think I know just the family you might be looking for." She gave Sophia a map with a path leading to a nearby farm. + +Sophia followed the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 324.5938 sec total +Time to first token: 9.1764 sec with sequential prefill. + + Total throughput: 0.7887 tokens/sec, 1.2679 s/token +First token throughput: 0.1090 tokens/sec, 9.1764 s/token + Next token throughput: 0.8085 tokens/sec, 1.2369 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.88 +Average tokens/sec (first token): 0.14 +Average tokens/sec (next tokens): 0.90 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_aoti_pt2_b16.txt b/llama31-1218/cpu_aoti_pt2_b16.txt new file mode 100644 index 000000000..9c9144937 --- /dev/null +++ b/llama31-1218/cpu_aoti_pt2_b16.txt @@ -0,0 +1,222 @@ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model16.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model16.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --output-aoti-package-path /tmp/model16.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 21:27:31.944499 2265306 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 21:27:31.945011 2265306 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! 
| +W1218 21:27:31.945233 2265306 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 21:27:31.945490 2265306 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_2(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:740:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 740 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_7(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:1276:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1276 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_12(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:1782:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 1782 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_17(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:2294:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2294 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_22(const 
bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:2794:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 2794 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_27(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:3306:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3306 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_32(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:3806:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 3806 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_37(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:4318:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4318 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_42(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:4818:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 4818 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void 
cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_47(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:5330:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5330 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_52(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:5830:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 5830 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_57(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:6342:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6342 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_62(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:6842:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 6842 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_67(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:7354:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7354 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ 
+/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_72(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:7854:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 7854 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_77(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:8366:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8366 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_82(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:8866:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 8866 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_87(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:9378:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] + 9378 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_92(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:9878:31: warning: variable ‘tmp_acc0_arr’ set but not used 
[-Wunused-but-set-variable] + 9878 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_97(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:10390:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10390 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_102(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:10890:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +10890 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_107(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:11402:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11402 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_112(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:11902:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +11902 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_117(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: 
+/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:12414:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12414 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_122(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:12914:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +12914 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_127(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:13426:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13426 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_132(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:13926:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +13926 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_137(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:14438:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14438 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_142(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, 
float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:14938:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +14938 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_147(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:15450:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15450 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_152(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:15950:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +15950 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp: In function ‘void cpp_fused__safe_softmax__to_copy_add_bmm_index_index_put_scalar_tensor_stack_where_157(const bfloat16*, const int32_t*, const bfloat16*, const float*, const float*, const bool*, const bfloat16*, float*, float*, bfloat16*, float*, bool*, float*, float*, float*, bfloat16*, float*, bfloat16*)’: +/tmp/torchinductor_jackkhuu/cs36dbtcv6inf3gvez7zkbzuf4duad4rqfhnitwlbpchqxhnjvb2/ckry253sjnd53hcycnhihuv6bdalujdd4qdj2ckpytpqytgu5pcj.cpp:16462:31: warning: variable ‘tmp_acc0_arr’ set but not used [-Wunused-but-set-variable] +16462 | float tmp_acc0_arr[32]; + | ^~~~~~~~~~~~ +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model16.pt2 +The generated packaged model can be found at: /tmp/model16.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model16.pt2 --prompt "Once upon a time," --max-new-tokens 256 --device cpu --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. 
+Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.51 seconds +----------------------------------------------------------- +Once upon a time, in a land far, far away, there was a magical kingdom hidden deep within a dense forest. The kingdom was called Aethoria, and it was a place of wonder and enchantment. +Aethoria was ruled by a wise and just queen named Lyra, who was loved by her people for her kindness, compassion, and unwavering commitment to justice. She was a skilled healer and a master of the ancient magic that flowed through the land. +One day, a great darkness threatened to consume Aethoria. A powerful and evil sorcerer named Malakai had risen to power in a neighboring kingdom, and he sought to conquer and destroy Aethoria. +Queen Lyra knew that she had to act quickly to protect her kingdom. She called upon her most trusted advisors, a group of wise and powerful wizards who had served her for many years. +Together, they set out to gather a small band of brave warriors to aid them in their quest to defeat Malakai and save Aethoria. They traveled far and wide, searching for warriors who possessed the skills and bravery needed to face the darkness that threatened their kingdom. + +As they traveled, they came across a young warrior named Eira. Eira was a skilled fighter, a master of the sword and +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 393.3059 sec total +Time to first token: 14.8847 sec with sequential prefill. + + Total throughput: 0.6509 tokens/sec, 1.5364 s/token +First token throughput: 0.0672 tokens/sec, 14.8847 s/token + Next token throughput: 0.6739 tokens/sec, 1.4840 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a far-off land, there was a magical kingdom filled with rolling hills, sparkling lakes, and dense forests. The kingdom was ruled by a just and fair king, who loved his people and protected them from harm. +One day, a young prince named Leo was born to the king and queen. Leo was a curious and adventurous boy, always eager to explore the world beyond the castle walls. As he grew older, he became fascinated with the stories of a legendary dragon that was said to live in a nearby mountain range. +According to legend, the dragon was a fierce and powerful creature, with scales as black as coal and eyes that glowed like embers. It was said that the dragon had the power to grant wishes to those who treated it with kindness and respect. +Leo was determined to find the dragon and make friends with it. He spent every spare moment studying the ancient maps and texts that held the secrets of the mountain range. He gathered a team of brave and skilled companions, including a wise old wizard, a skilled warrior, and a quick-witted thief. +Together, they set out on a perilous journey to find the dragon. 
They traversed treacherous mountain paths, crossed raging rivers, and braved the dangers of the dark forest. Along the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 388.7405 sec total +Time to first token: 10.3053 sec with sequential prefill. + + Total throughput: 0.6585 tokens/sec, 1.5185 s/token +First token throughput: 0.0970 tokens/sec, 10.3053 s/token + Next token throughput: 0.6738 tokens/sec, 1.4841 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + +Once upon a time, in a land far, far away, there was a little boy named Timmy who lived with his parents in a small village surrounded by beautiful green mountains. Timmy was a curious and adventurous boy who loved to explore the world around him. One day, while wandering through the woods near his village, Timmy stumbled upon a strange and mysterious object buried in the ground. +The object was a small, glowing crystal that seemed to pulsate with an otherworldly energy. Intrigued, Timmy picked up the crystal and held it in his hand, feeling an unexpected surge of power and knowledge flood through his mind. Suddenly, he could hear the whispers of the ancient ones, the spirits of the land, and the secrets of the universe. +Timmy was both amazed and terrified by the crystal's power, but he couldn't resist the urge to explore further. He continued to hold the crystal, and with each passing moment, he gained more knowledge and understanding of the world around him. He learned about the intricate web of life that connected all living beings, and the delicate balance of the natural world. +As Timmy continued to explore the crystal's power, he began to notice strange occurrences happening around him. Plants began to grow faster and stronger, animals seemed to be communicating +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 321.7408 sec total +Time to first token: 6.7838 sec with sequential prefill. + + Total throughput: 0.7957 tokens/sec, 1.2568 s/token +First token throughput: 0.1474 tokens/sec, 6.7838 s/token + Next token throughput: 0.8096 tokens/sec, 1.2351 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 0.70 +Average tokens/sec (first token): 0.10 +Average tokens/sec (next tokens): 0.72 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_compile_4.txt b/llama31-1218/cpu_compile_4.txt new file mode 100644 index 000000000..e9cbd05c2 --- /dev/null +++ b/llama31-1218/cpu_compile_4.txt @@ -0,0 +1,302 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. 
+linear: model.layers.0.attention.wq, in=4096, out=4096 +linear: model.layers.0.attention.wk, in=4096, out=1024 +linear: model.layers.0.attention.wv, in=4096, out=1024 +linear: model.layers.0.attention.wo, in=4096, out=4096 +linear: model.layers.0.feed_forward.w1, in=4096, out=14336 +linear: model.layers.0.feed_forward.w2, in=14336, out=4096 +linear: model.layers.0.feed_forward.w3, in=4096, out=14336 +linear: model.layers.1.attention.wq, in=4096, out=4096 +linear: model.layers.1.attention.wk, in=4096, out=1024 +linear: model.layers.1.attention.wv, in=4096, out=1024 +linear: model.layers.1.attention.wo, in=4096, out=4096 +linear: model.layers.1.feed_forward.w1, in=4096, out=14336 +linear: model.layers.1.feed_forward.w2, in=14336, out=4096 +linear: model.layers.1.feed_forward.w3, in=4096, out=14336 +linear: model.layers.2.attention.wq, in=4096, out=4096 +linear: model.layers.2.attention.wk, in=4096, out=1024 +linear: model.layers.2.attention.wv, in=4096, out=1024 +linear: model.layers.2.attention.wo, in=4096, out=4096 +linear: model.layers.2.feed_forward.w1, in=4096, out=14336 +linear: model.layers.2.feed_forward.w2, in=14336, out=4096 +linear: model.layers.2.feed_forward.w3, in=4096, out=14336 +linear: model.layers.3.attention.wq, in=4096, out=4096 +linear: model.layers.3.attention.wk, in=4096, out=1024 +linear: model.layers.3.attention.wv, in=4096, out=1024 +linear: model.layers.3.attention.wo, in=4096, out=4096 +linear: model.layers.3.feed_forward.w1, in=4096, out=14336 +linear: model.layers.3.feed_forward.w2, in=14336, out=4096 +linear: model.layers.3.feed_forward.w3, in=4096, out=14336 +linear: model.layers.4.attention.wq, in=4096, out=4096 +linear: model.layers.4.attention.wk, in=4096, out=1024 +linear: model.layers.4.attention.wv, in=4096, out=1024 +linear: model.layers.4.attention.wo, in=4096, out=4096 +linear: model.layers.4.feed_forward.w1, in=4096, out=14336 +linear: model.layers.4.feed_forward.w2, in=14336, out=4096 +linear: model.layers.4.feed_forward.w3, in=4096, out=14336 +linear: model.layers.5.attention.wq, in=4096, out=4096 +linear: model.layers.5.attention.wk, in=4096, out=1024 +linear: model.layers.5.attention.wv, in=4096, out=1024 +linear: model.layers.5.attention.wo, in=4096, out=4096 +linear: model.layers.5.feed_forward.w1, in=4096, out=14336 +linear: model.layers.5.feed_forward.w2, in=14336, out=4096 +linear: model.layers.5.feed_forward.w3, in=4096, out=14336 +linear: model.layers.6.attention.wq, in=4096, out=4096 +linear: model.layers.6.attention.wk, in=4096, out=1024 +linear: model.layers.6.attention.wv, in=4096, out=1024 +linear: model.layers.6.attention.wo, in=4096, out=4096 +linear: model.layers.6.feed_forward.w1, in=4096, out=14336 +linear: model.layers.6.feed_forward.w2, in=14336, out=4096 +linear: model.layers.6.feed_forward.w3, in=4096, out=14336 +linear: model.layers.7.attention.wq, in=4096, out=4096 +linear: model.layers.7.attention.wk, in=4096, out=1024 +linear: model.layers.7.attention.wv, in=4096, out=1024 +linear: model.layers.7.attention.wo, in=4096, out=4096 +linear: model.layers.7.feed_forward.w1, in=4096, out=14336 +linear: model.layers.7.feed_forward.w2, in=14336, out=4096 +linear: model.layers.7.feed_forward.w3, in=4096, out=14336 +linear: model.layers.8.attention.wq, in=4096, out=4096 +linear: model.layers.8.attention.wk, in=4096, out=1024 +linear: model.layers.8.attention.wv, in=4096, out=1024 +linear: model.layers.8.attention.wo, in=4096, out=4096 +linear: model.layers.8.feed_forward.w1, in=4096, out=14336 +linear: 
model.layers.8.feed_forward.w2, in=14336, out=4096 +linear: model.layers.8.feed_forward.w3, in=4096, out=14336 +linear: model.layers.9.attention.wq, in=4096, out=4096 +linear: model.layers.9.attention.wk, in=4096, out=1024 +linear: model.layers.9.attention.wv, in=4096, out=1024 +linear: model.layers.9.attention.wo, in=4096, out=4096 +linear: model.layers.9.feed_forward.w1, in=4096, out=14336 +linear: model.layers.9.feed_forward.w2, in=14336, out=4096 +linear: model.layers.9.feed_forward.w3, in=4096, out=14336 +linear: model.layers.10.attention.wq, in=4096, out=4096 +linear: model.layers.10.attention.wk, in=4096, out=1024 +linear: model.layers.10.attention.wv, in=4096, out=1024 +linear: model.layers.10.attention.wo, in=4096, out=4096 +linear: model.layers.10.feed_forward.w1, in=4096, out=14336 +linear: model.layers.10.feed_forward.w2, in=14336, out=4096 +linear: model.layers.10.feed_forward.w3, in=4096, out=14336 +linear: model.layers.11.attention.wq, in=4096, out=4096 +linear: model.layers.11.attention.wk, in=4096, out=1024 +linear: model.layers.11.attention.wv, in=4096, out=1024 +linear: model.layers.11.attention.wo, in=4096, out=4096 +linear: model.layers.11.feed_forward.w1, in=4096, out=14336 +linear: model.layers.11.feed_forward.w2, in=14336, out=4096 +linear: model.layers.11.feed_forward.w3, in=4096, out=14336 +linear: model.layers.12.attention.wq, in=4096, out=4096 +linear: model.layers.12.attention.wk, in=4096, out=1024 +linear: model.layers.12.attention.wv, in=4096, out=1024 +linear: model.layers.12.attention.wo, in=4096, out=4096 +linear: model.layers.12.feed_forward.w1, in=4096, out=14336 +linear: model.layers.12.feed_forward.w2, in=14336, out=4096 +linear: model.layers.12.feed_forward.w3, in=4096, out=14336 +linear: model.layers.13.attention.wq, in=4096, out=4096 +linear: model.layers.13.attention.wk, in=4096, out=1024 +linear: model.layers.13.attention.wv, in=4096, out=1024 +linear: model.layers.13.attention.wo, in=4096, out=4096 +linear: model.layers.13.feed_forward.w1, in=4096, out=14336 +linear: model.layers.13.feed_forward.w2, in=14336, out=4096 +linear: model.layers.13.feed_forward.w3, in=4096, out=14336 +linear: model.layers.14.attention.wq, in=4096, out=4096 +linear: model.layers.14.attention.wk, in=4096, out=1024 +linear: model.layers.14.attention.wv, in=4096, out=1024 +linear: model.layers.14.attention.wo, in=4096, out=4096 +linear: model.layers.14.feed_forward.w1, in=4096, out=14336 +linear: model.layers.14.feed_forward.w2, in=14336, out=4096 +linear: model.layers.14.feed_forward.w3, in=4096, out=14336 +linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, 
out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: 
model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 17.28 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of a far-off land, there lived a young boy named John. John was a bright and curious child, always eager to learn and explore the world around him. He spent most of his days playing in the nearby woods, collecting leaves and watching the animals that lived there. +One day, while wandering through the woods, John stumbled upon a small clearing surrounded by tall trees and filled with a variety of colorful flowers and plants. 
In the center of the clearing stood an old, gnarled tree, its branches twisted and tangled in a way that seemed almost magical. +As John approached the tree, he noticed that the air around him seemed to grow quieter. The rustling of leaves and chirping of birds ceased, and an eerie stillness fell over the clearing. John felt a shiver run down his spine as he reached out to touch the trunk of the tree. +As soon as he made contact with the tree, John was enveloped in a vivid dreamlike vision. He saw a great city, bustling with people and noise, with towering buildings and strange contraptions moving through the air. He saw a group of people, standing together and looking up at a magnificent sight that made him feel small and insignificant. He saw +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 162.0921 sec total +Time to first token: 0.2540 sec with parallel prefill. + + Total throughput: 1.5793 tokens/sec, 0.6332 s/token +First token throughput: 3.9368 tokens/sec, 0.2540 s/token + Next token throughput: 1.5756 tokens/sec, 0.6347 s/token + +Bandwidth achieved: 7.77 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 1.6e+02 seconds + +======================================== + +Once upon a time, the country was divided into four kingdoms, each with their own special power and magic. The kingdoms were called the Kingdom of Solitude, the Kingdom of Serenity, the Kingdom of Radiance, and the Kingdom of Shadows. +The Kingdom of Solitude was a land of ice and snow, where the people were gentle and kind. They had the power to communicate with animals, and could create powerful illusions with their magic. +The Kingdom of Serenity was a land of calm and peace, where the people were wise and just. They had the power to heal with their magic, and could create powerful shields to protect themselves and others. +The Kingdom of Radiance was a land of light and warmth, where the people were confident and brave. They had the power to create powerful explosions of light and heat with their magic, and could create powerful illusions to deceive and distract. +The Kingdom of Shadows was a land of darkness and mystery, where the people were cunning and stealthy. They had the power to move through darkness and shadows, and could create powerful illusions to confuse and disorient. +One day, a young prince from the Kingdom of Solitude named Leo fell in love with a beautiful princess from the Kingdom of Radiance named Rayne. However, their love was forbidden, as +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 24.4277 sec total +Time to first token: 0.3471 sec with parallel prefill. + + Total throughput: 10.4799 tokens/sec, 0.0954 s/token +First token throughput: 2.8812 tokens/sec, 0.3471 s/token + Next token throughput: 10.5894 tokens/sec, 0.0944 s/token + +Bandwidth achieved: 51.57 GB/s + +======================================== + +Once upon a time, in a land far, far away, there was a beautiful kingdom called Azura. It was a place of wonder and magic, where dragons roared and flowers bloomed in every color of the rainbow. The kingdom was ruled by a just and wise king, who loved his people and did everything in his power to keep them safe and happy. + +But one day, a dark and evil sorcerer cast a spell over the kingdom. The sorcerer was a powerful and cruel man, who delighted in causing suffering and pain to others. 
He had a special hatred for the people of Azura, and he sought to destroy their kingdom and everything they held dear. + +The king, who had always been a good and just ruler, was at first powerless to stop the sorcerer. But as he looked out over his kingdom and saw the suffering he had caused, he knew that he had to do something. He called upon his bravest knights and together they set out to find a way to defeat the sorcerer and break the spell. + +As they journeyed through the kingdom, they met many challenges and obstacles. They fought off fierce monsters and overcame treacherous landscapes, but they never gave up. And finally, after many days and nights of travel, they came to the sor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 20.5156 sec total +Time to first token: 0.3200 sec with parallel prefill. + + Total throughput: 12.4783 tokens/sec, 0.0801 s/token +First token throughput: 3.1250 tokens/sec, 0.3200 s/token + Next token throughput: 12.6265 tokens/sec, 0.0792 s/token + +Bandwidth achieved: 61.40 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 11.48 +Average tokens/sec (first token): 3.00 +Average tokens/sec (next tokens): 11.61 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_compile_8.txt b/llama31-1218/cpu_compile_8.txt new file mode 100644 index 000000000..c09fa11f4 --- /dev/null +++ b/llama31-1218/cpu_compile_8.txt @@ -0,0 +1,80 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.10 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 29.60 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of Provence, there lived a young artist named Sophie. Sophie was known throughout the village for her exquisite watercolors of the local flora and fauna. She spent her days painting the vibrant bouquets of wildflowers that bloomed in the nearby fields, and the majestic birds that soared through the sky. + +Sophie's studio was a cozy little room above her family's bakery, filled with the sweet scent of freshly baked bread wafting through the air. Her easel stood by the window, where she could paint the ever-changing light of the Provençal sky. Sophie's art was not just a reflection of her love for nature, but also a way for her to connect with the beauty of the world around her. 
+ +One day, a wealthy patron from the city, Monsieur LeFleur, arrived in the village in search of a new artist to paint the gardens of his estate. He had heard about Sophie's extraordinary talent and sought her out, eager to commission a series of paintings that would capture the essence of the Provençal landscape. + +Sophie was both thrilled and intimidated by the opportunity. She had never painted on such a grand scale before, and the pressure to produce something truly exceptional was daunting. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 176.6127 sec total +Time to first token: 0.6387 sec with parallel prefill. + + Total throughput: 1.4495 tokens/sec, 0.6899 s/token +First token throughput: 1.5656 tokens/sec, 0.6387 s/token + Next token throughput: 1.4491 tokens/sec, 0.6901 s/token + +Bandwidth achieved: 12.41 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 1.8e+02 seconds + +======================================== + +Once upon a time, there was a young girl named Sophie who lived in a small village surrounded by rolling hills and dense forests. She was a curious and adventurous child, always eager to explore the world around her. +One day, while wandering through the forest, Sophie stumbled upon a hidden path she had never seen before. The path was overgrown with vines and shrubs, and it looked as though it hadn't been used in years. Sophie's curiosity was piqued, and she decided to investigate further. +As she made her way down the path, the trees grew taller and the air grew thick with the scent of wildflowers. Sophie felt as though she was walking through a secret world, one that few people knew existed. The path twisted and turned, leading her deeper into the forest. +Suddenly, Sophie heard the sound of running water. She followed the sound and soon found herself standing at the edge of a beautiful waterfall. The water cascaded down a rocky cliff, creating a misty veil that surrounded her. +Sophie felt a sense of wonder and awe wash over her. She had never seen anything like it before. She couldn't help but feel a sense of magic in the air, as though the waterfall was a portal to a different world. +As she stood there, taking in +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 35.0481 sec total +Time to first token: 0.3972 sec with parallel prefill. + + Total throughput: 7.3043 tokens/sec, 0.1369 s/token +First token throughput: 2.5176 tokens/sec, 0.3972 s/token + Next token throughput: 7.3591 tokens/sec, 0.1359 s/token + +Bandwidth achieved: 62.52 GB/s + +======================================== + +Once upon a time, in the rolling hills of Somerset, there was a small village nestled among the picturesque countryside. The village was called Langport, and it was famous for its beautiful riverside setting and its rich history. +One day, a young girl named Emily moved to Langport with her family. She was excited to explore her new surroundings and make some new friends. As she wandered through the village, she discovered a quaint little shop with a sign that read "Curious Goods". +Emily's curiosity was piqued, and she pushed open the door to venture inside. The shop was dimly lit, with shelves upon shelves of strange and wondrous objects. There were vintage dolls, antique clocks, and mysterious boxes with strange symbols etched onto their lids. 
+The shopkeeper, an elderly man with a kind face and twinkling eyes, welcomed Emily to his store. "Ah, a new face in town," he said with a warm smile. "What brings you to Curious Goods?" +Emily's eyes widened as she scanned the shelves. "I'm not sure," she said. "I just saw the sign and thought it looked interesting." +The shopkeeper chuckled. "Ah, curiosity is a wonderful thing. Let me show you some of my favorite treasures." +As Emily browsed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 34.1833 sec total +Time to first token: 0.3417 sec with parallel prefill. + + Total throughput: 7.4890 tokens/sec, 0.1335 s/token +First token throughput: 2.9262 tokens/sec, 0.3417 s/token + Next token throughput: 7.5351 tokens/sec, 0.1327 s/token + +Bandwidth achieved: 64.10 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 7.40 +Average tokens/sec (first token): 2.72 +Average tokens/sec (next tokens): 7.45 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_compile_b16.txt b/llama31-1218/cpu_compile_b16.txt new file mode 100644 index 000000000..8bb393c78 --- /dev/null +++ b/llama31-1218/cpu_compile_b16.txt @@ -0,0 +1,75 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, in the ancient kingdom of Aethoria, there lived a young warrior princess named Eira. Eira was known throughout the land for her unmatched bravery, strength, and unwavering dedication to justice. Her people adored her, and her enemies trembled at the mere mention of her name. +Eira's journey began when she was just a child, training in the ways of combat and magic under the watchful eye of her wise and powerful mentor, the sorceress Arachne. Arachne had taken Eira under her wing, recognizing the young princess's innate potential and grooming her to one day take the throne. +As Eira grew in power and wisdom, she became increasingly frustrated with the injustices that plagued her kingdom. Corruption, tyranny, and oppression were rampant, and Eira saw it as her duty to put an end to them. She spent countless hours studying the ancient lore and magical texts, seeking a deeper understanding of the world and the forces that shaped it. +One fateful day, a dark and malevolent force began to spread across the land, threatening to destroy everything Eira held dear. 
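The "Warning: Excluding compile in calculations" line means the reported averages drop the first sample, which carries the torch.compile warm-up, and average only the remaining warm samples; runs without --compile average all samples. A small sketch checking this against the cpu_compile_8 summary above (the averaging rule is inferred from the logged values, not taken from torchchat code):

# Per-sample "Total throughput" from the cpu_compile_8 run above, in tokens/sec.
samples = [1.4495, 7.3043, 7.4890]

# Sample 1 includes JIT compilation, so it is excluded from the reported average.
warm = samples[1:]
print(sum(warm) / len(warm))   # ~7.40, matching "Average tokens/sec (total): 7.40"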
A powerful sorcerer-king, named Malakar, had risen to power, using his mastery of dark magic to enslave the people +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 220.2756 sec total +Time to first token: 0.6765 sec with parallel prefill. + + Total throughput: 1.1622 tokens/sec, 0.8605 s/token +First token throughput: 1.4782 tokens/sec, 0.6765 s/token + Next token throughput: 1.1612 tokens/sec, 0.8612 s/token + +Bandwidth achieved: 18.67 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 2.2e+02 seconds + +======================================== + +Once upon a time, there lived a rabbit named Rosie. Rosie was different from the other rabbits in the forest. While they were content to spend their days nibbling on carrots and lounging in the sun, Rosie had a passion for adventure. +Rosie had heard of a hidden garden deep within the forest, full of the most beautiful and exotic flowers she had ever seen. She had always dreamed of exploring it, but she was afraid of the unknown dangers that lay within. +One day, Rosie decided that she had had enough of being afraid. She packed a small bag with some carrots, a canteen of water, and a map, and set off on her journey to the hidden garden. +As she wandered through the forest, Rosie encountered all sorts of obstacles. She had to navigate through thick underbrush, cross rushing streams, and climb steep hills. But she persevered, driven by her determination to reach the garden. +Finally, after what seemed like hours of walking, Rosie caught sight of a glimpse of green through the trees. She quickened her pace, her heart racing with excitement. +As she pushed through the final curtain of foliage, Rosie gasped in wonder. Before her lay a garden unlike any she had ever seen. The flowers were more vibrant and exotic than she had imagined, +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 81.8519 sec total +Time to first token: 0.4276 sec with parallel prefill. + + Total throughput: 3.1276 tokens/sec, 0.3197 s/token +First token throughput: 2.3387 tokens/sec, 0.4276 s/token + Next token throughput: 3.1317 tokens/sec, 0.3193 s/token + +Bandwidth achieved: 50.23 GB/s + +======================================== + +Once upon a time, in a land far, far away, there was the most magical city in all the land. It was a place where dreams came true and magic was real. The city was called Everwood, and it was a place of wonder and enchantment. +Everwood was a city of towering trees that seemed to stretch up to the sky, their branches tangled and woven together in a way that seemed almost... magical. The trees were covered in leaves that shimmered and sparkled in the sunlight, and the air was filled with the sweet scent of honey and lavender. +In the heart of Everwood, there was a great and ancient tree, the Heartwood. It was said that the Heartwood was the source of all magic in the city, and that it held the secrets of the ancient ones. The Heartwood was a place of great power and wisdom, and many sought to unlock its secrets. +In Everwood, the inhabitants were a magical people, with the ability to communicate with animals and control the elements. They lived in harmony with the natural world, and their city was a reflection of this harmony. Everwood was a place of beauty and wonder, where anything seemed possible. 
+But, as magical as Everwood was, it was not without its dangers. There were dark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 83.6049 sec total +Time to first token: 0.4247 sec with parallel prefill. + + Total throughput: 3.0620 tokens/sec, 0.3266 s/token +First token throughput: 2.3546 tokens/sec, 0.4247 s/token + Next token throughput: 3.0656 tokens/sec, 0.3262 s/token + +Bandwidth achieved: 49.18 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 3.09 +Average tokens/sec (first token): 2.35 +Average tokens/sec (next tokens): 3.10 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_eager_4.txt b/llama31-1218/cpu_eager_4.txt new file mode 100644 index 000000000..2e43f9394 --- /dev/null +++ b/llama31-1218/cpu_eager_4.txt @@ -0,0 +1,301 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +linear: model.layers.0.attention.wq, in=4096, out=4096 +linear: model.layers.0.attention.wk, in=4096, out=1024 +linear: model.layers.0.attention.wv, in=4096, out=1024 +linear: model.layers.0.attention.wo, in=4096, out=4096 +linear: model.layers.0.feed_forward.w1, in=4096, out=14336 +linear: model.layers.0.feed_forward.w2, in=14336, out=4096 +linear: model.layers.0.feed_forward.w3, in=4096, out=14336 +linear: model.layers.1.attention.wq, in=4096, out=4096 +linear: model.layers.1.attention.wk, in=4096, out=1024 +linear: model.layers.1.attention.wv, in=4096, out=1024 +linear: model.layers.1.attention.wo, in=4096, out=4096 +linear: model.layers.1.feed_forward.w1, in=4096, out=14336 +linear: model.layers.1.feed_forward.w2, in=14336, out=4096 +linear: model.layers.1.feed_forward.w3, in=4096, out=14336 +linear: model.layers.2.attention.wq, in=4096, out=4096 +linear: model.layers.2.attention.wk, in=4096, out=1024 +linear: model.layers.2.attention.wv, in=4096, out=1024 +linear: model.layers.2.attention.wo, in=4096, out=4096 +linear: model.layers.2.feed_forward.w1, in=4096, out=14336 +linear: model.layers.2.feed_forward.w2, in=14336, out=4096 +linear: model.layers.2.feed_forward.w3, in=4096, out=14336 +linear: model.layers.3.attention.wq, in=4096, out=4096 +linear: model.layers.3.attention.wk, in=4096, out=1024 +linear: model.layers.3.attention.wv, in=4096, out=1024 +linear: model.layers.3.attention.wo, in=4096, out=4096 +linear: model.layers.3.feed_forward.w1, in=4096, out=14336 +linear: model.layers.3.feed_forward.w2, in=14336, out=4096 +linear: model.layers.3.feed_forward.w3, in=4096, out=14336 +linear: model.layers.4.attention.wq, in=4096, out=4096 +linear: model.layers.4.attention.wk, in=4096, out=1024 +linear: model.layers.4.attention.wv, in=4096, out=1024 +linear: model.layers.4.attention.wo, in=4096, out=4096 +linear: model.layers.4.feed_forward.w1, in=4096, out=14336 +linear: model.layers.4.feed_forward.w2, in=14336, out=4096 +linear: model.layers.4.feed_forward.w3, in=4096, 
out=14336 +linear: model.layers.5.attention.wq, in=4096, out=4096 +linear: model.layers.5.attention.wk, in=4096, out=1024 +linear: model.layers.5.attention.wv, in=4096, out=1024 +linear: model.layers.5.attention.wo, in=4096, out=4096 +linear: model.layers.5.feed_forward.w1, in=4096, out=14336 +linear: model.layers.5.feed_forward.w2, in=14336, out=4096 +linear: model.layers.5.feed_forward.w3, in=4096, out=14336 +linear: model.layers.6.attention.wq, in=4096, out=4096 +linear: model.layers.6.attention.wk, in=4096, out=1024 +linear: model.layers.6.attention.wv, in=4096, out=1024 +linear: model.layers.6.attention.wo, in=4096, out=4096 +linear: model.layers.6.feed_forward.w1, in=4096, out=14336 +linear: model.layers.6.feed_forward.w2, in=14336, out=4096 +linear: model.layers.6.feed_forward.w3, in=4096, out=14336 +linear: model.layers.7.attention.wq, in=4096, out=4096 +linear: model.layers.7.attention.wk, in=4096, out=1024 +linear: model.layers.7.attention.wv, in=4096, out=1024 +linear: model.layers.7.attention.wo, in=4096, out=4096 +linear: model.layers.7.feed_forward.w1, in=4096, out=14336 +linear: model.layers.7.feed_forward.w2, in=14336, out=4096 +linear: model.layers.7.feed_forward.w3, in=4096, out=14336 +linear: model.layers.8.attention.wq, in=4096, out=4096 +linear: model.layers.8.attention.wk, in=4096, out=1024 +linear: model.layers.8.attention.wv, in=4096, out=1024 +linear: model.layers.8.attention.wo, in=4096, out=4096 +linear: model.layers.8.feed_forward.w1, in=4096, out=14336 +linear: model.layers.8.feed_forward.w2, in=14336, out=4096 +linear: model.layers.8.feed_forward.w3, in=4096, out=14336 +linear: model.layers.9.attention.wq, in=4096, out=4096 +linear: model.layers.9.attention.wk, in=4096, out=1024 +linear: model.layers.9.attention.wv, in=4096, out=1024 +linear: model.layers.9.attention.wo, in=4096, out=4096 +linear: model.layers.9.feed_forward.w1, in=4096, out=14336 +linear: model.layers.9.feed_forward.w2, in=14336, out=4096 +linear: model.layers.9.feed_forward.w3, in=4096, out=14336 +linear: model.layers.10.attention.wq, in=4096, out=4096 +linear: model.layers.10.attention.wk, in=4096, out=1024 +linear: model.layers.10.attention.wv, in=4096, out=1024 +linear: model.layers.10.attention.wo, in=4096, out=4096 +linear: model.layers.10.feed_forward.w1, in=4096, out=14336 +linear: model.layers.10.feed_forward.w2, in=14336, out=4096 +linear: model.layers.10.feed_forward.w3, in=4096, out=14336 +linear: model.layers.11.attention.wq, in=4096, out=4096 +linear: model.layers.11.attention.wk, in=4096, out=1024 +linear: model.layers.11.attention.wv, in=4096, out=1024 +linear: model.layers.11.attention.wo, in=4096, out=4096 +linear: model.layers.11.feed_forward.w1, in=4096, out=14336 +linear: model.layers.11.feed_forward.w2, in=14336, out=4096 +linear: model.layers.11.feed_forward.w3, in=4096, out=14336 +linear: model.layers.12.attention.wq, in=4096, out=4096 +linear: model.layers.12.attention.wk, in=4096, out=1024 +linear: model.layers.12.attention.wv, in=4096, out=1024 +linear: model.layers.12.attention.wo, in=4096, out=4096 +linear: model.layers.12.feed_forward.w1, in=4096, out=14336 +linear: model.layers.12.feed_forward.w2, in=14336, out=4096 +linear: model.layers.12.feed_forward.w3, in=4096, out=14336 +linear: model.layers.13.attention.wq, in=4096, out=4096 +linear: model.layers.13.attention.wk, in=4096, out=1024 +linear: model.layers.13.attention.wv, in=4096, out=1024 +linear: model.layers.13.attention.wo, in=4096, out=4096 +linear: model.layers.13.feed_forward.w1, in=4096, out=14336 
+linear: model.layers.13.feed_forward.w2, in=14336, out=4096 +linear: model.layers.13.feed_forward.w3, in=4096, out=14336 +linear: model.layers.14.attention.wq, in=4096, out=4096 +linear: model.layers.14.attention.wk, in=4096, out=1024 +linear: model.layers.14.attention.wv, in=4096, out=1024 +linear: model.layers.14.attention.wo, in=4096, out=4096 +linear: model.layers.14.feed_forward.w1, in=4096, out=14336 +linear: model.layers.14.feed_forward.w2, in=14336, out=4096 +linear: model.layers.14.feed_forward.w3, in=4096, out=14336 +linear: model.layers.15.attention.wq, in=4096, out=4096 +linear: model.layers.15.attention.wk, in=4096, out=1024 +linear: model.layers.15.attention.wv, in=4096, out=1024 +linear: model.layers.15.attention.wo, in=4096, out=4096 +linear: model.layers.15.feed_forward.w1, in=4096, out=14336 +linear: model.layers.15.feed_forward.w2, in=14336, out=4096 +linear: model.layers.15.feed_forward.w3, in=4096, out=14336 +linear: model.layers.16.attention.wq, in=4096, out=4096 +linear: model.layers.16.attention.wk, in=4096, out=1024 +linear: model.layers.16.attention.wv, in=4096, out=1024 +linear: model.layers.16.attention.wo, in=4096, out=4096 +linear: model.layers.16.feed_forward.w1, in=4096, out=14336 +linear: model.layers.16.feed_forward.w2, in=14336, out=4096 +linear: model.layers.16.feed_forward.w3, in=4096, out=14336 +linear: model.layers.17.attention.wq, in=4096, out=4096 +linear: model.layers.17.attention.wk, in=4096, out=1024 +linear: model.layers.17.attention.wv, in=4096, out=1024 +linear: model.layers.17.attention.wo, in=4096, out=4096 +linear: model.layers.17.feed_forward.w1, in=4096, out=14336 +linear: model.layers.17.feed_forward.w2, in=14336, out=4096 +linear: model.layers.17.feed_forward.w3, in=4096, out=14336 +linear: model.layers.18.attention.wq, in=4096, out=4096 +linear: model.layers.18.attention.wk, in=4096, out=1024 +linear: model.layers.18.attention.wv, in=4096, out=1024 +linear: model.layers.18.attention.wo, in=4096, out=4096 +linear: model.layers.18.feed_forward.w1, in=4096, out=14336 +linear: model.layers.18.feed_forward.w2, in=14336, out=4096 +linear: model.layers.18.feed_forward.w3, in=4096, out=14336 +linear: model.layers.19.attention.wq, in=4096, out=4096 +linear: model.layers.19.attention.wk, in=4096, out=1024 +linear: model.layers.19.attention.wv, in=4096, out=1024 +linear: model.layers.19.attention.wo, in=4096, out=4096 +linear: model.layers.19.feed_forward.w1, in=4096, out=14336 +linear: model.layers.19.feed_forward.w2, in=14336, out=4096 +linear: model.layers.19.feed_forward.w3, in=4096, out=14336 +linear: model.layers.20.attention.wq, in=4096, out=4096 +linear: model.layers.20.attention.wk, in=4096, out=1024 +linear: model.layers.20.attention.wv, in=4096, out=1024 +linear: model.layers.20.attention.wo, in=4096, out=4096 +linear: model.layers.20.feed_forward.w1, in=4096, out=14336 +linear: model.layers.20.feed_forward.w2, in=14336, out=4096 +linear: model.layers.20.feed_forward.w3, in=4096, out=14336 +linear: model.layers.21.attention.wq, in=4096, out=4096 +linear: model.layers.21.attention.wk, in=4096, out=1024 +linear: model.layers.21.attention.wv, in=4096, out=1024 +linear: model.layers.21.attention.wo, in=4096, out=4096 +linear: model.layers.21.feed_forward.w1, in=4096, out=14336 +linear: model.layers.21.feed_forward.w2, in=14336, out=4096 +linear: model.layers.21.feed_forward.w3, in=4096, out=14336 +linear: model.layers.22.attention.wq, in=4096, out=4096 +linear: model.layers.22.attention.wk, in=4096, out=1024 +linear: 
model.layers.22.attention.wv, in=4096, out=1024 +linear: model.layers.22.attention.wo, in=4096, out=4096 +linear: model.layers.22.feed_forward.w1, in=4096, out=14336 +linear: model.layers.22.feed_forward.w2, in=14336, out=4096 +linear: model.layers.22.feed_forward.w3, in=4096, out=14336 +linear: model.layers.23.attention.wq, in=4096, out=4096 +linear: model.layers.23.attention.wk, in=4096, out=1024 +linear: model.layers.23.attention.wv, in=4096, out=1024 +linear: model.layers.23.attention.wo, in=4096, out=4096 +linear: model.layers.23.feed_forward.w1, in=4096, out=14336 +linear: model.layers.23.feed_forward.w2, in=14336, out=4096 +linear: model.layers.23.feed_forward.w3, in=4096, out=14336 +linear: model.layers.24.attention.wq, in=4096, out=4096 +linear: model.layers.24.attention.wk, in=4096, out=1024 +linear: model.layers.24.attention.wv, in=4096, out=1024 +linear: model.layers.24.attention.wo, in=4096, out=4096 +linear: model.layers.24.feed_forward.w1, in=4096, out=14336 +linear: model.layers.24.feed_forward.w2, in=14336, out=4096 +linear: model.layers.24.feed_forward.w3, in=4096, out=14336 +linear: model.layers.25.attention.wq, in=4096, out=4096 +linear: model.layers.25.attention.wk, in=4096, out=1024 +linear: model.layers.25.attention.wv, in=4096, out=1024 +linear: model.layers.25.attention.wo, in=4096, out=4096 +linear: model.layers.25.feed_forward.w1, in=4096, out=14336 +linear: model.layers.25.feed_forward.w2, in=14336, out=4096 +linear: model.layers.25.feed_forward.w3, in=4096, out=14336 +linear: model.layers.26.attention.wq, in=4096, out=4096 +linear: model.layers.26.attention.wk, in=4096, out=1024 +linear: model.layers.26.attention.wv, in=4096, out=1024 +linear: model.layers.26.attention.wo, in=4096, out=4096 +linear: model.layers.26.feed_forward.w1, in=4096, out=14336 +linear: model.layers.26.feed_forward.w2, in=14336, out=4096 +linear: model.layers.26.feed_forward.w3, in=4096, out=14336 +linear: model.layers.27.attention.wq, in=4096, out=4096 +linear: model.layers.27.attention.wk, in=4096, out=1024 +linear: model.layers.27.attention.wv, in=4096, out=1024 +linear: model.layers.27.attention.wo, in=4096, out=4096 +linear: model.layers.27.feed_forward.w1, in=4096, out=14336 +linear: model.layers.27.feed_forward.w2, in=14336, out=4096 +linear: model.layers.27.feed_forward.w3, in=4096, out=14336 +linear: model.layers.28.attention.wq, in=4096, out=4096 +linear: model.layers.28.attention.wk, in=4096, out=1024 +linear: model.layers.28.attention.wv, in=4096, out=1024 +linear: model.layers.28.attention.wo, in=4096, out=4096 +linear: model.layers.28.feed_forward.w1, in=4096, out=14336 +linear: model.layers.28.feed_forward.w2, in=14336, out=4096 +linear: model.layers.28.feed_forward.w3, in=4096, out=14336 +linear: model.layers.29.attention.wq, in=4096, out=4096 +linear: model.layers.29.attention.wk, in=4096, out=1024 +linear: model.layers.29.attention.wv, in=4096, out=1024 +linear: model.layers.29.attention.wo, in=4096, out=4096 +linear: model.layers.29.feed_forward.w1, in=4096, out=14336 +linear: model.layers.29.feed_forward.w2, in=14336, out=4096 +linear: model.layers.29.feed_forward.w3, in=4096, out=14336 +linear: model.layers.30.attention.wq, in=4096, out=4096 +linear: model.layers.30.attention.wk, in=4096, out=1024 +linear: model.layers.30.attention.wv, in=4096, out=1024 +linear: model.layers.30.attention.wo, in=4096, out=4096 +linear: model.layers.30.feed_forward.w1, in=4096, out=14336 +linear: model.layers.30.feed_forward.w2, in=14336, out=4096 +linear: 
model.layers.30.feed_forward.w3, in=4096, out=14336 +linear: model.layers.31.attention.wq, in=4096, out=4096 +linear: model.layers.31.attention.wk, in=4096, out=1024 +linear: model.layers.31.attention.wv, in=4096, out=1024 +linear: model.layers.31.attention.wo, in=4096, out=4096 +linear: model.layers.31.feed_forward.w1, in=4096, out=14336 +linear: model.layers.31.feed_forward.w2, in=14336, out=4096 +linear: model.layers.31.feed_forward.w3, in=4096, out=14336 +linear: model.output, in=4096, out=128256 +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 20.92 seconds +----------------------------------------------------------- +Once upon a time, there was a beautiful princess named Sophia. She lived in a beautiful castle with her parents, the king and queen. One day, a wicked sorcerer cast a spell on Sophia, turning her into a frog. +He thought it would be funny to see the princess as a frog, and he would be the one who would be able to boss her around. +But, the princess didn't lose hope. She still had her beauty and kindness inside, even though she was now a frog. +She told her father, the king, about the sorcerer's spell, and the king knew just what to do. He sent a message to all the good wizards in the land, asking for their help in breaking the spell. +Meanwhile, the princess tried to be brave and sit around the castle courtyard, even though she was very sad to be a frog. +One day, a handsome young wizard named Leo came to the castle. He had heard about the princess being turned into a frog and wanted to help. +Leo used his magic to create a magical water that would break the spell. But, he needed the help of the princess to do it. He had to talk to her and make her remember her true self. +The princess was still in frog form, but Leo talked to her and +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 37.6731 sec total +Time to first token: 0.2535 sec with parallel prefill. + + Total throughput: 6.7953 tokens/sec, 0.1472 s/token +First token throughput: 3.9442 tokens/sec, 0.2535 s/token + Next token throughput: 6.8146 tokens/sec, 0.1467 s/token + +Bandwidth achieved: 33.44 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, there was a young girl named Lily who lived in a small village surrounded by vast fields of wheat. She spent most of her days playing in these fields with her friends, chasing after the butterflies that fluttered about and lying in the warm sun that shone brightly overhead. +But Lily was not just any ordinary girl. She had a secret - she was a dreamer. Every night, when the sun had dipped below the horizon, Lily would sneak away to a nearby clearing deep in the heart of the wheat field. There, under the light of the full moon, she would spread out a beautiful, shimmering blanket, and begin to weave a magical tale. +Lily's stories were like nothing anyone in the village had ever heard before. They were full of imagination, full of wonder, and full of magic. 
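The long run of "linear: ..." lines printed while the int4 quantizer is applied enumerates every linear layer being converted: seven projections per transformer block (wq, wk, wv, wo and the three feed-forward weights) across 32 blocks, plus the output head. A trivial sketch of that count, read directly off the layer names above:

# 32 transformer blocks x 7 quantized linears each, plus model.output.
blocks, per_block = 32, 7
print(blocks * per_block + 1)   # 225 "linear:" lines per quantization pass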
They were like stepping stones into another world, a world far more enchanting than their own simple village life. + +One night, as Lily was busy weaving her tale, she noticed a strange, shimmering light coming from a nearby patch of tall grass. She felt a sudden shiver run down her spine, and then, to her surprise, a handsome young man appeared before her. He was unlike anyone she had ever seen before - his eyes gleamed with a +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 34.9801 sec total +Time to first token: 0.2575 sec with parallel prefill. + + Total throughput: 7.3184 tokens/sec, 0.1366 s/token +First token throughput: 3.8833 tokens/sec, 0.2575 s/token + Next token throughput: 7.3439 tokens/sec, 0.1362 s/token + +Bandwidth achieved: 36.01 GB/s + +======================================== + +Once upon a time, in a small village surrounded by dense forests, there lived a young girl named Lily. Lily was an adventurous soul, always eager to explore the unknown and test her limits. She spent most of her days helping her mother with household chores and listening to the stories of the village elder, who was a wise and experienced hunter. +One day, the village elder decided to tell Lily a story about a hidden water spring deep in the forest, which was said to possess magical healing properties. Intrigued by the tale, Lily begged the elder to take her to see it for herself. The elder, seeing the fire in Lily's eyes, agreed to take her on a journey to seek out the elusive spring. +Under the light of the full moon, the duo set off into the forest, their path winding through the dense undergrowth. As they walked, the elder taught Lily the ways of tracking and hunting, impressing the girl with his expertise and patience. The air was filled with the scent of blooming wildflowers, and the chirping of crickets provided a soothing background melody. +The pair continued on for hours, the darkness illuminated only by the glint of fireflies and the occasional beam of moonlight that broke through the canopy. Lily's excitement grew with each passing minute +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 37.6900 sec total +Time to first token: 0.2522 sec with parallel prefill. 
+ + Total throughput: 6.7922 tokens/sec, 0.1472 s/token +First token throughput: 3.9647 tokens/sec, 0.2522 s/token + Next token throughput: 6.8113 tokens/sec, 0.1468 s/token + +Bandwidth achieved: 33.42 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 6.97 +Average tokens/sec (first token): 3.93 +Average tokens/sec (next tokens): 6.99 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_eager_8.txt b/llama31-1218/cpu_eager_8.txt new file mode 100644 index 000000000..2add1700c --- /dev/null +++ b/llama31-1218/cpu_eager_8.txt @@ -0,0 +1,77 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.11 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 26.22 seconds +----------------------------------------------------------- +Once upon a time, in a world much like our own, there existed a small village surrounded by rolling hills and dense forests. The villagers lived simple lives, relying on the land to provide for their needs. They were a hardworking and community-minded people, always willing to lend a helping hand to those in need. +The village was led by a wise and just elder named Thorne. Thorne was a fair and compassionate leader, loved by all the villagers. He was respected for his knowledge and his ability to make difficult decisions that benefited the village as a whole. +One day, a young girl named Lyra wandered into the village. She was a curious and adventurous child, with long, curly hair the color of honey and eyes as blue as the sky. Lyra was drawn to the village by the warmth and kindness of its people, and she quickly became a part of their community. +As Lyra settled into village life, she began to notice strange occurrences happening around her. Tools would go missing, food would disappear from the village stores, and strange noises could be heard in the dead of night. The villagers seemed hesitant to talk about these things, and Lyra sensed that there was something they were not telling her. +Lyra's curiosity got the better of her, and she began to +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 54.6390 sec total +Time to first token: 0.4829 sec with parallel prefill. 
+ + Total throughput: 4.6853 tokens/sec, 0.2134 s/token +First token throughput: 2.0709 tokens/sec, 0.4829 s/token + Next token throughput: 4.7086 tokens/sec, 0.2124 s/token + +Bandwidth achieved: 40.10 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in a world not so different from our own, there was a small group of people who were deeply troubled by the world around them. They saw the suffering, the poverty, the injustice, and the pain of so many others. They were haunted by the thought that they could do something to change it. +These people were not naive idealists, but rather, they were thinking people who had done their research and were determined to make a difference. They were driven by a sense of compassion and a desire to create a better world. +But as they began to organize and plan their efforts, they hit a roadblock. They faced resistance from those who were already benefiting from the status quo, and they were met with skepticism by those who doubted their ability to succeed. +Undeterred, these dedicated individuals persevered, driven by their conviction that a better world was possible. They worked tirelessly, often behind the scenes, to build a movement that would bring about meaningful change. +And slowly but surely, their efforts began to bear fruit. The movement gained momentum, and more and more people began to join their cause. They faced setbacks and challenges, but they never lost sight of their goal. +Eventually, their efforts led to a significant shift in the way people thought about and addressed the issues +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 49.9193 sec total +Time to first token: 0.2877 sec with parallel prefill. + + Total throughput: 5.1283 tokens/sec, 0.1950 s/token +First token throughput: 3.4754 tokens/sec, 0.2877 s/token + Next token throughput: 5.1379 tokens/sec, 0.1946 s/token + +Bandwidth achieved: 43.89 GB/s + +======================================== + +Once upon a time, in a far-off galaxy, there existed a magnificent cosmic being known as the Cosmic Dreamer. The Cosmic Dreamer was a creator of the most exquisite and intricate dreams, filled with vivid imagery, symphonies of sound, and scents that danced in the air. Its realm, the Dreamworld, was a realm of endless possibilities, where anything could become reality. + +The Cosmic Dreamer's powers were boundless, and it could craft the most fantastical dreams, transporting those who entered the Dreamworld into realms of wonder and awe. Its dreams were said to be so real that they could awaken the deepest desires and passions within the heart. + +One day, a young dreamer named Luna stumbled upon a hidden portal that led to the Dreamworld. As she entered the realm, she found herself face to face with the Cosmic Dreamer. The Cosmic Dreamer gazed at Luna with a benevolent smile, saying, "Welcome, little one, to the Dreamworld. I have been expecting you." + +Luna's eyes widened with wonder as the Cosmic Dreamer began to weave a dream tailored specifically for her. The dream was a majestic celebration of adventure, filled with soaring mountains, sparkling waterfalls, and radiant sunshine. The Cosmic Dreamer crafted the dream with such precision +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 52.0644 sec total +Time to first token: 0.3769 sec with parallel prefill. 
+ + Total throughput: 4.9170 tokens/sec, 0.2034 s/token +First token throughput: 2.6533 tokens/sec, 0.3769 s/token + Next token throughput: 4.9335 tokens/sec, 0.2027 s/token + +Bandwidth achieved: 42.08 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 4.91 +Average tokens/sec (first token): 2.73 +Average tokens/sec (next tokens): 4.93 + +Memory used: 0.00 GB diff --git a/llama31-1218/cpu_eager_b16.txt b/llama31-1218/cpu_eager_b16.txt new file mode 100644 index 000000000..d8bc8d1be --- /dev/null +++ b/llama31-1218/cpu_eager_b16.txt @@ -0,0 +1,73 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cpu"}}' --prompt "Once upon a time," --max-new-tokens 256 --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cpu Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz +Loading model... +Time to load model: 0.12 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cpu'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, in the small town of Willow Creek, there was a legendary chef named Madame LeRoux. Madame LeRoux was known for her exquisite cooking, her impeccable taste, and her ability to create dishes that were both delicious and visually stunning. +Madame LeRoux's restaurant, Le Coeur de la Vie, was the most popular eatery in town, and people came from all over to taste her creations. Her menu was always changing, but one thing remained constant – her ability to surprise and delight her customers with each new dish. +One day, a young chef named Pierre approached Madame LeRoux with a proposal. Pierre had been working in the kitchen of a nearby restaurant, but he was tired of the monotony of his job and wanted to learn from the best. He begged Madame LeRoux to take him on as her apprentice, and she, seeing the fire in his eyes, agreed. +Pierre was overjoyed and threw himself into his work, learning everything he could from Madame LeRoux. He watched as she chopped, sautéed, and plated each dish with precision and care. He tasted each new creation, taking notes and asking questions. +As the days turned into weeks, Pierre's skills improved dramatically. He was soon helping Madame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 1: 106.8787 sec total +Time to first token: 0.6674 sec with parallel prefill. + + Total throughput: 2.3952 tokens/sec, 0.4175 s/token +First token throughput: 1.4983 tokens/sec, 0.6674 s/token + Next token throughput: 2.4009 tokens/sec, 0.4165 s/token + +Bandwidth achieved: 38.47 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, there was a small town nestled in the heart of a bustling city. 
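Across the eager runs, "Bandwidth achieved" divided by total throughput gives the bytes of weights streamed per generated token, which is why the figure tracks the quantization level: roughly 16 GB/token at bfloat16, ~8.6 GB/token at int8, and ~4.9 GB/token at int4, in line with the 8B-parameter weights at 2, ~1, and ~0.5 bytes per parameter (plus scales and any layers left in bfloat16). A short check using values quoted in the logs above; reading the metric this way is an inference from the ratios, not a statement about how torchchat computes it:

# GB moved per generated token = bandwidth / throughput, from the eager samples above.
runs = {
    "bfloat16": (38.47, 2.3952),   # GB/s, tokens/sec (cpu_eager_b16, sample 1)
    "int8":     (43.89, 5.1283),   # cpu_eager_8, sample 2
    "int4":     (33.44, 6.7953),   # cpu_eager_4, sample 1
}
for dtype, (gbps, tps) in runs.items():
    print(dtype, round(gbps / tps, 2), "GB per token")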
The town was called Willow Creek, and it was known for its quaint charm and warm hospitality. The residents of Willow Creek took great pride in their community and worked hard to maintain its beauty and character. +One of the most notable features of Willow Creek was its community garden. The garden was a vibrant and bustling hub of activity, where residents came together to grow their own fruits and vegetables. The garden was tended by a group of dedicated volunteers, who spent countless hours tending to the soil, watering the plants, and harvesting the crops. +At the center of the community garden was a beautiful old oak tree, its gnarled branches stretching towards the sky like a giant's fingers. The tree was said to be over a century old, and it was a beloved landmark in the garden. Children would play beneath its branches, and lovers would sit in its shade, watching the sun set over the garden. +But the community garden was more than just a pretty place – it was also a source of food for many of the town's residents. The garden produced a bounty of fresh fruits and vegetables, which were distributed to those who needed them most. The garden was a symbol of the community's generosity and compassion, +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 2: 94.8618 sec total +Time to first token: 0.3929 sec with parallel prefill. + + Total throughput: 2.6987 tokens/sec, 0.3706 s/token +First token throughput: 2.5453 tokens/sec, 0.3929 s/token + Next token throughput: 2.6993 tokens/sec, 0.3705 s/token + +Bandwidth achieved: 43.34 GB/s + +======================================== + +Once upon a time, in a small village nestled in the rolling hills of Tuscany, there lived a young apprentice named Leo. Leo was a skilled woodworker, but he had a passion for creating beautiful and intricate wooden masks. Every year, the villagers would gather to celebrate the Harvest Festival, and Leo would create a stunning mask to wear as part of the festivities. +One year, Leo became fascinated with a particularly beautiful wooden mask he had found in his mentor's workshop. The mask was unlike any he had ever seen before - its intricate carvings and delicate details seemed to dance across its surface. Leo became obsessed with the mask and spent every spare moment trying to recreate it. +As Leo worked tirelessly on his masterpiece, he began to notice strange occurrences around the village. Tools would go missing, and strange noises could be heard coming from the workshop at night. Leo started to feel a presence around him, as if the mask was exerting some kind of influence over him. +Despite his growing unease, Leo was determined to complete his mask. He worked through the night, his hands moving with a strange, almost supernatural precision. As the mask took shape, Leo felt himself becoming more and more consumed by its power. +On the night of the Harvest Festival, Leo wore the mask as he +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 255 tokens +Time for inference 3: 92.6331 sec total +Time to first token: 0.3726 sec with parallel prefill. 
+ + Total throughput: 2.7636 tokens/sec, 0.3618 s/token +First token throughput: 2.6839 tokens/sec, 0.3726 s/token + Next token throughput: 2.7639 tokens/sec, 0.3618 s/token + +Bandwidth achieved: 44.38 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 2.62 +Average tokens/sec (first token): 2.24 +Average tokens/sec (next tokens): 2.62 + +Memory used: 0.00 GB diff --git a/llama31-1218/cuda_aoti_4.txt b/llama31-1218/cuda_aoti_4.txt new file mode 100644 index 000000000..84665598f --- /dev/null +++ b/llama31-1218/cuda_aoti_4.txt @@ -0,0 +1,301 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-dso-path /tmp/model34.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model34.so --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-dso-path /tmp/model34.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:32:12.493177 1654082 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:32:12.493719 1654082 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:32:12.493938 1654082 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:32:12.494111 1654082 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
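The deprecation warning above recommends the packaged AOTInductor flow in place of torch._export.aot_compile/aot_load. A minimal sketch of that suggested path on a stand-in module (the toy module, the /tmp path, and the exact 2.6 keyword signature are assumptions for illustration; the real export here still goes through torchchat.py export):

import torch
import torch._inductor

class Toy(torch.nn.Module):
    # Stand-in module; the actual model is exported by torchchat, not by this snippet.
    def forward(self, x):
        return torch.nn.functional.relu(x)

ep = torch.export.export(Toy(), (torch.randn(8),))
pkg_path = torch._inductor.aoti_compile_and_package(ep, package_path="/tmp/toy.pt2")
runner = torch._inductor.aoti_load_package(pkg_path)
print(runner(torch.randn(8)))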
+W1218 20:33:11.883745 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:11.885487 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.177311 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.252445 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.286209 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.287917 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.299919 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.336472 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.337986 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.510349 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.551547 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.586616 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.587886 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.597007 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.635042 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.636319 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.807197 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.845380 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.869709 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.871110 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is 
missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.880138 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.908805 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:12.910059 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.079605 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.118574 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.151574 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.152825 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.161900 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.197574 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.198793 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.367423 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.406470 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.428939 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.430405 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.439952 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.466367 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.467884 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.639369 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.680004 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:13.711776 1654082 
site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback [... this warning is repeated many more times during the export, differing only in timestamp; the intervening repetitions are omitted and the final occurrences follow ...] +W1218 20:33:20.834301 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim
implementation, using proxy executor as fallback +W1218 20:33:20.835803 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:20.846228 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:20.875365 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:20.877553 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:21.071123 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:21.113878 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:21.144387 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:21.145593 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:21.154984 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:33:21.176146 1654082 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 6.12 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.58 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model34.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model34.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model34.so --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:35:37.689507 1671842 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:35:37.689961 1671842 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:35:37.690145 1671842 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:35:37.690307 1671842 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
+[E1218 20:35:39.524097915 shim_common.cpp:1177] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. +[E1218 20:35:39.524140439 shim_common.cpp:1177] Exception in aoti_torch: Unable to find a proxy executor to run custom ops. Please check if there is a json file generated in the same directory as the so, or use torch._inductor.aoti_compile_and_package to package everything into a PT2 artifact. +[E1218 20:35:39.524797602 shim_common.cpp:246] Exception in aoti_torch: Cannot access data pointer of Tensor that doesn't have storage +Exception raised from throw_data_ptr_access_error at /pytorch/c10/core/TensorImpl.cpp:309 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x7f09dc77d788 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, char const*) + 0x6a (0x7f09dc726fbc in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #2: c10::TensorImpl::throw_data_ptr_access_error() const + 0x34 (0x7f09dc755f64 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libc10.so) +frame #3: aoti_torch_get_data_ptr + 0xd0 (0x7f09cc2970e0 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #4: torch::aot_inductor::AOTInductorModel::run_impl(AtenTensorOpaque**, AtenTensorOpaque**, CUstream_st*, AOTIProxyExecutorOpaque*) + 0x4229 (0x7f08898fc719 in /tmp/model34.so) +frame #5: torch::aot_inductor::AOTInductorModelContainer::run(AtenTensorOpaque**, AtenTensorOpaque**, CUstream_st*, AOTIProxyExecutorOpaque*) + 0xe4 (0x7f088995b234 in /tmp/model34.so) +frame #6: AOTInductorModelContainerRun + 0x6d (0x7f0889933eed in /tmp/model34.so) +frame #7: torch::inductor::AOTIModelContainerRunner::run(std::vector > const&, void*) + 0x104 (0x7f09cc288c14 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) +frame #8: torch::inductor::AOTIModelContainerRunnerCuda::run(std::vector > const&, void*) + 0x1e (0x7f0995a74e8e in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #9: + 0x7f1d16 (0x7f09dbbf1d16 in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_python.so) +frame #10: + 0x37fe0e (0x7f09db77fe0e in /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/lib/libtorch_python.so) + +frame #51: + 0x295d0 (0x7f09dde295d0 in /lib64/libc.so.6) +frame #52: __libc_start_main + 0x80 (0x7f09dde29680 in /lib64/libc.so.6) + +Error: aoti_torch_get_data_ptr(handle_.get(), &result) API call failed at /home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/include/torch/csrc/inductor/aoti_runtime/utils.h, line 117 +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cuda NVIDIA PG509-210 
+Loading model... +Time to load model: 6.35 seconds +----------------------------------------------------------- +Traceback (most recent call last): + File "/home/jackkhuu/oss/torchchat/torchchat.py", line 96, in + generate_main(args) + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 1247, in main + for _ in gen.chat(generator_args): + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 1116, in chat + for token_tensor, metrics in generator_func: + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 36, in generator_context + response = gen.send(None) + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 647, in generate + next_token = self.prefill( + File "/home/jackkhuu/oss/torchchat/torchchat/generate.py", line 398, in prefill + logits = model(x_sliced, ip_sliced) # (x[:, i], input_pos[i])da + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/jackkhuu/.conda/envs/testrunner/lib/python3.10/site-packages/torch/_export/__init__.py", line 387, in optimized + flat_outputs = runner.run(flat_inputs) # type: ignore[attr-defined] +RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles.size(), output_handles.data(), output_handles.size(), reinterpret_cast(stream_handle), proxy_executor_handle_) API call failed at /pytorch/torch/csrc/inductor/aoti_runner/model_container_runner.cpp, line 107 diff --git a/llama31-1218/cuda_aoti_8.txt b/llama31-1218/cuda_aoti_8.txt new file mode 100644 index 000000000..edbc9ec4c --- /dev/null +++ b/llama31-1218/cuda_aoti_8.txt @@ -0,0 +1,119 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-dso-path /tmp/model8.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model8.so --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-dso-path /tmp/model8.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:28:13.686731 1631740 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:28:13.687165 1631740 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:28:13.687381 1631740 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:28:13.687657 1631740 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
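Note: the RuntimeError at the end of the failed DSO run above and the deprecation banner just printed point at the same remedy. aten._weight_int4pack_mm had no C-shim during export, so the compiled artifact depends on the proxy executor, and a bare .so loaded through the deprecated torch._export.aot_compile()/aot_load() path finds no accompanying metadata (the error explicitly suggests a json file next to the .so, or packaging everything into a PT2 artifact). A minimal, hedged sketch of the replacement API named in the warning — not torchchat's actual export code; `model`, `example_inputs`, the output path, and the exact keyword names are assumptions that may differ across nightlies:

import torch

# Export to an ExportedProgram, then compile and package the generated code,
# weights, and custom-op/proxy-executor metadata into one .pt2 artifact.
ep = torch.export.export(model, example_inputs)
pt2_path = torch._inductor.aoti_compile_and_package(ep, package_path="/tmp/model.pt2")

# In the consuming process, load the package and call it like a module.
runner = torch._inductor.aoti_load_package(pt2_path)
output = runner(*example_inputs)

In torchchat terms this corresponds to exporting with --output-aoti-package-path and generating with --aoti-package-path, which is what the cuda_aoti_pt2_4.txt run later in this diff does.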
+Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 5.97 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.38 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model8.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model8.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model8.so --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:31:42.800761 1651896 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:31:42.801322 1651896 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:31:42.801504 1651896 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:31:42.801690 1651896 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +[W1218 20:31:45.602179216 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:45.602349016 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:45.609939364 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:45.610055760 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:45.617693449 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:45.617788824 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." 
(function run_impl) +[W1218 20:31:45.632980577 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:45.633093254 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 7.00 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of Provence, France, there lived a young boy named Pierre. Pierre was a curious and adventurous boy who loved nothing more than exploring the fields, forests, and villages that surrounded his home. +One day, while wandering through the village, Pierre stumbled upon a small, mysterious shop tucked away on a side street. The sign above the door read "Maison de Magie" (House of Magic), and the window displayed an assortment of strange and wondrous objects that seemed to shimmer and glow in the sunlight. +Pierre's curiosity was piqued, and he pushed open the door to reveal a dimly lit interior filled with rows of dusty shelves, colorful bottles, and peculiar contraptions that whirred and whizzed with soft humming noises. A figure emerged from the shadows, a wise and kindly old man with a long white beard and twinkling eyes. +"Bonjour, young Pierre," the old man said in a warm, +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 1.8658 sec total +Time to first token: 0.1900 sec with sequential prefill. + + Total throughput: 107.1912 tokens/sec, 0.0093 s/token +First token throughput: 5.2624 tokens/sec, 0.1900 s/token + Next token throughput: 118.7494 tokens/sec, 0.0084 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** +[W1218 20:31:47.454663167 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:47.454792461 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:47.462407192 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." 
(function run_impl) +[W1218 20:31:47.462541832 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:47.470147862 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:47.470275931 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:47.485473631 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:47.485587356 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, there was a magical kingdom hidden deep within a vast, enchanted forest. The kingdom was ruled by a wise and just king, who was loved by all his subjects. The king's most trusted advisor was a wise old wizard named Zarath, who possessed great knowledge and power. +One day, a young apprentice named Eryndor arrived at the kingdom, seeking to learn the ancient arts of magic from the great wizard Zarath. Eryndor was a skilled warrior, but he was determined to become a powerful wizard, just like his idol, Zarath. +Zarath, seeing the potential in the young apprentice, took Eryndor under his wing and began to teach him the intricacies of magic. Eryndor proved to be a quick learner, mastering spells and incantations with ease. He spent every waking moment studying and practicing, determined to become the greatest wizard of all time. +As time passed, Eryndor's skills improved dramatically, and he began +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 1.7017 sec total +Time to first token: 0.0414 sec with sequential prefill. + + Total throughput: 117.5274 tokens/sec, 0.0085 s/token +First token throughput: 24.1350 tokens/sec, 0.0414 s/token + Next token throughput: 119.8580 tokens/sec, 0.0083 s/token + +Bandwidth achieved: 0.00 GB/s +[W1218 20:31:49.156798800 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:49.156895446 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:49.164536369 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:49.164621418 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:49.172263213 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:49.172387187 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:49.187656990 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:31:49.187775543 cbmcj2dmlnltvqhdfh4tdkqxo4b5oww22djpo7usr3qp3mii4b3j.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, in a far-off land, there was a small village nestled between two great mountains. The villagers were simple folk, living off the land and making their living by farming and herding. They were a hardworking people, and their lives were marked by the changing seasons. +One day, a strange and wondrous event occurred in the village. A great, glowing crystal appeared in the center of the village square, filling the air with a soft, pulsing light. The villagers were amazed and a little frightened by this sudden appearance, and they gathered around the crystal in wonder. +As they watched, the crystal began to glow brighter and brighter, until it seemed to be radiating a warm, golden light. The villagers felt a strange sense of peace and tranquility wash over them, and they began to feel a deep connection to the crystal. +One of the villagers, a young woman named Aria, felt an especially strong connection to the crystal. She felt drawn to it, as if she was +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 1.7026 sec total +Time to first token: 0.0420 sec with sequential prefill. 
+ + Total throughput: 117.4643 tokens/sec, 0.0085 s/token +First token throughput: 23.8167 tokens/sec, 0.0420 s/token + Next token throughput: 119.8321 tokens/sec, 0.0083 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 114.06 +Average tokens/sec (first token): 17.74 +Average tokens/sec (next tokens): 119.48 + +Memory used: 0.05 GB diff --git a/llama31-1218/cuda_aoti_b16.txt b/llama31-1218/cuda_aoti_b16.txt new file mode 100644 index 000000000..6f1c53aea --- /dev/null +++ b/llama31-1218/cuda_aoti_b16.txt @@ -0,0 +1,119 @@ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-dso-path /tmp/model16.so +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model16.so --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-dso-path /tmp/model16.so +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:23:42.332001 1605281 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:23:42.332482 1605281 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:23:42.332674 1605281 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:23:42.332856 1605281 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 5.94 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model16.so +WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead. +The generated packaged model can be found at: /tmp/model16.so +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --dso-path /tmp/model16.so --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:27:40.073230 1629167 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:27:40.073839 1629167 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! 
| +W1218 20:27:40.074088 1629167 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:27:40.074266 1629167 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +[W1218 20:27:45.772529049 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:45.772695184 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:45.783607451 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:45.783685681 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:45.794621353 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:45.794706872 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:45.816522690 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:45.816643669 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 7.25 seconds +----------------------------------------------------------- +Once upon a time, there was a beautiful and enchanting forest called Luminaria. In the heart of Luminaria, there was a magnificent tree that stood tall and proud. Its name was Elyria, and it was said to be the wisest and most ancient tree in all the land. 
+Elyria was a majestic sight to behold, with leaves that shimmered like silver and a trunk that glowed with a soft, ethereal light. Its branches stretched up towards the sky, as if trying to reach the heavens themselves. Beneath its boughs, a soft, golden mist hovered, imbuing the air with an otherworldly scent. +For centuries, creatures of all kinds flocked to Elyria, seeking the ancient tree's counsel and wisdom. The creatures would gather at the base of the tree, and Elyria would share its knowledge with them, offering guidance and insight into the mysteries of the universe. +One day, a young traveler named Aria stumbled upon Lumin +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 2.5453 sec total +Time to first token: 0.2054 sec with sequential prefill. + + Total throughput: 78.5767 tokens/sec, 0.0127 s/token +First token throughput: 4.8692 tokens/sec, 0.2054 s/token + Next token throughput: 85.0461 tokens/sec, 0.0118 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** +[W1218 20:27:47.304206808 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:47.304334246 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:47.315306955 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:47.315432286 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:47.326418049 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:47.326525337 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:47.348440330 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:47.348548073 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, in the countryside of Finland, there was a small village surrounded by vast forests and the occasional lake. 
In this village, there lived a young woman named Aki. She was a skilled craftsman and artisan, known for her beautiful hand-woven baskets and intricate wood carvings. Aki lived a simple life, relying on the natural resources of the forest to create her wares. + +One day, a wealthy merchant named Olaf arrived in the village. He was a sharp business owner, and he had heard about Aki's exceptional craftsmanship. Olaf was impressed by Aki's baskets, which were sturdy, beautiful, and perfect for carrying goods. He approached Aki with a proposal: he wanted to buy all of her baskets and sell them in the cities, making a handsome profit. + +Aki was hesitant at first, but Olaf convinced her to sell her baskets to him. He promised her a good price and guaranteed that she would be famous throughout the land for her exceptional craftsmanship +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 2.3927 sec total +Time to first token: 0.0583 sec with sequential prefill. + + Total throughput: 83.5881 tokens/sec, 0.0120 s/token +First token throughput: 17.1515 tokens/sec, 0.0583 s/token + Next token throughput: 85.2474 tokens/sec, 0.0117 s/token + +Bandwidth achieved: 0.00 GB/s +[W1218 20:27:49.697225030 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:49.697339594 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:49.708285795 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:49.708372136 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:49.719316352 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:49.719465824 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:49.741387533 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:27:49.741522441 co6zzpgebfaztbdh6pt7cko6cjwmho5p7smhnx5ua2dr23o4uxyi.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." 
(function run_impl) + +======================================== + +Once upon a time, in the midst of a bustling metropolis, there was a small, quirky bookstore called "The Book Nook." The store was run by a kind-hearted owner named Emily, who was passionate about books and people. She spent her days surrounded by shelves upon shelves of novels, poetry collections, and children's books, each one lovingly placed to spark the imagination of her customers. The Book Nook was a cozy haven, where people could escape the hustle and bustle of city life and get lost in the world of stories. +One rainy afternoon, a young girl named Lily wandered into The Book Nook, shaking the rain from her umbrella and seeking refuge from the downpour. Her eyes widened as she stepped inside, taking in the warm glow of the store and the enticing aroma of old books. Emily, noticing Lily's curiosity, greeted her with a warm smile and invited her to explore the shelves. +As Lily browsed, she stumbled upon a peculiar book with a strange cover and a title +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 2.3978 sec total +Time to first token: 0.0585 sec with sequential prefill. + + Total throughput: 83.4093 tokens/sec, 0.0120 s/token +First token throughput: 17.0928 tokens/sec, 0.0585 s/token + Next token throughput: 85.0679 tokens/sec, 0.0118 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 81.86 +Average tokens/sec (first token): 13.04 +Average tokens/sec (next tokens): 85.12 + +Memory used: 0.05 GB diff --git a/llama31-1218/cuda_aoti_pt2_4.txt b/llama31-1218/cuda_aoti_pt2_4.txt new file mode 100644 index 000000000..00155765f --- /dev/null +++ b/llama31-1218/cuda_aoti_pt2_4.txt @@ -0,0 +1,338 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-aoti-package-path /tmp/model34.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model34.pt2 --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-aoti-package-path /tmp/model34.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:45:21.120409 1729349 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:45:21.120840 1729349 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:45:21.121033 1729349 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:45:21.121192 1729349 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. 
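The deprecation warning printed during export names its replacement API directly: package the exported program with torch._inductor.aoti_compile_and_package() and load it with torch._inductor.aoti_load_package(). A minimal sketch of that suggested flow follows; it only illustrates the warning's recommendation, not torchchat's actual export path, and model, example_inputs, and package_path are placeholder names.

import torch

def export_and_package(model: torch.nn.Module, example_inputs: tuple, package_path: str) -> str:
    # Capture the eager model as an ExportedProgram first.
    exported = torch.export.export(model, example_inputs)
    # Compile with AOT Inductor and write a self-contained .pt2 package to disk.
    return torch._inductor.aoti_compile_and_package(exported, package_path=package_path)

def load_and_run(package_path: str, example_inputs: tuple):
    # Load the packaged artifact and invoke it like a regular callable.
    compiled = torch._inductor.aoti_load_package(package_path)
    return compiled(*example_inputs)

At a high level this mirrors what the --output-aoti-package-path run in this file does: export once to /tmp/model34.pt2, then hand the package path to the generate step.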
+W1218 20:46:16.465686 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.467544 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.788190 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.869422 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.905183 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.906870 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.919432 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.957210 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:16.958714 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.132222 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.174525 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.210503 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.211761 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.220802 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.258308 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.259680 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.427675 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.464625 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.488438 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.489862 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is 
missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.499448 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.529007 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.530410 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.695004 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.733350 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.765392 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.766595 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.775712 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.811386 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.812732 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:17.979806 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.017590 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.039381 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.040876 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.049909 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.074753 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.076165 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.244242 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.283118 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.312961 1729349 
site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.314171 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.323915 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.359438 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.360628 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.530304 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.569142 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.591658 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.593145 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.602609 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.628443 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.629992 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.804650 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.843868 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.874268 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.875512 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.884989 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.920465 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:18.921745 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.092431 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim 
implementation, using proxy executor as fallback +W1218 20:46:19.130003 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.152019 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.153467 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.163017 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.187678 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.189476 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.357551 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.394279 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.423387 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.424555 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.433384 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.470442 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.471665 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.634660 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.671523 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.694029 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.695464 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.704744 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.731007 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.732405 1729349 
site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.898856 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.937233 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.966637 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.967890 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:19.977036 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.013014 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.014342 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.190135 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.233421 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.258804 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.263476 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.274563 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.301933 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.303448 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.488687 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.530357 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.562682 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.564024 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.574282 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim 
implementation, using proxy executor as fallback +W1218 20:46:20.614861 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.616139 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.790657 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.829637 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.855765 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.857404 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.868592 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.902563 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:20.904732 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.106086 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.148476 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.181625 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.182984 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.193009 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.231682 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.232983 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.408428 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.448967 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.471734 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.473220 1729349 
site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.483063 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.509639 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.511165 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.693006 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.746879 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.783743 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.785223 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.797593 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.835792 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:21.837201 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.005191 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.046932 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.069038 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.070684 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.079836 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.105181 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.106583 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.283936 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.324374 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim 
implementation, using proxy executor as fallback +W1218 20:46:22.356308 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.357868 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.367755 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.407168 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.408499 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.585803 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.636426 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.662883 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.664396 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.678917 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.709474 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.711118 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.886478 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.926027 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.956582 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.957885 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:22.967808 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.006169 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.007444 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.211189 1729349 
site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.266924 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.292163 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.293925 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.305099 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.338405 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.339977 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.563599 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.614309 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.647895 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.649181 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.658919 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.700968 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.702237 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.886346 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.928838 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.952301 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.953792 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.964244 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:23.992369 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim 
implementation, using proxy executor as fallback +W1218 20:46:23.994292 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.178018 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.218735 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.251069 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.252264 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.261217 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.298934 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.300217 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.466638 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.507352 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.529404 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.530803 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.540035 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.565938 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.567897 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.740230 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.781826 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.813966 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.815215 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.824795 1729349 
site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.860429 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:24.861671 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.034183 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.074481 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.096683 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.098379 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.107500 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.131445 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.132549 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.299923 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.338064 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.369035 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.370257 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.379743 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.415671 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.416880 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.586826 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.625188 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.647928 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim 
implementation, using proxy executor as fallback +W1218 20:46:25.649308 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.658444 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.682103 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.683469 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.844792 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.885546 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.914999 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.916060 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.924618 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +W1218 20:46:25.944749 1729349 site-packages/torch/_inductor/ir.py:6603] [0/0] aten._weight_int4pack_mm.default is missing a c-shim implementation, using proxy executor as fallback +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 6.71 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.50 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model34.pt2 +The generated packaged model can be found at: /tmp/model34.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model34.pt2 --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +[W1218 20:48:29.512912813 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:29.513094136 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." 
(function run_impl) +[W1218 20:48:29.519145128 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:29.519231125 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:29.525247092 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:29.525334080 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:29.537342602 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:29.537427316 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 7.75 seconds +----------------------------------------------------------- +Once upon a time, there was a clever and quick-witted young lad named Jack. Jack lived in a small village surrounded by vast fields and dense forests. He was known for his cunning and wit, and people often sought his advice on matters that required cleverness and strategy. +One sunny afternoon, Jack was sitting outside his village, playing a game of chess with a group of friends. They were all discussing a grand plan to build a magnificent new bridge that would connect their village to a nearby town. The villagers had been discussing this plan for months, but they couldn’t agree on who should fund the project. +As Jack was focused on the game, a clever idea suddenly struck him. He suddenly stopped playing chess and exclaimed, “Friends! I have a brilliant idea that will solve our dilemma.” His friends turned to him with curiosity and asked, “What is it, Jack?” Jack thought for a moment and then said, “We will build the bridge and then auction it off to the highest bidder. Whoever wins +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 1.5175 sec total +Time to first token: 0.1717 sec with sequential prefill. 
+ + Total throughput: 131.7973 tokens/sec, 0.0076 s/token +First token throughput: 5.8242 tokens/sec, 0.1717 s/token + Next token throughput: 147.8691 tokens/sec, 0.0068 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** +[W1218 20:48:31.016062049 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:31.016155569 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:31.022185362 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:31.022273344 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:31.028300182 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:31.028384715 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:31.040433087 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:31.040538332 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Emily. Emily was a curious and adventurous child, always eager to explore the world around her. She loved nothing more than to spend her days wandering through the fields and forests, discovering hidden streams and secret glades. + +One day, as Emily was wandering through the woods, she stumbled upon a small clearing. In the center of the clearing stood an enormous tree, its trunk gnarled and twisted with age. The tree seemed to loom over the clearing, casting a shadow that stretched out in all directions. Emily felt a shiver run down her spine as she approached the tree, feeling as though she was being watched. + +As she drew closer, she noticed that the tree was covered in strange symbols. They seemed to be some sort of language, but Emily had never seen anything like them before. 
She felt a sudden urge to touch one of the symbols, and as she reached out, a loud +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 1.3659 sec total +Time to first token: 0.0353 sec with sequential prefill. + + Total throughput: 146.4218 tokens/sec, 0.0068 s/token +First token throughput: 28.3165 tokens/sec, 0.0353 s/token + Next token throughput: 149.5564 tokens/sec, 0.0067 s/token + +Bandwidth achieved: 0.00 GB/s +[W1218 20:48:32.382356630 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:32.382449619 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:32.388466095 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:32.388539531 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:32.394558918 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:32.394629422 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:32.406656855 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6940] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:48:32.406736062 cf365z5wc4kdvgayh6jymwqzclrdao6z5qmdwzzsqqsxrjowv5sn.cpp:6947] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, I had a pair of pliers that were very special. They were a gift from my father, who was a skilled craftsman and a master of his trade. The pliers were engraved with intricate designs and symbols, each one representing a different tool or technique in his trade. +My father was a master carpenter, and his workshop was filled with all sorts of interesting tools and gadgets. He had a collection of hand tools, including chisels, hammers, and saws, as well as power tools like drills and sanders. He spent his days building and repairing all sorts of things, from furniture to homes to bridges. +The pliers were one of his favorite tools, and he used them for all sorts of tasks. 
He could grip anything from a small nail to a large screw nut, and he could bend the metal to get a good grip. He was especially proud of the way he could make the pliers look like a combination of a pair of scissors and a wrench +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 1.3910 sec total +Time to first token: 0.0354 sec with sequential prefill. + + Total throughput: 143.7809 tokens/sec, 0.0070 s/token +First token throughput: 28.2736 tokens/sec, 0.0354 s/token + Next token throughput: 146.7945 tokens/sec, 0.0068 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 140.67 +Average tokens/sec (first token): 20.80 +Average tokens/sec (next tokens): 148.07 + +Memory used: 0.05 GB diff --git a/llama31-1218/cuda_aoti_pt2_8.txt b/llama31-1218/cuda_aoti_pt2_8.txt new file mode 100644 index 000000000..69636bdf5 --- /dev/null +++ b/llama31-1218/cuda_aoti_pt2_8.txt @@ -0,0 +1,114 @@ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-aoti-package-path /tmp/model8.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model8.pt2 --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-aoti-package-path /tmp/model8.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:41:19.819849 1706108 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:41:19.820317 1706108 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:41:19.820508 1706108 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:41:19.820676 1706108 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda +Setting max_seq_length to 300 for DSO export. +Loading model... +Time to load model: 6.60 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.40 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model8.pt2 +The generated packaged model can be found at: /tmp/model8.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model8.pt2 --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. 
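A note on how the per-sample figures in these logs fit together: each sample reports 199 generated tokens plus the first (prefill) token, and the reported total, first-token, and next-token throughputs appear consistent with the simple arithmetic sketched below. This is a reconstruction from the printed numbers (modulo rounding of the logged times), not code taken from torchchat itself.

# Sketch (assumption): how the per-sample throughput figures appear to be derived.
# Reconstructed from the logged numbers; not the torchchat implementation.
def throughputs(num_new_tokens: int, total_time_s: float, ttft_s: float):
    total = (num_new_tokens + 1) / total_time_s        # first token + decode tokens
    first = 1.0 / ttft_s                               # first-token rate
    rest = num_new_tokens / (total_time_s - ttft_s)    # decode-only rate
    return total, first, rest

# Sample 3 of the CPU int4 AOTI run above: 199 tokens, 1.3910 s total, 0.0354 s to first token
print(throughputs(199, 1.3910, 0.0354))
# -> roughly (143.8, 28.2, 146.8), matching the reported
#    143.7809 / 28.2736 / 146.7945 tokens/sec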
+[W1218 20:44:54.268953437 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:54.269164436 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:54.276771686 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:54.276883082 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:54.284506385 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:54.284609253 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:54.299880626 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:54.299997726 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 7.05 seconds +----------------------------------------------------------- +Once upon a time, in a far-off kingdom, there lived a kind and gentle princess named Sophia. She was loved by all who knew her, and she loved nothing more than helping those in need. +One day, a terrible drought struck the kingdom, and the land became parched and barren. The people were suffering, and the crops were withering away. Sophia knew that she had to do something to help her people, so she set out on a journey to find a way to bring water back to the kingdom. +She traveled far and wide, meeting all sorts of creatures along the way. She met a wise old owl who told her of a magical spring hidden deep within the forest. 
She met a mischievous band of fairies who offered to guide her through the treacherous paths. And she even met a brave knight who joined her on her quest. +Together, they journeyed deeper into the forest, facing many challenges and obstacles along the way. They crossed rushing rivers, climbed steep mountains, and +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 1.8699 sec total +Time to first token: 0.1933 sec with sequential prefill. + + Total throughput: 106.9556 tokens/sec, 0.0093 s/token +First token throughput: 5.1722 tokens/sec, 0.1933 s/token + Next token throughput: 118.6932 tokens/sec, 0.0084 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** +[W1218 20:44:56.124028221 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:56.124154069 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:56.131776376 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:56.131867475 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:56.139422326 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:56.139504760 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:56.154776906 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:56.154920098 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Sophia. Sophia was a kind and gentle soul, with a heart full of love for all living things. She spent her days helping her mother with the family farm, tending to the animals, and learning the secrets of the land from her wise and aged grandmother, Olga. 
+One day, while out collecting wild herbs in the nearby forest, Sophia stumbled upon a hidden glade. In the center of the clearing stood an enormous tree, its branches twisted and gnarled with age. The tree seemed to be watching her, and Sophia felt an inexplicable sense of wonder and awe wash over her. +As she approached the tree, a soft, golden light began to emanate from its trunk. The light grew brighter and brighter, until Sophia felt herself being drawn into its warmth. She closed her eyes, and when she opened them again, she found herself sitting at the base of the tree, +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 1.7020 sec total +Time to first token: 0.0414 sec with sequential prefill. + + Total throughput: 117.5111 tokens/sec, 0.0085 s/token +First token throughput: 24.1696 tokens/sec, 0.0414 s/token + Next token throughput: 119.8367 tokens/sec, 0.0083 s/token + +Bandwidth achieved: 0.00 GB/s +[W1218 20:44:58.826271512 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:58.826369055 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:58.833979281 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:58.834067922 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:58.841663859 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:58.841746947 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:58.856941467 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6944] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:44:58.857075845 ci27ajwi2j7wl5u47bddskaatqiq737hmrkjc5xkuvhkkfna5lmk.cpp:6951] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, in a far-off land, there was a young girl named Maria. Maria was a curious and adventurous child, always eager to explore the world around her. She loved to run through the woods, play in the streams, and climb trees. 
Maria was especially fascinated by the creatures of the forest, and spent hours watching the birds, squirrels, and rabbits that lived there. +One day, while wandering through the woods, Maria stumbled upon a hidden clearing. In the center of the clearing stood an enormous tree, its trunk as wide as a house and its branches reaching up to the sky. Maria approached the tree, feeling a sense of wonder and awe wash over her. As she reached out to touch the trunk, a soft, melodious voice spoke to her. +"Welcome, Maria," said the tree. "I have been waiting for you." +Maria was startled, but she felt no fear. Instead, she felt a deep sense of connection to the tree. +"How did you know my name +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 1.7068 sec total +Time to first token: 0.0413 sec with sequential prefill. + + Total throughput: 117.1785 tokens/sec, 0.0085 s/token +First token throughput: 24.2319 tokens/sec, 0.0413 s/token + Next token throughput: 119.4815 tokens/sec, 0.0084 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 113.88 +Average tokens/sec (first token): 17.86 +Average tokens/sec (next tokens): 119.34 + +Memory used: 0.05 GB diff --git a/llama31-1218/cuda_aoti_pt2_b16.txt b/llama31-1218/cuda_aoti_pt2_b16.txt new file mode 100644 index 000000000..3d097b131 --- /dev/null +++ b/llama31-1218/cuda_aoti_pt2_b16.txt @@ -0,0 +1,117 @@ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-aoti-package-path /tmp/model16.pt2 +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model16.pt2 --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +python3 torchchat.py export llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-aoti-package-path /tmp/model16.pt2 +Note: NumExpr detected 22 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +NumExpr defaulting to 16 threads. +PyTorch version 2.6.0.dev20241218+cu124 available. +W1218 20:36:00.442827 1674292 site-packages/torch/_export/__init__.py:276] +============================+ +W1218 20:36:00.443327 1674292 site-packages/torch/_export/__init__.py:277] | !!! WARNING !!! | +W1218 20:36:00.443539 1674292 site-packages/torch/_export/__init__.py:278] +============================+ +W1218 20:36:00.443731 1674292 site-packages/torch/_export/__init__.py:279] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda +Setting max_seq_length to 300 for DSO export. +Loading model... 
+Time to load model: 5.87 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Exporting model using AOT Inductor to /tmp/model16.pt2 +The generated packaged model can be found at: /tmp/model16.pt2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --aoti-package-path /tmp/model16.pt2 --prompt "Once upon a time," --max-new-tokens 200 --device cuda --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +[W1218 20:40:46.596597380 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:46.596741672 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:46.607672802 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:46.607811251 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:46.618749383 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:46.618883811 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:46.640736941 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:46.640903632 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Warning: checkpoint path ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument +Using device=cuda NVIDIA PG509-210 +Loading model... 
+Time to load model: 7.99 seconds +----------------------------------------------------------- +Once upon a time, there was a little girl named Sophia who lived in a small village surrounded by a beautiful forest. Sophia loved to explore the forest and all its secrets. She would spend hours wandering through the trees, discovering hidden streams and secret meadows. + +One day, Sophia stumbled upon a hidden path she had never seen before. The path was narrow and winding, and it seemed to lead deeper into the forest than Sophia had ever gone before. Her curiosity was piqued, and she decided to follow the path to see where it would take her. + +As she walked, the trees grew taller and the air grew thick with the scent of wildflowers. Sophia felt as though she had entered a magical world, one that was full of wonder and enchantment. She walked for what felt like hours, the path twisting and turning through the forest, until she came to a clearing. + +In the center of the clearing stood an enormous tree, its trunk as wide as a house and its branches reaching up to the sky like +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 2.5111 sec total +Time to first token: 0.1777 sec with sequential prefill. + + Total throughput: 79.6449 tokens/sec, 0.0126 s/token +First token throughput: 5.6290 tokens/sec, 0.1777 s/token + Next token throughput: 85.2798 tokens/sec, 0.0117 s/token + +Bandwidth achieved: 0.00 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** +[W1218 20:40:49.097018055 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:49.097118005 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:49.108051462 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:49.108127290 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:49.119061657 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:49.119144540 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:49.141118668 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." 
(function run_impl) +[W1218 20:40:49.141200457 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, in a land far, far away, there was a small village nestled in the rolling hills of the countryside. The villagers lived simple lives, working the land and tending to their families. But amidst the tranquility, a sense of unease settled over the village. People began to disappear, one by one, without explanation or warning. +At first, the villagers thought it was just the usual wandering off or moving to a nearby town, but as the days passed and more people vanished, they realized something was terribly wrong. No one knew where they had gone or who might be responsible. +A young girl named Lily, with a heart full of courage and a spirit that refused to be tamed, decided to take matters into her own hands. She had lost her mother in one of the disappearances and was determined to find out what had happened to her and the others. +Lily set out early one morning, armed with a small satchel of provisions and a fierce determination. She walked +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 2.3838 sec total +Time to first token: 0.0575 sec with sequential prefill. + + Total throughput: 83.8986 tokens/sec, 0.0119 s/token +First token throughput: 17.3975 tokens/sec, 0.0575 s/token + Next token throughput: 85.5417 tokens/sec, 0.0117 s/token + +Bandwidth achieved: 0.00 GB/s +[W1218 20:40:51.481301973 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:51.481444721 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:51.492388335 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:51.492505894 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:51.503454922 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:51.503565765 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." 
(function run_impl) +[W1218 20:40:51.525511259 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4473] Warning: "Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) +[W1218 20:40:51.525665615 cdxn63ic7ggyjxgsm7ylanxntvodcwwexnbz4vnsl7tytks2biuh.cpp:4480] Warning: "Input 1 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit." (function run_impl) + +======================================== + +Once upon a time, there was a very old man who lived in a small village nestled in the mountains. This old man was known throughout the village as the wisest and most knowledgeable person in the community. Everyone would go to him for advice, guidance, and to learn from his vast knowledge. But what made this old man truly unique was that he was also a master storyteller. +The old man's stories were not just simple tales to entertain the villagers; they were full of wisdom, love, and deep understanding of the world and the human condition. His stories were a way of sharing his life experiences, guiding the villagers, and helping them to understand themselves and the world around them better. +As the villagers would sit around the fire in the evening, listening to the old man's stories, they would feel a sense of peace, calmness, and connection with each other and with the world. The old man's stories were not only entertaining but also transformative. +One evening, a young boy named Kaito +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 2.3953 sec total +Time to first token: 0.0582 sec with sequential prefill. + + Total throughput: 83.4967 tokens/sec, 0.0120 s/token +First token throughput: 17.1732 tokens/sec, 0.0582 s/token + Next token throughput: 85.1492 tokens/sec, 0.0117 s/token + +Bandwidth achieved: 0.00 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 82.35 +Average tokens/sec (first token): 13.40 +Average tokens/sec (next tokens): 85.32 + +Memory used: 0.05 GB diff --git a/llama31-1218/cuda_compile_4.txt b/llama31-1218/cuda_compile_4.txt new file mode 100644 index 000000000..18abc4559 --- /dev/null +++ b/llama31-1218/cuda_compile_4.txt @@ -0,0 +1,72 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... 
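This run and the two after it pass --compile, so the first sample of each includes torch.compile JIT time (the logs flag this with "Excluding compile in calculations"). The end-of-run averages appear to be plain means over the per-sample total throughputs, with that first sample dropped for compiled runs. The sketch below illustrates that aggregation using figures from the int4 compile run that follows; it is an assumption reconstructed from the reported numbers, not the actual torchchat code.

# Sketch (assumption): how the end-of-run averages appear to aggregate the
# per-sample throughputs. Not taken from the torchchat source.
def average_tokens_per_sec(per_sample, compiled=False):
    # For --compile runs the first sample carries JIT compilation cost and is excluded.
    samples = per_sample[1:] if compiled else per_sample
    return sum(samples) / len(samples)

# Per-sample total throughput of the int4 --compile run below:
print(average_tokens_per_sec([0.7758, 37.3294, 57.6975], compiled=True))  # ~47.51, as reported
# Eager runs keep all samples, e.g. the bfloat16 eager run later in these logs:
print(average_tokens_per_sec([15.5829, 16.1593, 15.1426]))                # ~15.63, as reported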
+Time to load model: 6.13 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.53 seconds +----------------------------------------------------------- +Once upon a time, there was a beautiful village surrounded by dense forests and rolling hills. The villagers lived in harmony with nature, respecting the land and its resources. They built their homes with natural materials, planted trees and flowers, and worked in the fields from dawn till dusk. +One day, a young boy named Kaito wandered into the forest, curious about the strange sounds and sights. He had heard stories of a magical spring that bubbled out of the earth, but couldn't find it anywhere. His friends teased him for being too lazy, too slow, or too weak to find it. +Kaito wouldn't give up. He kept exploring until the sun began to set, and even as the stars started to twinkle in the sky. He followed his instincts and sense of smell, through the forest, until he stumbled upon a small clearing. At its center stood an enormous tree, its branches like giant arms embracing the sky. +In the tree's trunk was a small entrance, guarded by a wise +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 257.7957 sec total +Time to first token: 0.3057 sec with parallel prefill. + + Total throughput: 0.7758 tokens/sec, 1.2890 s/token +First token throughput: 3.2709 tokens/sec, 0.3057 s/token + Next token throughput: 0.7728 tokens/sec, 1.2939 s/token + +Bandwidth achieved: 12.46 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 2.6e+02 seconds + +======================================== + +Once upon a time, there was a young girl named Lily. She was a shy and quiet child who rarely spoke up in class or in front of her family. Her mother, a kind and patient woman, tried to encourage Lily to be more outgoing and talkative, but Lily just couldn’t seem to muster up the courage. +One day, Lily’s teacher, Mrs. Johnson, announced that the class would be putting on a school play. Lily was intrigued by the idea and decided to sign up. Mrs. Johnson was thrilled and began to assign roles to the students. +Lily was surprised to find out that she was chosen to play the lead role in the play. She was hesitant at first, but with Mrs. Johnson’s encouragement, she began to practice her lines and rehearse her performance. +As the big night approached, Lily became more and more anxious. She was afraid of making mistakes and feared that everyone would laugh at her. Mrs. Johnson saw Lily’s distress and took her aside for a talk. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 5.3577 sec total +Time to first token: 0.0916 sec with parallel prefill. + + Total throughput: 37.3294 tokens/sec, 0.0268 s/token +First token throughput: 10.9147 tokens/sec, 0.0916 s/token + Next token throughput: 37.7889 tokens/sec, 0.0265 s/token + +Bandwidth achieved: 599.53 GB/s + +======================================== + +Once upon a time, there were three friends, Alex, Ben and Charlie. They were all high school students, and they decided to have a road trip to Las Vegas one summer. The trip was planned to be a 5-day long journey of fun and excitement. +Day 1: The group set off in the morning from their hometown in a car, with snacks, music, and a map to guide them. 
The temperature was hot, but they were excited to start their adventure. They drove for 6 hours before reaching their first stop, the Grand Canyon. They spent the day exploring the Canyon, taking pictures and gazing at the majestic beauty. In the evening, they headed to their hotel, and had a dinner of burgers and french fries. +Day 2: The next morning, they started driving to Las Vegas, a 4-hour drive from their previous night’s hotel. On their way, they stopped at a rest stop where a group of musicians were performing. The trio enjoyed the music and even +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 3.4664 sec total +Time to first token: 0.0975 sec with parallel prefill. + + Total throughput: 57.6975 tokens/sec, 0.0173 s/token +First token throughput: 10.2556 tokens/sec, 0.0975 s/token + Next token throughput: 59.0707 tokens/sec, 0.0169 s/token + +Bandwidth achieved: 926.65 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 47.51 +Average tokens/sec (first token): 10.59 +Average tokens/sec (next tokens): 48.43 + +Memory used: 24.75 GB diff --git a/llama31-1218/cuda_compile_8.txt b/llama31-1218/cuda_compile_8.txt new file mode 100644 index 000000000..7a55adde2 --- /dev/null +++ b/llama31-1218/cuda_compile_8.txt @@ -0,0 +1,72 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 5.76 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.41 seconds +----------------------------------------------------------- +Once upon a time, there was a young professional woman named Sarah who was struggling to start her career. She had recently graduated with a degree in business and was eager to get a feel for the corporate world. After several weeks of applying to jobs and attending networking events, Sarah found herself feeling discouraged and unsure if she was cut out for the fast-paced business world. +One day, while browsing through a local bookstore, Sarah stumbled upon a self-help book titled, “The Power of Vulnerability: A Guide to Achieving Success by Embracing Your True Self.” On a whim, she purchased the book and began reading it on her commute to work. +As she turned the pages, Sarah discovered a wealth of advice and inspiration that resonated deeply with her. The author, a successful businesswoman who had faced many challenges throughout her career, wrote about the importance of embracing vulnerability, taking risks, and being true to oneself. 
+Sarah was particularly drawn to the author’s insights on how to build meaningful relationships with colleagues and mentors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 248.5232 sec total +Time to first token: 0.5963 sec with parallel prefill. + + Total throughput: 0.8048 tokens/sec, 1.2426 s/token +First token throughput: 1.6771 tokens/sec, 0.5963 s/token + Next token throughput: 0.8027 tokens/sec, 1.2459 s/token + +Bandwidth achieved: 6.89 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 2.5e+02 seconds + +======================================== + +Once upon a time, in a land far, far away, there lived a humble baker named Max. Max had a small bakery on the corner of a bustling street, where he spent his days kneading dough, baking bread, and making pastries that made everyone’s mouth water. +Max loved his job, but he had a secret: he was tired of making the same old recipes over and over again. He longed to create something new, something exciting, something that would make people talk. So, one day, he decided to take a risk and try something entirely different. +Max had always been fascinated by the world of science and chemistry, and he had a curiosity about how different ingredients reacted with each other. He spent hours in his bakery, experimenting with different combinations of ingredients, testing his theories, and refining his recipes. +As he mixed and matched, Max stumbled upon an innovative creation: a croissant-doughnut hybrid that he called the “Croissant-nut.” The Croissant-nut was a +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 4.3505 sec total +Time to first token: 0.0815 sec with parallel prefill. + + Total throughput: 45.9718 tokens/sec, 0.0218 s/token +First token throughput: 12.2774 tokens/sec, 0.0815 s/token + Next token throughput: 46.6146 tokens/sec, 0.0215 s/token + +Bandwidth achieved: 393.47 GB/s + +======================================== + +Once upon a time, I was a young and ambitious journalist, eager to make a name for myself in the world of investigative reporting. I had just landed a job at a small, independent newspaper in a city known for its rich history and complex politics. +My editor, a seasoned journalist with a keen eye for detail, assigned me to cover a story about a local businessman who was embroiled in a scandal. The businessman, a wealthy and influential man, had been accused of embezzling funds from his company and using them to finance his lavish lifestyle. +I threw myself into the investigation, pouring over financial records and conducting interviews with sources who had come forward to speak out against the businessman. As I dug deeper, I began to uncover a web of deceit and corruption that went all the way to the top of the city's power structure. +But as I continued to investigate, I started to receive strange phone calls and messages. They were from an unknown number, and the messages were always brief and cryptic. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 2.1550 sec total +Time to first token: 0.0772 sec with parallel prefill. 
+ + Total throughput: 92.8085 tokens/sec, 0.0108 s/token +First token throughput: 12.9556 tokens/sec, 0.0772 s/token + Next token throughput: 95.7749 tokens/sec, 0.0104 s/token + +Bandwidth achieved: 794.34 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 69.39 +Average tokens/sec (first token): 12.62 +Average tokens/sec (next tokens): 71.19 + +Memory used: 28.81 GB diff --git a/llama31-1218/cuda_compile_b16.txt b/llama31-1218/cuda_compile_b16.txt new file mode 100644 index 000000000..2d06f469a --- /dev/null +++ b/llama31-1218/cuda_compile_b16.txt @@ -0,0 +1,74 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --compile --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 5.79 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Sophie. Sophie was a curious and adventurous child, always eager to explore the world around her. One day, while wandering through the village, Sophie stumbled upon a small, mysterious shop tucked away on a quiet street. The sign above the door read "Curios and Wonders," and the windows were filled with an assortment of odd and fascinating items. + +Sophie's curiosity was piqued, and she pushed open the door to venture inside. The shop was dimly lit, with shelves upon shelves of peculiar objects that seemed to stretch up to the ceiling. There were vintage dolls with glassy eyes, antique clockwork mechanisms, and strange, glowing orbs that seemed to pulse with an otherworldly energy. + +At the back of the shop, behind a counter made of polished wood, stood an old man with a kind face and a twinkle in his eye. He introduced himself as Mr. Jenkins +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 223.4696 sec total +Time to first token: 0.6419 sec with parallel prefill. + + Total throughput: 0.8950 tokens/sec, 1.1173 s/token +First token throughput: 1.5580 tokens/sec, 0.6419 s/token + Next token throughput: 0.8931 tokens/sec, 1.1197 s/token + +Bandwidth achieved: 14.37 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches, JIT compilation. *** +just-in-time compilation time (incl run time): 2.2e+02 seconds + +======================================== + +Once upon a time, in a far-off land, there was a magnificent bird named Philemon. Philemon was a majestic creature with shimmering feathers as blue as the sky and a song as sweet as the morning dew. 
He lived in a lush forest where he would soar through the skies, singing his heart out, and spreading joy to all who heard him. + +Philemon was known throughout the land for his enchanting voice. People would travel from far and wide to hear him sing. His melodies were so captivating that they could lift the spirits of even the saddest of hearts. One day, a fierce storm rolled in, bringing with it strong winds and heavy rains. The forest was battered, and many of the birds took refuge in their nests, frightened by the turbulence. + +Philemon, however, did not flee. Instead, he flew out into the storm, his wings beating fiercely against the wind. With his powerful voice, he sang of courage and perseverance. He sang of hope and resilience +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 4.3986 sec total +Time to first token: 0.0631 sec with parallel prefill. + + Total throughput: 45.4693 tokens/sec, 0.0220 s/token +First token throughput: 15.8387 tokens/sec, 0.0631 s/token + Next token throughput: 45.9008 tokens/sec, 0.0218 s/token + +Bandwidth achieved: 730.26 GB/s + +======================================== + +Once upon a time, there was a couple, Manfred and Kristin, who thought they had finally found the perfect home. They had been house hunting for months, and their new home was in a beautiful neighborhood, had a spacious backyard, and more than enough space for their growing family. +However, as they began to settle in, strange things started to happen. Doors would open by themselves, and Kristin would find things moved from their original place. Manfred would often wake up in the middle of the night to find himself alone, even though Kristin was right next to him. They would hear footsteps and whispers in the night. +At first, they thought it was just the usual settling of an old house, but as the occurrences became more frequent and intense, they began to suspect that their new home was haunted. +Manfred, being a skeptic, tried to find rational explanations for the strange events. He checked the house for any hidden rooms, secret passages, or other structural issues that could be causing the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 2.5217 sec total +Time to first token: 0.0602 sec with parallel prefill. 
+ + Total throughput: 79.3121 tokens/sec, 0.0126 s/token +First token throughput: 16.6227 tokens/sec, 0.0602 s/token + Next token throughput: 80.8442 tokens/sec, 0.0124 s/token + +Bandwidth achieved: 1273.79 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 62.39 +Average tokens/sec (first token): 16.23 +Average tokens/sec (next tokens): 63.37 + +Memory used: 16.91 GB diff --git a/llama31-1218/cuda_eager_4.txt b/llama31-1218/cuda_eager_4.txt new file mode 100644 index 000000000..44f2b5eac --- /dev/null +++ b/llama31-1218/cuda_eager_4.txt @@ -0,0 +1,69 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int4": {"groupsize": 256}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 6.25 seconds +Quantizing the model with: {'linear:int4': {'groupsize': 256}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.59 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of rural Ireland, there lived a young girl named Saoirse. She was a bright and curious child, with a mop of curly brown hair and a smile that could light up the darkest of rooms. Saoirse loved nothing more than to spend her days exploring the countryside, chasing after butterflies and watching the clouds drift lazily across the sky. +One day, while wandering through the village, Saoirse stumbled upon a small, mysterious shop. The sign above the door read "Curios and Wonders," and the windows were filled with a dazzling array of strange and exotic treasures. There were glittering gemstones, ancient artifacts, and all manner of curious knick-knacks that Saoirse had never seen before. +The shop was owned by a kind and enigmatic old man named Seamus, who welcomed Saoirse with a warm smile and invited her to come inside. As she browsed the shelves and displays, Saoirse found herself captivated by +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 19.6475 sec total +Time to first token: 0.3485 sec with parallel prefill. + + Total throughput: 10.1794 tokens/sec, 0.0982 s/token +First token throughput: 2.8698 tokens/sec, 0.3485 s/token + Next token throughput: 10.3114 tokens/sec, 0.0970 s/token + +Bandwidth achieved: 163.49 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, in the beautiful country of Nepal, there lived a young girl named Pooja. Pooja was a bright and curious child who loved nothing more than playing outside in the fresh air. 
She spent most of her days exploring the lush green forests and rolling hills of her village, collecting flowers and berries, and watching the birds and animals that lived there. +One day, while wandering through the forest, Pooja stumbled upon a beautiful waterfall. The water cascaded down in a crystal clear stream, creating a misty veil that rose high into the air. Pooja was mesmerized by the sight and decided to follow the stream to its source. As she walked, the stream narrowed into a tiny brook that flowed gently through the rocks and moss. Pooja followed the brook until she came to a clearing, where she found a small cave. +The cave was hidden behind the waterfall and could only be seen from a specific point. Inside, Pooja discovered a chest filled +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 19.1163 sec total +Time to first token: 0.0868 sec with parallel prefill. + + Total throughput: 10.4623 tokens/sec, 0.0956 s/token +First token throughput: 11.5195 tokens/sec, 0.0868 s/token + Next token throughput: 10.4574 tokens/sec, 0.0956 s/token + +Bandwidth achieved: 168.03 GB/s + +======================================== + +Once upon a time, there was a great king named Shishupal who ruled a beautiful kingdom surrounded by high mountains. He had a beautiful palace with a grandeur that would make anyone feel awestruck. His palace was filled with precious stones, golden ornaments, and an endless variety of delicious dishes that would make your mouth water. +One sunny day, a wise old man came to King Shishupal’s palace. The king welcomed him with great warmth and asked him what he wanted. The wise old man replied, “King, I’ve heard that your kingdom is filled with wealth and precious stones. Is there any truth in what I’ve heard?” +The king replied, “Of course there is. My kingdom has an abundance of precious stones. In fact, there’s a cave in the heart of these mountain ranges that’s filled with diamonds, rubies, and emeralds. The cave is said to be owned by the great sage, Bhishmapithamah.” +The old wise man asked for +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 18.9365 sec total +Time to first token: 0.0846 sec with parallel prefill. 
+ + Total throughput: 10.5616 tokens/sec, 0.0947 s/token +First token throughput: 11.8202 tokens/sec, 0.0846 s/token + Next token throughput: 10.5559 tokens/sec, 0.0947 s/token + +Bandwidth achieved: 169.62 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 10.40 +Average tokens/sec (first token): 8.74 +Average tokens/sec (next tokens): 10.44 + +Memory used: 24.80 GB diff --git a/llama31-1218/cuda_eager_8.txt b/llama31-1218/cuda_eager_8.txt new file mode 100644 index 000000000..9ae42a718 --- /dev/null +++ b/llama31-1218/cuda_eager_8.txt @@ -0,0 +1,70 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"linear:int8": {"groupsize": 0}, "precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. +Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 6.10 seconds +Quantizing the model with: {'linear:int8': {'groupsize': 0}, 'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.42 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Lily. Lily was a curious and adventurous child, always eager to explore the world around her. She loved nothing more than to wander through the village, discovering new sights, sounds, and smells. +One day, while out exploring, Lily stumbled upon a small, mysterious shop tucked away on a quiet street. The sign above the door read "Curios and Wonders," and the windows were filled with all manner of strange and fascinating objects. +Lily's curiosity was piqued, and she pushed open the door to reveal a dimly lit interior filled with rows upon rows of shelves, each one packed with an assortment of peculiar items. There were vintage toys, antique furniture, and even what appeared to be a taxidermied owl perched on a shelf, watching her every move. +As she wandered deeper into the shop, Lily came across a small, glass-fronted cabinet filled with an +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 16.5574 sec total +Time to first token: 0.5756 sec with parallel prefill. + + Total throughput: 12.0792 tokens/sec, 0.0828 s/token +First token throughput: 1.7373 tokens/sec, 0.5756 s/token + Next token throughput: 12.4517 tokens/sec, 0.0803 s/token + +Bandwidth achieved: 103.38 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, I had the pleasure of photographing the newlywed couple, Ashley and Drew for their engagement session in downtown Boise. It was a beautiful day filled with laughter, exploring, and capturing some amazing moments between these two. 
Their love for each other shines through in every photo we took that day. I love how their personalities and quirkiness came through, and I was so happy to be a part of their special moments. +Ashley and Drew were a blast to work with. They had such a great dynamic and natural chemistry. From the moment we started shooting, they both had a infectious energy that made the whole experience so enjoyable. They laughed, joked, and even did a little jig or two, which made my job so much easier. It was clear that they have found their partner in crime and that their love for each other is the real deal. +One of my favorite parts of the session was when we were walking through the city streets and they stumbled upon a quirky little shop with a vintage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 16.1635 sec total +Time to first token: 0.0700 sec with parallel prefill. + + Total throughput: 12.3736 tokens/sec, 0.0808 s/token +First token throughput: 14.2776 tokens/sec, 0.0700 s/token + Next token throughput: 12.3653 tokens/sec, 0.0809 s/token + +Bandwidth achieved: 105.90 GB/s + +======================================== + +Once upon a time, in a world not so different from our own, there was a small village nestled in the mountains. The villagers lived simple lives, farming, hunting, and trading among themselves. They were a tight-knit community, looking out for one another and relying on each other's skills to survive. +One day, a stranger arrived in the village. He was tall, dark-haired, and had piercing green eyes that seemed to see right through you. He wore a long, black coat with a hood that cast a shadow over his face, making him look mysterious and otherworldly. +At first, the villagers were wary of the stranger. They didn't know where he came from or what his intentions were. But as he began to help them with their daily tasks, they started to feel more at ease around him. +The stranger, who introduced himself as Eryndor Thorne, was an expert hunter and tracker. He helped the villagers catch their prey, fix their traps, and even taught them +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 16.0147 sec total +Time to first token: 0.0755 sec with parallel prefill. + + Total throughput: 12.4886 tokens/sec, 0.0801 s/token +First token throughput: 13.2400 tokens/sec, 0.0755 s/token + Next token throughput: 12.4850 tokens/sec, 0.0801 s/token + +Bandwidth achieved: 106.89 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 12.31 +Average tokens/sec (first token): 9.75 +Average tokens/sec (next tokens): 12.43 + +Memory used: 28.86 GB diff --git a/llama31-1218/cuda_eager_b16.txt b/llama31-1218/cuda_eager_b16.txt new file mode 100644 index 000000000..b72911055 --- /dev/null +++ b/llama31-1218/cuda_eager_b16.txt @@ -0,0 +1,73 @@ + +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0 python3 torchchat.py generate llama3.1 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --prompt "Once upon a time," --max-new-tokens 200 --num-samples 3 +PyTorch version 2.6.0.dev20241218+cu124 available. 
+Unabled to import torchao experimental quant_api with error: [Errno 2] No such file or directory: '/home/jackkhuu/oss/torchchat/torchao-build/src/ao/torchao/experimental/quant_api.py' +Using device=cuda NVIDIA PG509-210 +Loading model... +Time to load model: 8.50 seconds +Quantizing the model with: {'precision': {'dtype': 'bfloat16'}, 'executor': {'accelerator': 'cuda'}} +Time to quantize model: 0.01 seconds +----------------------------------------------------------- +Once upon a time, in a small village nestled in the rolling hills of rural France, there lived a young girl named Sophie. Sophie was a curious and adventurous child, with a passion for exploring the world around her. +One day, while wandering through the village, Sophie stumbled upon a beautiful, old-fashioned clock shop. The sign above the door read "Monsieur LeFleur's Clocks," and the windows were filled with all sorts of fascinating timekeeping devices. +Sophie's eyes widened as she pushed open the door and stepped inside. The shop was dimly lit, with soft candles flickering on the shelves, casting a warm glow over the intricate clockwork mechanisms on display. +Monsieur LeFleur, the shop's owner, looked up from his workbench and smiled at Sophie. "Bonjour, mademoiselle," he said, his voice warm and welcoming. "Welcome to my humble shop. How may I assist you today?" +Sophie's eyes scanned the shelves, taking in the +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 1: 12.8346 sec total +Time to first token: 0.4995 sec with parallel prefill. + + Total throughput: 15.5829 tokens/sec, 0.0642 s/token +First token throughput: 2.0019 tokens/sec, 0.4995 s/token + Next token throughput: 16.1329 tokens/sec, 0.0620 s/token + +Bandwidth achieved: 250.27 GB/s +*** This first iteration will include cold start effects for dynamic import, hardware caches. *** + +======================================== + +Once upon a time, there was a magical kingdom hidden deep within a dense forest. This kingdom was ruled by a wise and just king who loved his people deeply. However, a dark force had begun to spread across the land, threatening to destroy everything the king had worked to build. +The king knew that he needed to find a way to defeat this dark force, but he was faced with a problem. He had a daughter, a beautiful and brave princess, who had grown up with the knowledge of how to harness her magical powers. However, the king was afraid to let his daughter go on a quest to defeat the dark force, as he did not want to put her in harm's way. +One day, a wise old wizard came to the kingdom and offered to help the king defeat the dark force. The wizard told the king that he had a plan, but it would require the king to let his daughter go on a quest. The king was hesitant, but the wizard assured him that his daughter was the only +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 2: 12.3767 sec total +Time to first token: 0.0551 sec with parallel prefill. + + Total throughput: 16.1593 tokens/sec, 0.0619 s/token +First token throughput: 18.1489 tokens/sec, 0.0551 s/token + Next token throughput: 16.1504 tokens/sec, 0.0619 s/token + +Bandwidth achieved: 259.53 GB/s + +======================================== + +Once upon a time, in the quaint town of Willow Creek, there lived a young girl named Emily. Emily was a curious and adventurous child, with a heart full of wonder and a mind full of questions. 
She loved to explore the world around her, and her favorite thing to do was to wander through the woods that surrounded her town. +One day, as Emily was walking through the woods, she stumbled upon a hidden path she had never seen before. The path was overgrown with weeds and vines, and it looked like it hadn’t been traveled in a long time. But something about it drew Emily in, and she felt compelled to follow it. + +As she made her way down the path, the trees grew closer together, and the air grew thick with the scent of damp earth and decaying leaves. Emily felt a shiver run down her spine, but she pressed on, her curiosity getting the better of her. + +The path twisted and turned, leading Emily deeper and deeper into the woods. She began to feel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Generated 199 tokens +Time for inference 3: 13.2078 sec total +Time to first token: 0.0775 sec with parallel prefill. + + Total throughput: 15.1426 tokens/sec, 0.0660 s/token +First token throughput: 12.8965 tokens/sec, 0.0775 s/token + Next token throughput: 15.1559 tokens/sec, 0.0660 s/token + +Bandwidth achieved: 243.20 GB/s + +======================================== + + +Warning: Excluding compile in calculations + Average tokens/sec (total): 15.63 +Average tokens/sec (first token): 11.02 +Average tokens/sec (next tokens): 15.81 + +Memory used: 16.68 GB diff --git a/scripts/benchmarking/benchmarking_linux.sh b/scripts/benchmarking/benchmarking_linux.sh new file mode 100755 index 000000000..99e47751e --- /dev/null +++ b/scripts/benchmarking/benchmarking_linux.sh @@ -0,0 +1,254 @@ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Customize what is being run +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DRY_RUN=0 + +RUN_CUDA_EAGER=1 +RUN_CUDA_COMPILE=1 +RUN_CUDA_AOTI=1 +RUN_CUDA_AOTI_PT2=1 + +RUN_CPU_EAGER=1 +RUN_CPU_COMPILE=1 +RUN_CPU_AOTI=1 +RUN_CPU_AOTI_PT2=1 + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Check and Set Up Args (model, out_directory) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +if [ $# -ne 2 ]; then + echo "Please provide (1) model and (2) directory as positional arguments" + exit 1 +fi + +model=$1 +dir=$2 + +mkdir -p $dir + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Helpers +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Env Variables for Running Commands +ENV_VARIABLE="OMP_NUM_THREADS=16 numactl --cpunodebind=0 --membind=0" + +# Function for printing and writing to files +function formatted_export_and_generate { + local file="$dir/$1" + local generate_cmd="${ENV_VARIABLE} $2" + local compile_cmd="$3" + + # Write Commands to the top of the output file + echo $compile_cmd > $file + echo $generate_cmd >> $file + + echo "Writing to: ${file}" + + # Export the Model + if [ ! 
-z "$compile_cmd" ]; then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" >> $file + echo "$compile_cmd" | tee -a $file + if [ $DRY_RUN -eq 0 ]; then + eval $compile_cmd &>> $file + fi + fi + + # Generate using the Model + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" >> $file + echo $generate_cmd | tee -a $file + if [ $DRY_RUN -eq 0 ]; then + eval $generate_cmd &>> $file + fi + echo +} + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Cuda eager +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CUDA_EAGER -eq 1 ]; then + echo "Cuda eager b16" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --prompt \"Once upon a time,\" --max-new-tokens 200 --num-samples 3" + file="cuda_eager_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "Cuda eager int8" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --prompt \"Once upon a time,\" --max-new-tokens 200 --num-samples 3" + file="cuda_eager_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "Cuda eager int4" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --prompt \"Once upon a time,\" --max-new-tokens 200 --num-samples 3" + file="cuda_eager_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Cuda compile +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CUDA_COMPILE -eq 1 ]; then + echo "Cuda compile b16" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --prompt \"Once upon a time,\" --max-new-tokens 200 --compile --num-samples 3" + file="cuda_compile_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "Cuda compile int8" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --prompt \"Once upon a time,\" --max-new-tokens 200 --compile --num-samples 3" + file="cuda_compile_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "Cuda compile int4" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --prompt \"Once upon a time,\" --max-new-tokens 200 --compile --num-samples 3" + file="cuda_compile_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU eager +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CPU_EAGER -eq 1 ]; then + echo "CPU eager b16" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="cpu_eager_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU eager int8" + generate_cmd="python3 torchchat.py generate $model --quantize 
'{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="cpu_eager_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU eager int4" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="cpu_eager_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU compile +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CPU_COMPILE -eq 1 ]; then + echo "CPU compile b16" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --compile --num-samples 3" + file="cpu_compile_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU compile int8" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --compile --num-samples 3" + file="cpu_compile_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU compile int4" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --compile --num-samples 3" + file="cpu_compile_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Cuda AOTI +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CUDA_AOTI -eq 1 ]; then + echo "Cuda aoti b16" + compile_cmd="python3 torchchat.py export $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --output-dso-path /tmp/model16.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model16.so --prompt \"Once upon a time,\" --max-new-tokens 200 --device cuda --num-samples 3" + file="cuda_aoti_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "Cuda aoti int8" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --output-dso-path /tmp/model8.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model8.so --prompt \"Once upon a time,\" --max-new-tokens 200 --device cuda --num-samples 3" + file="cuda_aoti_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "Cuda aoti int4" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --output-dso-path /tmp/model34.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model34.so --prompt \"Once upon a time,\" --max-new-tokens 200 --device cuda --num-samples 3" + file="cuda_aoti_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" 
"$compile_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Cuda AOTI PT2 +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CUDA_AOTI_PT2 -eq 1 ]; then + echo "Cuda aoti PT2 b16" + compile_cmd="python3 torchchat.py export $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --output-aoti-package-path /tmp/model16.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model16.pt2 --prompt \"Once upon a time,\" --max-new-tokens 200 --device cuda --num-samples 3" + file="cuda_aoti_pt2_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "Cuda aoti PT2 int8" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --output-aoti-package-path /tmp/model8.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model8.pt2 --prompt \"Once upon a time,\" --max-new-tokens 200 --device cuda --num-samples 3" + file="cuda_aoti_pt2_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "Cuda aoti PT2 int4" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cuda\"}}' --output-aoti-package-path /tmp/model34.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model34.pt2 --prompt \"Once upon a time,\" --max-new-tokens 200 --device cuda --num-samples 3" + file="cuda_aoti_pt2_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU AOTI +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CPU_AOTI -eq 1 ]; then + echo "CPU aoti b16" + compile_cmd="python3 torchchat.py export $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-dso-path /tmp/model16.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model16.so --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti int8" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-dso-path /tmp/model8.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model8.so --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti int4" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-dso-path /tmp/model34.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model34.so --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU AOTI PT2 +# 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CPU_AOTI_PT2 -eq 1 ]; then + echo "CPU aoti PT2 b16" + compile_cmd="python3 torchchat.py export $model --quantize '{\"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-aoti-package-path /tmp/model16.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model16.pt2 --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_pt2_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti PT2 int8" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-aoti-package-path /tmp/model8.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model8.pt2 --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_pt2_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti PT2 int4" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"bfloat16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-aoti-package-path /tmp/model34.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model34.pt2 --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_pt2_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" +fi diff --git a/scripts/benchmarking/benchmarking_mac.sh b/scripts/benchmarking/benchmarking_mac.sh new file mode 100755 index 000000000..2b8fad5c1 --- /dev/null +++ b/scripts/benchmarking/benchmarking_mac.sh @@ -0,0 +1,173 @@ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Customize what is being run +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DRY_RUN=0 + +RUN_MPS_EAGER=0 + +RUN_CPU_EAGER=0 +RUN_CPU_COMPILE=0 +RUN_CPU_AOTI=0 +RUN_CPU_AOTI_PT2=0 + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Check and Set Up Args (model, out_directory) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +if [ $# -ne 2 ]; then + echo "Please provide (1) model and (2) directory as positional arguments" + exit 1 +fi + +model=$1 +dir=$2 + +mkdir -p $dir + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Helpers +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Function for printing and writing to files +function formatted_export_and_generate { + local file="$dir/$1" + local generate_cmd="$2" + local compile_cmd="$3" + + # Write Commands to the top of the output file + echo $compile_cmd > $file + echo $generate_cmd >> $file + + echo "Writing to: ${file}" + + # Export the Model + if [ ! 
-z "$compile_cmd" ]; then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" >> $file + echo "$compile_cmd" | tee -a $file + if [ $DRY_RUN -eq 0 ]; then + eval $compile_cmd >> $file 2>&1 + fi + fi + + # Generate using the Model + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" >> $file + echo $generate_cmd | tee -a $file + if [ $DRY_RUN -eq 0 ]; then + eval $generate_cmd >> $file 2>&1 + fi + echo +} + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# MPS Eager +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_MPS_EAGER -eq 1 ]; then + echo "MPS Eager 16" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"mps\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="mps_eager_16.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "MPS Eager int8" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"mps\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="mps_eager_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "MPS Eager int4" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"mps\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="mps_eager_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU Eager +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CPU_EAGER -eq 1 ]; then + echo "CPU Eager 16" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="cpu_eager_16.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU Eager int8" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="cpu_eager_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU Eager int4" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --num-samples 3" + file="cpu_eager_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU compile +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +if [ $RUN_CPU_COMPILE -eq 1 ]; then + echo "CPU compile b16" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --compile --num-samples 3" + file="cpu_compile_b16.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU compile int8" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": 
{\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --compile --num-samples 3" + file="cpu_compile_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" + + echo "CPU compile int4" + generate_cmd="python3 torchchat.py generate $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --prompt \"Once upon a time,\" --max-new-tokens 256 --compile --num-samples 3" + file="cpu_compile_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" +fi + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU AOTI +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +if [ $RUN_CPU_AOTI -eq 1 ]; then + echo "CPU aoti b16" + compile_cmd="python3 torchchat.py export $model --quantize '{\"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-dso-path /tmp/model16.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model16.so --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_16.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti int8" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-dso-path /tmp/model8.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model8.so --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti int4" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int4\": {\"groupsize\": 256}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-dso-path /tmp/model34.so" + generate_cmd="python3 torchchat.py generate $model --dso-path /tmp/model34.so --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" +fi + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CPU AOTI PT2 +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +if [ $RUN_CPU_AOTI_PT2 -eq 1 ]; then + echo "CPU aoti PT2 b16" + compile_cmd="python3 torchchat.py export $model --quantize '{\"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-aoti-package-path /tmp/model16.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model16.pt2 --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_pt2_16.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti PT2 int8" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int8\": {\"groupsize\": 0}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-aoti-package-path /tmp/model8.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model8.pt2 --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_pt2_8.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" + + echo "CPU aoti PT2 int4" + compile_cmd="python3 torchchat.py export $model --quantize '{\"linear:int4\": 
{\"groupsize\": 256}, \"precision\": {\"dtype\":\"float16\"}, \"executor\":{\"accelerator\":\"cpu\"}}' --output-aoti-package-path /tmp/model34.pt2" + generate_cmd="python3 torchchat.py generate $model --aoti-package-path /tmp/model34.pt2 --prompt \"Once upon a time,\" --max-new-tokens 256 --device cpu --num-samples 3" + file="cpu_aoti_pt2_4.txt" + formatted_export_and_generate "$file" "$generate_cmd" "$compile_cmd" +fi