Upstreaming MLPerf punet changes, server/harness support. #799

Open · wants to merge 92 commits into main

Changes from 53 of 92 commits

Commits
a3376d9
Bump punet revision to d30d6ff
eagarvey-amd Jul 12, 2024
7cabac0
Enable punet t2i test.
eagarvey-amd Jul 12, 2024
7dfd4c8
Use formatted strings as input to printer.
eagarvey-amd Jul 12, 2024
1cd3ee9
Rework sdxl test to setup with a pipeline, fix unloading submodels, f…
eagarvey-amd Jul 12, 2024
1a90abd
Add switch for punet preprocessing flags
eagarvey-amd Jul 13, 2024
b70318d
Xfail punet e2e test.
eagarvey-amd Jul 13, 2024
2d7ebcd
Fixups to sdxl test arguments
eagarvey-amd Jul 15, 2024
feebc87
Fix flagset arg and enable vae encode.
eagarvey-amd Jul 15, 2024
af7782b
Enable VAE encode validation, mark as xfail
eagarvey-amd Jul 15, 2024
eff59a9
Fix formatting
eagarvey-amd Jul 15, 2024
63fb053
fix runner function name in old sd test.
eagarvey-amd Jul 15, 2024
aff48ab
Fix xfail syntax.
eagarvey-amd Jul 15, 2024
b10ad8d
Update unet script for compile function signature change
eagarvey-amd Jul 15, 2024
321d21d
Update punet to 4d4f955
IanNod Jul 16, 2024
2de912e
Disable vulkan test on MI250 runner.
monorimet Jul 16, 2024
9fdc07f
Change tqdm disable conditions and deepcopy model map on init.
eagarvey-amd Jul 17, 2024
b20be32
Don't break workarounds for model path
monorimet Jul 17, 2024
02705a9
Fix for passing a path as attn_spec.
eagarvey-amd Jul 18, 2024
9229aed
Bump punet revision to defeb489fe2bb17b77d587924db9e58048a8c140
eagarvey-amd Jul 19, 2024
f09ef4a
Move JIT cpu scheduling load helpers inside conditional.
eagarvey-amd Jul 19, 2024
bbcc424
formatting
eagarvey-amd Jul 19, 2024
1f19c7f
Don't pass benchmark as an export arg.
eagarvey-amd Jul 19, 2024
39c0c00
Changes so no external downloads. (#781)
saienduri Jul 19, 2024
3c59b25
fix so that we check exact paths as well for is_prepared (#782)
saienduri Jul 19, 2024
2e9de46
Update punet to 60edc91
IanNod Jul 20, 2024
aa0ac2b
Vae weight path none check (#784)
saienduri Jul 21, 2024
6556a36
Bump punet to mi300_all_sym_8_step10 (62785ea)
monorimet Jul 22, 2024
2c49cb6
Changes so that the default run without quant docker will work as wel…
saienduri Jul 22, 2024
cb911b1
Bump punet to 361df65844e0a7c766484707c57f6248cea9587f
eagarvey-amd Jul 22, 2024
d857f77
Sync flags to sdxl-scripts repo (#786)
saienduri Jul 23, 2024
37548f2
Integrate int8 tk kernels (#783)
nithinsubbiah Jul 23, 2024
25b2462
Update punet revision to deterministic version (42e9407)
monorimet Jul 23, 2024
0e57b4e
Integration of tk kernels into pipeline (#789)
saienduri Jul 24, 2024
920dbf5
Update unet horizontal fusion flag (#790)
saienduri Jul 25, 2024
6f16731
Revert "Update unet horizontal fusion flag (#790)"
saienduri Jul 25, 2024
15dbd93
[tk kernel] Add support to match kernel with number of arguments and …
nithinsubbiah Jul 25, 2024
0c02652
Add functionality to SD pipeline and abstracted components for saving…
monorimet Jul 25, 2024
3fd954b
Remove download links for tk kernels and instead specify kernel direc…
nithinsubbiah Jul 25, 2024
7f8a2b0
Update to best iteration on unet weights (#794)
saienduri Jul 25, 2024
bf63aec
Add missing tk_kernel_args arg in function calls (#795)
nithinsubbiah Jul 25, 2024
a74d98e
update hash for config file
saienduri Jul 25, 2024
925cd0c
Fix formatting
eagarvey-amd Jul 29, 2024
7715fd0
Point to sdxl-vae-fix branch of iree-turbine.
eagarvey-amd Jul 30, 2024
e276c78
Add SD3 to sd_pipeline
eagarvey-amd Jul 30, 2024
de5d3de
Update test_models.yml
monorimet Jul 30, 2024
d0d3ae6
Remove default in mmdit export args.
eagarvey-amd Jul 30, 2024
403fe47
set vae_harness to False in sdxl test.
eagarvey-amd Jul 30, 2024
0ac6b64
Switch to main branch of iree-turbine
eagarvey-amd Jul 30, 2024
1a41394
Update sd3_vae.py
monorimet Aug 1, 2024
493f260
Remove preprocess arg that fails to parse.
monorimet Aug 1, 2024
711403c
SD3 updates, CLI arguments for multi-device
eagarvey-amd Aug 2, 2024
e554da8
Tweaks to requirements, scheduler filenames
eagarvey-amd Aug 2, 2024
cdd2f66
xfail stateless llama test
monorimet Aug 9, 2024
d23a45b
Flag updates and parametrize a few more args.
eagarvey-amd Aug 13, 2024
7ecfece
Merge branch 'merge_punet_sdxl' of https://github.com/nod-ai/SHARK-Tu…
eagarvey-amd Aug 13, 2024
2d7a92e
Update SDXL tests, README for running on GFX942
eagarvey-amd Aug 15, 2024
18bffdb
Fix vae script CLI and revert precision changes to sd3 text encoders …
eagarvey-amd Aug 17, 2024
df85dca
Merge branch 'merge_punet_sdxl' of https://github.com/nod-ai/SHARK-Tu…
eagarvey-amd Aug 17, 2024
674128e
Small fixes to compile modes and requirements
eagarvey-amd Aug 27, 2024
4d6198b
Adds explicit model arch flag, remove commented code
eagarvey-amd Aug 28, 2024
f3e3fe3
Fix formatting
eagarvey-amd Aug 28, 2024
2ed8037
Merge branch 'main' into merge_punet_sdxl
monorimet Aug 28, 2024
7adfc7a
Fix formatting
eagarvey-amd Aug 28, 2024
ff2c3c9
Update test_models.yml
monorimet Sep 10, 2024
afdb8d6
Decompose CLIP attention
eagarvey-amd Sep 10, 2024
a4e67e8
decompose implementation for clip
eagarvey-amd Sep 11, 2024
35517d9
Add decompose clip flag to pipe e2e test
eagarvey-amd Sep 11, 2024
6ca109a
Add attention decomposition mechanism to sdxl clip exports.
eagarvey-amd Sep 11, 2024
453fb38
Update compile options for sdxl
eagarvey-amd Sep 11, 2024
c0be575
Decompose VAE for cpu
eagarvey-amd Sep 12, 2024
e3cd69d
skip i8 punet test on cpu
eagarvey-amd Sep 12, 2024
e3e1dcb
Don't use spec for clip by default
eagarvey-amd Sep 12, 2024
56d6ee7
Revert change to attention spec handling in sdxl test
monorimet Sep 13, 2024
d330564
Don't use td spec for clip bs2 export test
monorimet Sep 13, 2024
ffba3ea
disable attn spec usage for sdxl bs2 on mi250 tests
monorimet Sep 13, 2024
fad7e6e
Update test_models.yml
monorimet Sep 14, 2024
05fa32d
Update test_models.yml
monorimet Sep 16, 2024
0291d43
Small fixes to SDXL inference pipeline/exports/compile
eagarvey-amd Sep 24, 2024
e337f2a
Pin torch to 2.4.1
eagarvey-amd Oct 2, 2024
0fd8ad0
Largely disables attn spec usage.
eagarvey-amd Oct 3, 2024
e1c4ac2
Update canonicalization pass name, decouple model validation from pip…
eagarvey-amd Oct 3, 2024
61bb4ef
Don't use punet spec.
eagarvey-amd Oct 3, 2024
dfb9474
Remove default/mfma/wmma specs from sd compile utils.
eagarvey-amd Oct 3, 2024
9fe20a6
Guard path check for attn spec
eagarvey-amd Oct 4, 2024
f39b2d2
Separate punet run
eagarvey-amd Oct 4, 2024
d3c8e80
typo fixes
eagarvey-amd Oct 4, 2024
40808db
Filename fixes, explicit input dtypes for i8 punet
eagarvey-amd Oct 4, 2024
e630d39
Update CPU test configuration.
eagarvey-amd Oct 4, 2024
fc6d018
Decompose VAE for cpu
eagarvey-amd Oct 4, 2024
7d50dc8
Change compile flag reporting to CLI input
eagarvey-amd Oct 4, 2024
f140926
formatting
eagarvey-amd Oct 4, 2024
67e6558
Rework prompt encoder export on aot.export API
eagarvey-amd Oct 21, 2024
5 changes: 2 additions & 3 deletions .github/workflows/test_models.yml
@@ -69,8 +69,7 @@ jobs:
source turbine_venv/bin/activate

pytest -v models/turbine_models/tests/sd_test.py
pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps 5
pytest -v models/turbine_models/tests/sdxl_test.py --device vulkan --rt_device vulkan --iree_target_triple rdna3-unknown-linux
pytest -v models/turbine_models/tests/sdxl_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps 2
pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default
pytest -v models/turbine_models/tests/sdxl_test.py --device rocm --rt_device hip --iree_target_triple gfx90a --precision fp16 --attn_spec default --batch_size 2
pytest -v models/turbine_models/tests/sd3_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps 5
pytest -v models/turbine_models/tests/sd3_test.py --device cpu --rt_device local-task --iree_target_triple x86_64-linux-gnu --num_inference_steps 2
18 changes: 4 additions & 14 deletions models/README.md
@@ -1,26 +1,19 @@
# LLAMA 2 Inference
# Turbine-Models setup (source)

This example require some extra dependencies. Here's an easy way to get it running on a fresh server.

Don't forget to put in your huggingface token from https://huggingface.co/settings/tokens
For private/gated models, make sure you have run `huggingface-cli login`.

```bash
#!/bin/bash


# if you don't insert it, you will be prompted to log in later;
# you may need to rerun this script after logging in
YOUR_HF_TOKEN="insert token for headless"

# clone and install dependencies
sudo apt install -y git
git clone https://github.com/nod-ai/SHARK-Turbine.git
cd SHARK-Turbine
pip install -r core/requirements.txt
pip install torch==2.5.0.dev20240801 torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r models/requirements.txt

# do an editable install from the cloned SHARK-Turbine
pip install --editable core models
pip install --editable models

# Log in with Hugging Face CLI if token setup is required
if [[ $YOUR_HF_TOKEN == hf_* ]]; then
@@ -42,6 +35,3 @@ else
huggingface-cli login
fi

# Step 7: Run the Python script
python .\python\turbine_models\custom_models\stateless_llama.py --compile_to=torch --external_weights=safetensors --external_weight_file=llama_f32.safetensors
```
4 changes: 2 additions & 2 deletions models/requirements.txt
@@ -1,6 +1,6 @@
protobuf
gguf
transformers==4.37.1
transformers==4.43.3
torchsde
accelerate
peft
@@ -13,4 +13,4 @@ einops
pytest
scipy
shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main
-e git+https://github.com/nod-ai/sharktank.git@main#egg=sharktank&subdirectory=sharktank
-e git+https://github.com/nod-ai/sharktank.git@main#egg=sharktank&subdirectory=sharktank
2 changes: 1 addition & 1 deletion models/setup.py
@@ -57,7 +57,7 @@ def load_version_info():
"Shark-Turbine",
"protobuf",
"sentencepiece",
"transformers>=4.37.1",
"transformers>=4.43.3",
"accelerate",
"diffusers==0.29.0.dev0",
"azure-storage-blob",
46 changes: 42 additions & 4 deletions models/turbine_models/custom_models/pipeline_base.py
@@ -84,16 +84,24 @@ class PipelineComponent:
"""

def __init__(
self, printer, dest_type="devicearray", dest_dtype="float16", benchmark=False
self,
printer,
dest_type="devicearray",
dest_dtype="float16",
benchmark=False,
save_outputs=False,
):
self.runner = None
self.module_name = None
self.device = None
self.metadata = None
self.printer = printer
self.benchmark = benchmark
self.save_outputs = save_outputs
self.output_counter = 0
self.dest_type = dest_type
self.dest_dtype = dest_dtype
self.validate = False

def load(
self,
@@ -218,6 +226,16 @@ def _output_cast(self, output):
case _:
return output

def save_output(self, function_name, output):
if isinstance(output, tuple) or isinstance(output, list):
for i in output:
self.save_output(function_name, i)
else:
np.save(
f"{function_name}_output_{self.output_counter}.npy", output.to_host()
)
self.output_counter += 1

def _run(self, function_name, inputs: list):
return self.module[function_name](*inputs)

@@ -235,13 +253,21 @@ def __call__(self, function_name, inputs: list):
if not isinstance(inputs, list):
inputs = [inputs]
inputs = self._validate_or_convert_inputs(function_name, inputs)

if self.validate:
self.save_torch_inputs(inputs)

if self.benchmark:
output = self._run_and_benchmark(function_name, inputs)
else:
output = self._run(function_name, inputs)
if self.save_outputs:
self.save_output(function_name, output)
output = self._output_cast(output)
return output

# def _run_and_validate(self, iree_fn, torch_fn, inputs: list)
Contributor: commented code



class Printer:
def __init__(self, verbose, start_time, print_time):
@@ -340,6 +366,7 @@ def __init__(
hf_model_name: str | dict[str] = None,
benchmark: bool | dict[bool] = False,
verbose: bool = False,
save_outputs: bool | dict[bool] = False,
common_export_args: dict = {},
):
self.map = model_map
@@ -374,6 +401,7 @@ def __init__(
"external_weights": external_weights,
"hf_model_name": hf_model_name,
"benchmark": benchmark,
"save_outputs": save_outputs,
}
for arg in map_arguments.keys():
self.map = merge_arg_into_map(self.map, map_arguments[arg], arg)
@@ -391,7 +419,8 @@ def __init__(
)
for submodel in self.map.keys():
for key, value in map_arguments.items():
self.map = merge_export_arg(self.map, value, key)
if key not in ["benchmark", "save_outputs"]:
self.map = merge_export_arg(self.map, value, key)
for key, value in self.map[submodel].get("export_args", {}).items():
if key == "hf_model_name":
self.map[submodel]["keywords"].append(
@@ -539,7 +568,11 @@ def is_prepared(self, vmfbs, weights):
avail_files = os.listdir(self.external_weights_dir)
candidates = []
for filename in avail_files:
if all(str(x) in filename for x in w_keywords):
if all(
str(x) in filename
or str(x) == os.path.join(self.external_weights_dir, filename)
for x in w_keywords
):
candidates.append(
os.path.join(self.external_weights_dir, filename)
)
@@ -723,7 +756,7 @@ def export_submodel(
def load_map(self):
for submodel in self.map.keys():
if not self.map[submodel]["load"]:
self.printer.print("Skipping load for ", submodel)
self.printer.print(f"Skipping load for {submodel}")
continue
self.load_submodel(submodel)

@@ -739,6 +772,7 @@ def load_submodel(self, submodel):
printer=self.printer,
dest_type=dest_type,
benchmark=self.map[submodel].get("benchmark", False),
save_outputs=self.map[submodel].get("save_outputs", False),
)
self.map[submodel]["runner"].load(
self.map[submodel]["driver"],
@@ -751,6 +785,10 @@

def unload_submodel(self, submodel):
self.map[submodel]["runner"].unload()
self.map[submodel]["vmfb"] = None
self.map[submodel]["mlir"] = None
self.map[submodel]["weights"] = None
self.map[submodel]["export_args"]["input_mlir"] = None
setattr(self, submodel, None)


Expand Down
49 changes: 49 additions & 0 deletions models/turbine_models/custom_models/sd3_inference/diffusers_ref.py
@@ -0,0 +1,49 @@
from diffusers import StableDiffusion3Pipeline
Contributor:
Should we integrate this with a test that outputs image numerics we can compare against? I know you saw significantly different numerics between CPU and the various GPU backends, so a direct comparison may be difficult; maybe some FID/CLIP scores?

Contributor Author:
Doing a faithful comparison with the diffusers reference is a larger problem -- we are really best off investing in getting real CLIP/FID scores with a validation dataset. This diffusers reference is just a hold-over/sanity check for now; I don't even trust it to give us a decent baseline from CPU. We can leave this out for now, but I'd rather keep it so we have something ready for comparison with diffusers on ROCm/CUDA.

import torch
from datetime import datetime as dt


def run_diffusers_cpu(
hf_model_name,
prompt,
negative_prompt,
guidance_scale,
seed,
height,
width,
num_inference_steps,
):
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
hf_model_name, torch_dtype=torch.float32
)
pipe = pipe.to("cpu")
generator = torch.Generator().manual_seed(int(seed))

image = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
height=height,
width=width,
generator=generator,
).images[0]
timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S")
image.save(f"diffusers_reference_output_{timestamp}.png")


if __name__ == "__main__":
from turbine_models.custom_models.sd_inference.sd_cmd_opts import args

run_diffusers_cpu(
args.hf_model_name,
args.prompt,
args.negative_prompt,
args.guidance_scale,
args.seed,
args.height,
args.width,
args.num_inference_steps,
)
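
A minimal sketch of the FID/CLIP scoring discussed in the review thread above, using torchmetrics (an assumption; this PR does not add such a test). The model choice, image tensors, and prompts are illustrative placeholders:

```python
import torch
from torchmetrics.multimodal.clip_score import CLIPScore
from torchmetrics.image.fid import FrechetInceptionDistance

# Generated images as uint8 NCHW tensors, e.g. stacked from pipeline outputs.
generated = (torch.rand(2, 3, 512, 512) * 255).to(torch.uint8)
prompts = ["a photo of a cat", "a photo of a dog"]  # prompts used to generate them

# CLIP score measures prompt/image agreement; no reference images are needed.
clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
clip_score = clip_metric(generated, prompts)

# FID measures distributional distance against a reference (validation) image set.
reference = (torch.rand(2, 3, 512, 512) * 255).to(torch.uint8)  # placeholder validation images
fid_metric = FrechetInceptionDistance(feature=2048)
fid_metric.update(reference, real=True)
fid_metric.update(generated, real=False)

print(f"CLIP score: {float(clip_score):.3f}, FID: {float(fid_metric.compute()):.3f}")
```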
22 changes: 13 additions & 9 deletions models/turbine_models/custom_models/sd3_inference/sd3_mmdit.py
@@ -45,6 +45,7 @@ def forward(
pooled_projections,
timestep,
):
timestep.expand(hidden_states.shape[0])
noise_pred = self.mmdit(
hidden_states,
encoder_hidden_states,
@@ -71,7 +72,7 @@ def forward(self, q, k, v):
def export_attn(
precision="fp16",
device="cpu",
target_triple="x86_64-unknown-linux-gnu",
target="x86_64-unknown-linux-gnu",
ireec_flags="",
compile_to="torch",
decomp_attn=False,
@@ -128,7 +129,7 @@ class CompiledAttn(CompiledModule):
vmfb_path = utils.compile_to_vmfb(
module_str,
device,
target_triple,
target,
ireec_flags,
safe_name,
return_path=True,
@@ -139,7 +140,6 @@ class CompiledAttn(CompiledModule):

@torch.no_grad()
def export_mmdit_model(
mmdit_model,
hf_model_name,
batch_size,
height,
@@ -151,8 +151,8 @@ def export_mmdit_model(
external_weights=None,
external_weight_path=None,
device=None,
target_triple=None,
ireec_flags=None,
target=None,
ireec_flags="",
decomp_attn=False,
exit_on_vmfb=False,
pipeline_dir=None,
@@ -161,6 +161,9 @@
weights_only=False,
):
dtype = torch.float16 if precision == "fp16" else torch.float32
mmdit_model = MMDiTModel(
dtype=dtype,
)
np_dtype = "float16" if precision == "fp16" else "float32"
safe_name = utils.create_safe_name(
hf_model_name,
@@ -169,13 +172,14 @@
if pipeline_dir:
safe_name = os.path.join(pipeline_dir, safe_name)
if decomp_attn == True:
safe_name += "_decomp_attn"
ireec_flags += ",--iree-opt-aggressively-propagate-transposes=False"

if input_mlir:
vmfb_path = utils.compile_to_vmfb(
input_mlir,
device,
target_triple,
target,
ireec_flags,
safe_name,
mlir_source="file",
@@ -208,7 +212,7 @@ def export_mmdit_model(
torch.empty(hidden_states_shape, dtype=dtype),
torch.empty(encoder_hidden_states_shape, dtype=dtype),
torch.empty(pooled_projections_shape, dtype=dtype),
torch.empty(init_batch_dim, dtype=dtype),
torch.empty(1, dtype=dtype),
]

decomp_list = []
@@ -249,7 +253,7 @@ class CompiledMmdit(CompiledModule):
hidden_states_shape,
encoder_hidden_states_shape,
pooled_projections_shape,
init_batch_dim,
(1,),
Contributor:
Is MMDiT not working batched?

Contributor Author:
No, this timestep input gets expanded in the model to the batch dimension.

],
"input_dtypes": [np_dtype for x in range(4)],
"output_shapes": [hidden_states_shape],
@@ -263,7 +267,7 @@ class CompiledMmdit(CompiledModule):
vmfb_path = utils.compile_to_vmfb(
module_str,
device,
target_triple,
target,
ireec_flags,
safe_name,
return_path=True,
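
To illustrate the reviewer exchange above about MMDiT batching: a small, self-contained sketch (shapes are illustrative, not taken from this PR) of how a single timestep can be broadcast to the batch dimension with torch.Tensor.expand:

```python
import torch

batch_size = 2
hidden_states = torch.empty(batch_size, 16, 128, 128, dtype=torch.float16)  # assumed latent shape
timestep = torch.tensor([981.0], dtype=torch.float16)  # single (1,)-shaped timestep input

# expand() returns a broadcast view; the result must be captured, since it is
# not an in-place op.
timestep_batched = timestep.expand(hidden_states.shape[0])

print(timestep.shape, timestep_batched.shape)  # torch.Size([1]) torch.Size([2])
```

This is why the exported signature can keep a (1,)-shaped timestep input while hidden_states stays batched.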