TRT support for MAISI #8153
base: dev
Changes from 108 commits
Dockerfile

@@ -11,7 +11,7 @@
 # To build with a different base image
 # please run `docker build` using the `--build-arg PYTORCH_IMAGE=...` flag.
-ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.08-py3
+ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.09-py3
 FROM ${PYTORCH_IMAGE}

 LABEL maintainer="[email protected]"
monai/networks/trt_compiler.py

@@ -134,6 +134,9 @@ def __init__(self, plan_path, logger=None):
                 self.output_names.append(binding)
                 dtype = dtype_dict[self.engine.get_tensor_dtype(binding)]
                 self.dtypes.append(dtype)
+        self.logger.info(
+            f"Loaded TensorRT engine: {self.plan_path}.\nInputs: {self.input_names}\nOutputs: {self.output_names}"
+        )

     def allocate_buffers(self, device):
         """
@@ -163,7 +166,8 @@ def set_inputs(self, feed_dict, stream):
         last_profile = self.cur_profile

         def try_set_inputs():
-            for binding, t in feed_dict.items():
+            for binding in self.input_names:
+                t = feed_dict[binding]
                 if t is not None:
                     t = t.contiguous()
                     shape = t.shape
@@ -180,7 +184,8 @@ def try_set_inputs():
                         raise
                     self.cur_profile = next_profile
                     ctx.set_optimization_profile_async(self.cur_profile, stream)
-
+            except Exception:
+                raise

         left = ctx.infer_shapes()
         assert len(left) == 0

Review comment: Could you please add more info to explain this exception?

Author reply: This would be an exception from trying to set input shapes for which the engine was not built. Previously I had logic there that would rotate the TRT optimization profile index on such an exception; we do not use multiple profiles with MONAI, so I should probably simplify the code.
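Following up on that review exchange, a minimal sketch of a more descriptive re-raise; `try_set_inputs` and `feed_dict` are the names from the diff above, and the message text is purely illustrative:

```python
try:
    try_set_inputs()
except Exception as e:
    # Report which bindings were being set when shape assignment failed.
    raise RuntimeError(f"Setting TRT inputs failed for bindings {list(feed_dict.keys())}") from e
```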
@@ -217,6 +222,40 @@ def infer(self, stream, use_cuda_graph=False):
         return self.tensors


+def remove_non_tensors(input_example, remove_constants=True):
+    #
+    # TODO : see if we can instantiate wrappers to handle non-default non-tensors
+    #
+    non_tensors = {}
+    for k, v in input_example.items():
+        if v is None:
+            non_tensors[k] = v
+        elif not torch.is_tensor(v):
+            if remove_constants:
+                non_tensors[k] = v
+            else:
+                input_example[k] = torch.tensor(v)
+
+    for key in non_tensors.keys():
+        # print(f"Removing non-tensor input: {key})")
+        input_example.pop(key)
+    return non_tensors
+
+
+def unroll_input(input_names, input_example):
+    # Simulate list/tuple unrolling during ONNX export
+    unrolled_input = {}
+    for name in input_names:
+        val = input_example[name]
+        if val is not None:
+            if isinstance(val, list | tuple):
+                for i in range(len(val)):
+                    unrolled_input[f"{name}_{i}"] = val[i]
+            else:
+                unrolled_input[name] = val
+    return unrolled_input

Review comment: I think [...] Thanks.

Author reply: Yes, we can look more into making this robust for the odd cases.
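To make the unrolling concrete, a small usage sketch with hypothetical inputs; a list-valued argument is flattened into indexed names, mirroring what `torch.onnx.export` does while tracing:

```python
import torch

example = {"x": torch.ones(1, 3), "scales": [torch.tensor(0.5), torch.tensor(2.0)]}
unrolled = unroll_input(["x", "scales"], example)
# unrolled keys: "x", "scales_0", "scales_1"
```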
 class TrtCompiler:
     """
     This class implements:

@@ -240,6 +279,7 @@ def __init__(
         use_cuda_graph=False,
         timestamp=None,
         fallback=False,
+        forward_override=None,
         logger=None,
     ):
         """
@@ -289,11 +329,18 @@ def __init__( | |
self.disabled = False | ||
|
||
self.logger = logger or get_logger("trt_compile") | ||
|
||
self.argspec = inspect.getfullargspec(model.forward) | ||
# Normally we read input_names from forward() but can be overridden | ||
if input_names is None: | ||
argspec = inspect.getfullargspec(model.forward) | ||
input_names = argspec.args[1:] | ||
input_names = self.argspec.args[1:] | ||
self.defaults = {} | ||
if self.argspec.defaults is not None: | ||
for i in range(len(self.argspec.defaults)): | ||
d = self.argspec.defaults[-i - 1] | ||
if d is not None: | ||
d = torch.tensor(d).cuda() | ||
self.defaults[self.argspec.args[-i - 1]] = d | ||
|
||
self.input_names = input_names | ||
self.old_forward = model.forward | ||
|
||
|
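The defaults loop above pairs trailing default values with trailing argument names from `inspect.getfullargspec`. A standalone sketch of the same idea, with a hypothetical `forward` signature:

```python
import inspect

def forward(self, x, scale=1.0, bias=None):
    ...

spec = inspect.getfullargspec(forward)
# spec.args == ['self', 'x', 'scale', 'bias']; spec.defaults == (1.0, None)
defaults = {spec.args[-i - 1]: spec.defaults[-i - 1] for i in range(len(spec.defaults))}
# defaults == {'bias': None, 'scale': 1.0}
```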
@@ -314,9 +361,9 @@ def _load_engine(self):
         """
         try:
             self.engine = TRTEngine(self.plan_path, self.logger)
-            self.logger.info(f"Engine loaded, inputs:{self.engine.input_names}")
+            self.input_names = self.engine.input_names
         except Exception as e:
-            self.logger.debug(f"Exception while loading the engine:\n{e}")
+            self.logger.info(f"Exception while loading the engine:\n{e}")

     def forward(self, model, argv, kwargs):
         """
@@ -329,18 +376,22 @@ def forward(self, model, argv, kwargs):
         Returns: Passing through wrapped module's forward() return value(s)

         """
+        args = self.defaults
+        args.update(kwargs)
+        if len(argv) > 0:
+            args.update(self._inputs_to_dict(argv))
+
         if self.engine is None and not self.disabled:
             # Restore original forward for export
             new_forward = model.forward
             model.forward = self.old_forward
             try:
                 self._load_engine()
                 if self.engine is None:
-                    build_args = kwargs.copy()
-                    if len(argv) > 0:
-                        build_args.update(self._inputs_to_dict(argv))
-                    self._build_and_save(model, build_args)
+                    build_args = args.copy()
+                    with torch.no_grad():
+                        self._build_and_save(model, build_args)
                     # This will reassign input_names from the engine
                     self._load_engine()
                 assert self.engine is not None
             except Exception as e:

Review comment: May I ask the reason for adding the torch.no_grad()? Thanks.

Author reply: Yes, there were some issues with export. As TRT is inference-only, it makes sense to do the whole export with torch.no_grad(); this is the recommended way.
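A minimal, self-contained illustration of that recommendation, using a toy model rather than the PR's code: since the resulting TensorRT engine is inference-only, the ONNX export that feeds it can run entirely under `torch.no_grad()`.

```python
import torch

model = torch.nn.Linear(4, 2).eval()
example = torch.randn(1, 4)
with torch.no_grad():
    # Nothing autograd-related is traced into the exported graph.
    torch.onnx.export(model, (example,), "model.onnx", input_names=["x"], output_names=["y"])
```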
@@ -355,19 +406,16 @@ def forward(self, model, argv, kwargs):
                         del param
                     # Call empty_cache to release GPU memory
                     torch.cuda.empty_cache()
+                # restore TRT hook
                 model.forward = new_forward
         # Run the engine
         try:
-            if len(argv) > 0:
-                kwargs.update(self._inputs_to_dict(argv))
-                argv = ()
-
             if self.engine is not None:
                 # forward_trt is not thread safe as we do not use per-thread execution contexts
                 with lock_sm:
                     device = torch.cuda.current_device()
                     stream = torch.cuda.Stream(device=device)
-                    self.engine.set_inputs(kwargs, stream.cuda_stream)
+                    self.engine.set_inputs(unroll_input(self.input_names, args), stream.cuda_stream)
                     self.engine.allocate_buffers(device=device)
                     # Need this to synchronize with Torch stream
                     stream.wait_stream(torch.cuda.current_stream())
@@ -379,7 +427,7 @@ def forward(self, model, argv, kwargs):
                     ret = ret[0]
                 return ret
         except Exception as e:
-            if model is not None:
+            if self.fallback:
                 self.logger.info(f"Exception: {e}\nFalling back to Pytorch ...")
             else:
                 raise e
@@ -391,16 +439,11 @@ def _onnx_to_trt(self, onnx_path):
         """

         profiles = []
-        if self.profiles:
-            for input_profile in self.profiles:
-                if isinstance(input_profile, Profile):
-                    profiles.append(input_profile)
-                else:
-                    p = Profile()
-                    for name, dims in input_profile.items():
-                        assert len(dims) == 3
-                        p.add(name, min=dims[0], opt=dims[1], max=dims[2])
-                    profiles.append(p)
+        for profile in self.profiles:
+            p = Profile()
+            for id, val in profile.items():
+                p.add(id, min=val[0], opt=val[1], max=val[2])
+            profiles.append(p)

         build_args = self.build_args.copy()
         build_args["tf32"] = self.precision != "fp32"
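For context, `self.profiles` is a list of dicts mapping each (unrolled) input name to `[min, opt, max]` shape triples, which the loop above converts to polygraphy `Profile` objects. A sketch with a hypothetical input name and shapes:

```python
from polygraphy.backend.trt import Profile

profiles_spec = [{"x": [[1, 4, 64, 64], [2, 4, 64, 64], [4, 4, 64, 64]]}]
profiles = []
for profile in profiles_spec:
    p = Profile()
    for name, val in profile.items():
        p.add(name, min=val[0], opt=val[1], max=val[2])
    profiles.append(p)
```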
@@ -426,6 +469,9 @@ def _build_and_save(self, model, input_example):

         export_args = self.export_args

+        # remove_non_tensors(input_example)
+
         engine_bytes = None
+        add_casts_around_norms(model)

         if self.method == "torch_trt":
@@ -459,33 +505,46 @@ def get_torch_trt_input(input_shape, dynamic_batchsize):
                 raise ValueError("ERROR: Both dynamic_batchsize and input_profiles set for TrtCompiler!")
             if len(dbs) != 3:
                 raise ValueError("dynamic_batchsize has to have len ==3 ")
-            profiles = {}
+            profile = {}
             for id, val in input_example.items():
-                sh = val.shape[1:]
-                profiles[id] = [[dbs[0], *sh], [dbs[1], *sh], [dbs[2], *sh]]
-            self.profiles = [profiles]
-
-        if len(self.profiles) > 0:
-            export_args.update({"dynamic_axes": get_dynamic_axes(self.profiles)})
+
+                def add_profile(id, val):
+                    sh = val.shape
+                    if len(sh) > 0:
+                        sh = sh[1:]
+                        profile[id] = [[dbs[0], *sh], [dbs[1], *sh], [dbs[2], *sh]]
+
+                if isinstance(val, list | tuple):
+                    for i in range(len(val)):
+                        add_profile(f"{id}_{i}", val[i])
+                elif isinstance(val, torch.Tensor):
+                    add_profile(id, val)
+            self.profiles = [profile]
+
+        self.dynamic_axes = get_dynamic_axes(self.profiles)
+
+        if len(self.dynamic_axes) > 0:
+            export_args.update({"dynamic_axes": self.dynamic_axes})
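To make the `dynamic_batchsize` path concrete: with `dbs = [1, 2, 4]` and a tensor input of per-sample shape `(4, 64)` (values here are illustrative), only the batch dimension varies across the min/opt/max shapes:

```python
dbs = [1, 2, 4]
sh = (4, 64)  # input shape with the batch dimension stripped
profile_entry = [[dbs[0], *sh], [dbs[1], *sh], [dbs[2], *sh]]
# profile_entry == [[1, 4, 64], [2, 4, 64], [4, 4, 64]]
```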
         # Use temporary directory for easy cleanup in case of external weights
         with tempfile.TemporaryDirectory() as tmpdir:
-            onnx_path = Path(tmpdir) / "model.onnx"
+            unrolled_input = unroll_input(self.input_names, input_example)
+            onnx_path = str(Path(tmpdir) / "model.onnx")
             self.logger.info(
-                f"Exporting to {onnx_path}:\n\toutput_names={self.output_names}\n\texport args: {export_args}"
+                f"Exporting to {onnx_path}:\nunrolled_inputs={list(unrolled_input.keys())}\noutput_names={self.output_names}\ninput_names={self.input_names}\nexport args: {export_args}"
             )
             convert_to_onnx(
                 model,
                 input_example,
-                filename=str(onnx_path),
-                input_names=self.input_names,
+                filename=onnx_path,
+                input_names=list(unrolled_input.keys()),
                 output_names=self.output_names,
                 **export_args,
             )
             self.logger.info("Export to ONNX successful.")
-            engine_bytes = self._onnx_to_trt(str(onnx_path))
-
-        open(self.plan_path, "wb").write(engine_bytes)
+            engine_bytes = self._onnx_to_trt(onnx_path)
+            if engine_bytes:
+                open(self.plan_path, "wb").write(engine_bytes)
||
def trt_forward(self, *argv, **kwargs): | ||
|
@@ -540,6 +599,8 @@ def trt_compile( | |
args["timestamp"] = timestamp | ||
|
||
def wrap(model, path): | ||
if not hasattr(model, "_trt_compiler"): | ||
model.orig_forward = model.forward | ||
wrapper = TrtCompiler(model, path + ".plan", logger=logger, **args) | ||
model._trt_compiler = wrapper | ||
model.forward = MethodType(trt_forward, model) | ||
|
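For reference, a minimal usage sketch of the `trt_compile` entry point that this `wrap` helper serves; the model class and plan path here are hypothetical:

```python
import torch
from monai.networks.trt_compiler import trt_compile

model = MyUNet().cuda().eval()  # hypothetical module
# Hooks model.forward; the engine is built lazily on the first call
# and cached as "/models/my_unet.plan" for later runs.
trt_compile(model, "/models/my_unet", args={"precision": "fp16", "fallback": True})
out = model(torch.randn(1, 1, 96, 96, 96, device="cuda"))
```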
monai/networks/utils.py

@@ -631,7 +631,6 @@ def convert_to_onnx(
     use_trace: bool = True,
     do_constant_folding: bool = True,
     constant_size_threshold: int = 16 * 1024 * 1024 * 1024,
-    dynamo=False,
     **kwargs,
 ):
     """
|
@@ -672,6 +671,9 @@ def convert_to_onnx( | |
# let torch.onnx.export to trace the model. | ||
mode_to_export = model | ||
torch_versioned_kwargs = kwargs | ||
if "dynamo" in kwargs and kwargs["dynamo"] and verify: | ||
torch_versioned_kwargs["verify"] = verify | ||
verify = False | ||
else: | ||
if not pytorch_after(1, 10): | ||
if "example_outputs" not in kwargs: | ||
|
@@ -693,7 +695,7 @@ def convert_to_onnx(
         f = io.BytesIO()
     else:
         f = filename
-
+    print(f"torch_versioned_kwargs={torch_versioned_kwargs}")
     torch.onnx.export(
         mode_to_export,
         onnx_inputs,

Review comment: Could you please also modify this part based on the latest API from [...]
@@ -716,6 +718,9 @@ def convert_to_onnx(
         fold_constants(onnx_model, size_threshold=constant_size_threshold)

     if verify:
+        if isinstance(inputs, dict):
+            inputs = list(inputs.values())
+
         if device is None:
             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
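A hedged usage sketch of `convert_to_onnx` with the new dynamo/verify interaction shown above: when dynamo export and verification are both requested, `verify` is forwarded to `torch.onnx.export` and the function's own check is skipped. Model and shapes are hypothetical:

```python
import torch
from monai.networks.utils import convert_to_onnx

model = torch.nn.Linear(4, 2).eval()
onnx_model = convert_to_onnx(
    model,
    inputs=[torch.randn(1, 4)],
    input_names=["x"],
    output_names=["y"],
    verify=True,
    dynamo=True,  # forwarded to torch.onnx.export via **kwargs
)
```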
Review comment: May need more tests for this base image update.

Author reply: Well, it does not make a real difference (the patch I mentioned in the description is needed for 24.09 anyway), so I may revert this one for now, too. 24.10 (and 2.5.0) won't require the exporter patch.

Review comment: Does this mean we'll need to update to version 24.10 once it's released, since 24.09 still doesn't meet the requirements and MAISI still lacks TRT support? I tried to update the base image and trigger more tests in PR #8164, which showed the error below:
#8164 (comment)

Author reply: Yes, I believe it's better to skip 24.09, as it still requires a patch.