quant: update tpu_int8 to use AphroditeParameters (#959)

PygmalionAI · Dec 22, 2024 · f4b62bf · f4b62bf
1 parent 9ff3239
commit f4b62bf
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 11 deletions.
diff --git a/aphrodite/modeling/layers/linear.py b/aphrodite/modeling/layers/linear.py
@@ -27,7 +27,7 @@
     "CompressedTensorsLinearMethod", "GPTQMarlinLinearMethod",
     "AWQMarlinLinearMethod", "AWQLinearMethod", "HQQMarlinMethod",
     "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod",
-    "GPTQMarlin24LinearMethod",
+    "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod"
 ]
 
 

diff --git a/aphrodite/quantization/tpu_int8.py b/aphrodite/quantization/tpu_int8.py
@@ -5,7 +5,7 @@
 from torch.nn.parameter import Parameter
 
 from aphrodite.modeling.layers.linear import LinearBase, LinearMethodBase
-from aphrodite.modeling.utils import set_weight_attrs
+from aphrodite.modeling.parameter import ModelWeightParameter
 from aphrodite.quantization.base_config import QuantizationConfig
 
 ACTIVATION_SCHEMES = ["none"]
@@ -63,16 +63,16 @@ def create_weights(self, layer: Module, input_size_per_partition: int,
                        output_partition_sizes: List[int], input_size: int,
                        output_size: int, params_dtype: torch.dtype,
                        **extra_weight_attrs):
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=params_dtype),
-                           requires_grad=False)
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        weight = ModelWeightParameter(data=torch.empty(
+            sum(output_partition_sizes),
+            input_size_per_partition,
+            dtype=params_dtype),
+                                      input_dim=1,
+                                      output_dim=0,
+                                      weight_loader=weight_loader)
         layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, {
-            **extra_weight_attrs,
-            "input_dim": 1,
-            "output_dim": 0,
-        })
 
     def _quantize_weight(
             self, weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -91,6 +91,7 @@ def _quantize_weight(
         return qweight, qscale
 
     def process_weights_after_loading(self, layer: Module) -> None:
+        layer.weight = Parameter(layer.weight.data, requires_grad=False)
         device = layer.weight.device
         qweight, qscale = self._quantize_weight(layer.weight)
         qweight = qweight.to(device)