diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py
index 84aaa5e7..cb7ebee0 100644
--- a/auto_round/auto_quantizer.py
+++ b/auto_round/auto_quantizer.py
@@ -255,7 +255,7 @@ class AutoRoundQuantizer(HfQuantizer):
 
     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.exllama2_available = is_autoround_exllamav2_available
+        self.exllama2_available = is_autoround_exllamav2_available()
 
     def validate_environment(self, *args, **kwargs):
         if not is_auto_round_available():
@@ -311,7 +311,7 @@ def convert_model(self, model: nn.Module):
         return model
 
     def _dynamic_import_inference_linear(self, bits, backend):
-        if bits == 4 and self.exllama2_available and "exllama2" in backend:
+        if bits == 4 and self.exllama2_available and "exllamav2" in backend:
             from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
         else:
             from auto_round_extension.cuda.qliner_triton import QuantLinear
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 375da000..127ca52c 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -70,14 +70,13 @@ def __init__(self, orig_layer, enable_minmax_tuning=True):
         self.scale_dtype = self.orig_layer.scale_dtype
         self.sym = self.orig_layer.sym
         weight_dtype = self.orig_layer.weight.dtype
-        weight_dtype = torch.float32 ##TODO revert the change to check the accuracy
+        weight_dtype = torch.float32
         self.value = torch.nn.Parameter(
             torch.zeros(self.orig_layer.weight.shape, device=self.orig_layer.weight.device, dtype=weight_dtype),
             requires_grad=True,
         )
         self.enable_minmax_tuning = enable_minmax_tuning
         shape = get_scale_shape(self.orig_layer.weight, self.group_size)
-        weight_dtype = self.orig_layer.weight.dtype
         if self.enable_minmax_tuning:
             self.min_scale = torch.nn.Parameter(
                 torch.zeros(shape, device=self.orig_layer.weight.device, dtype=weight_dtype), requires_grad=True
@@ -178,8 +177,7 @@ def __init__(self, orig_layer, enable_minmax_tuning=True):
         self.sym = self.orig_layer.sym
         self.scale_dtype = self.orig_layer.scale_dtype
         weight_dtype = self.orig_layer.weight.dtype
-        weight_dtype = torch.float32 ##TODO revert the change to check the accuracy
-
+        weight_dtype = torch.float32
         device = self.orig_layer.weight.device
         self.weight_t = self.orig_layer.weight.t()
         self.value = torch.nn.Parameter(
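
Note on the auto_quantizer.py changes (reviewer note, not part of the diff): the first hunk fixes a truthiness bug, where the function object was stored instead of its result, and a function object is always truthy. A minimal sketch, using a hypothetical stand-in for is_autoround_exllamav2_available:

    # Hypothetical stand-in for is_autoround_exllamav2_available; the real
    # check in auto_round probes whether the CUDA exllamav2 extension imports.
    def is_autoround_exllamav2_available():
        return False  # e.g. the extension failed to import

    # Before the fix: the function object itself is stored; function objects
    # are always truthy, so the availability gate could never be False.
    exllama2_available = is_autoround_exllamav2_available
    assert bool(exllama2_available) is True  # wrong even when unavailable

    # After the fix: the function is called and its boolean result is stored.
    exllama2_available = is_autoround_exllamav2_available()
    assert bool(exllama2_available) is False  # reflects real availability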
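
The second hunk matters for the same code path: "exllama2" is not a substring of "exllamav2" (note the intervening "v"), so the old check silently fell back to the Triton kernel even when an exllamav2 backend was requested. A small sketch with a hypothetical backend string:

    backend = "autoround:exllamav2"    # hypothetical backend name for illustration
    assert "exllama2" not in backend   # old check: never matched
    assert "exllamav2" in backend      # fixed check: selects the exllamav2 kernel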