Fix exllamav2 backend issue #144

Merged 4 commits on Jun 3, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions auto_round/auto_quantizer.py
@@ -255,7 +255,7 @@ class AutoRoundQuantizer(HfQuantizer):

     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.exllama2_available = is_autoround_exllamav2_available
+        self.exllama2_available = is_autoround_exllamav2_available()

     def validate_environment(self, *args, **kwargs):
         if not is_auto_round_available():
@@ -311,7 +311,7 @@ def convert_model(self, model: nn.Module):
         return model

     def _dynamic_import_inference_linear(self, bits, backend):
-        if bits == 4 and self.exllama2_available and "exllama2" in backend:
+        if bits == 4 and self.exllama2_available and "exllamav2" in backend:
             from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
         else:
             from auto_round_extension.cuda.qliner_triton import QuantLinear
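For context, a minimal standalone sketch of the two bugs this file change fixes. It is not part of the PR; the stub return value and the example backend string are assumptions for illustration only:

# Sketch only: illustrates the two conditions corrected above.

def is_autoround_exllamav2_available():
    return False  # assume the exllamav2 CUDA extension is not installed

# Bug 1: missing call parentheses. A bare function object is always truthy,
# so the availability flag looked True even when the kernel was unavailable.
exllama2_available = is_autoround_exllamav2_available
print(bool(exllama2_available))                  # True  (wrong: the check never ran)
print(bool(is_autoround_exllamav2_available()))  # False (correct after the fix)

# Bug 2: substring mismatch. "exllama2" is not a substring of "exllamav2",
# so the exllamav2 QuantLinear branch was never taken.
backend = "exllamav2"            # assumed example backend string
print("exllama2" in backend)     # False -> fell through to the Triton import
print("exllamav2" in backend)    # True  -> selects qliner_exllamav2.QuantLinear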
6 changes: 2 additions & 4 deletions auto_round/autoround.py
@@ -70,14 +70,13 @@ def __init__(self, orig_layer, enable_minmax_tuning=True):
         self.scale_dtype = self.orig_layer.scale_dtype
         self.sym = self.orig_layer.sym
         weight_dtype = self.orig_layer.weight.dtype
-        weight_dtype = torch.float32 ##TODO revert the change to check the accuracy
+        weight_dtype = torch.float32
         self.value = torch.nn.Parameter(
             torch.zeros(self.orig_layer.weight.shape, device=self.orig_layer.weight.device, dtype=weight_dtype),
             requires_grad=True,
         )
         self.enable_minmax_tuning = enable_minmax_tuning
         shape = get_scale_shape(self.orig_layer.weight, self.group_size)
-        weight_dtype = self.orig_layer.weight.dtype
         if self.enable_minmax_tuning:
             self.min_scale = torch.nn.Parameter(
                 torch.zeros(shape, device=self.orig_layer.weight.device, dtype=weight_dtype), requires_grad=True
@@ -178,8 +177,7 @@ def __init__(self, orig_layer, enable_minmax_tuning=True):
         self.sym = self.orig_layer.sym
         self.scale_dtype = self.orig_layer.scale_dtype
         weight_dtype = self.orig_layer.weight.dtype
-        weight_dtype = torch.float32 ##TODO revert the change to check the accuracy
-
+        weight_dtype = torch.float32
         device = self.orig_layer.weight.device
         self.weight_t = self.orig_layer.weight.t()
         self.value = torch.nn.Parameter(
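For context, a minimal standalone sketch of what the autoround.py hunks mean for the tuning parameters: with the second reassignment of weight_dtype removed, the dtype stays torch.float32, so the rounding value and the min/max scale parameters are all created in float32 even when the wrapped layer holds fp16 weights. The example weight shape and scale shape are assumptions, not taken from the repository:

import torch

# Sketch only: assumed fp16 layer weight and an assumed scale shape.
orig_weight = torch.zeros(8, 16, dtype=torch.float16)
scale_shape = (8, 1)

weight_dtype = orig_weight.dtype
weight_dtype = torch.float32  # forced, as in the hunks above

value = torch.nn.Parameter(
    torch.zeros(orig_weight.shape, device=orig_weight.device, dtype=weight_dtype),
    requires_grad=True,
)
# Previously weight_dtype was reset to orig_weight.dtype at this point, so the
# scale parameters were created in fp16; after the change they stay in float32.
min_scale = torch.nn.Parameter(
    torch.zeros(scale_shape, device=orig_weight.device, dtype=weight_dtype),
    requires_grad=True,
)
print(value.dtype, min_scale.dtype)  # torch.float32 torch.float32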