diff --git a/aphrodite/common/utils.py b/aphrodite/common/utils.py
index 285690073..9b2af44ca 100644
--- a/aphrodite/common/utils.py
+++ b/aphrodite/common/utils.py
@@ -1138,8 +1138,9 @@ def tensor_progress_bar(iterable:Iterable[Tuple[str, torch.Tensor]],
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
         BarColumn(),
-        MofNCompleteColumn(),
+        # MofNCompleteColumn(),
         TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        TextColumn("{task.completed:.2f}/{task.total:.2f} GiB"),
         TimeElapsedColumn(),
     ) as progress:
         task = progress.add_task(f"[cyan]{desc}", total=final_bytes/units)
diff --git a/aphrodite/distributed/device_communicators/custom_all_reduce.py b/aphrodite/distributed/device_communicators/custom_all_reduce.py
index f037f8d71..a8f8749bf 100644
--- a/aphrodite/distributed/device_communicators/custom_all_reduce.py
+++ b/aphrodite/distributed/device_communicators/custom_all_reduce.py
@@ -78,12 +78,13 @@ def __init__(self,
             return

         if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
-            logger.warning(
-                "Custom allreduce is disabled due to an unsupported world"
-                f" size: {world_size}. Supported world sizes:"
-                f"{str(CustomAllreduce._SUPPORTED_WORLD_SIZES)}. To silence "
-                "this warning, specify disable_custom_all_reduce=True "
-                "explicitly.")
+            if rank == 0:
+                logger.warning(
+                    "Custom allreduce is disabled due to an unsupported world"
+                    f" size: {world_size}. Supported world sizes:"
+                    f"{str(CustomAllreduce._SUPPORTED_WORLD_SIZES)}. To "
+                    "silence this warning, specify disable_custom_all_reduce="
+                    "True explicitly.")
             return

         if isinstance(device, int):
@@ -119,19 +120,23 @@ def __init__(self,
         cuda_platform: CudaPlatform = current_platform
         full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
         if world_size > 2 and not full_nvlink:
-            logger.warning(
-                "Custom allreduce is disabled because it's not supported on"
-                " more than two PCIe-only GPUs. To silence this warning, "
-                "specify disable_custom_all_reduce=True explicitly.")
+            if rank == 0:
+                logger.warning(
+                    "Custom allreduce is disabled because it's not supported "
+                    "on more than two PCIe-only GPUs. To silence this "
+                    "warning, specify disable_custom_all_reduce=True "
+                    "explicitly.")
             return

         # test P2P capability, this checks software/cudaruntime support
         # this is expensive to compute at the first time
         # then we cache the result
         if not _can_p2p(rank, world_size):
-            logger.warning(
-                "Custom allreduce is disabled because your platform lacks "
-                "GPU P2P capability or P2P test failed. To silence this "
-                "warning, specify disable_custom_all_reduce=True explicitly.")
+            if rank == 0:
+                logger.warning(
+                    "Custom allreduce is disabled because your platform lacks "
+                    "GPU P2P capability or P2P test failed. To silence this "
+                    "warning, specify disable_custom_all_reduce=True "
+                    "explicitly.")
             return
         self.disabled = False
diff --git a/aphrodite/modeling/model_loader/loader.py b/aphrodite/modeling/model_loader/loader.py
index abb45ee11..f63279610 100644
--- a/aphrodite/modeling/model_loader/loader.py
+++ b/aphrodite/modeling/model_loader/loader.py
@@ -353,7 +353,7 @@ def load_model(self, *, model_config: ModelConfig,
                                     "fall_back_to_pt_during_load",
                                     True))
             model.load_weights(tensor_progress_bar(weights, wgt_bytes,
-                                                   "Loading modules..."))
+                                                   "Loading model weights..."))

             for _, module in model.named_modules():
                 quant_method = getattr(module, "quant_method", None)
diff --git a/aphrodite/task_handler/model_runner.py b/aphrodite/task_handler/model_runner.py
index 412aeb5a7..2ec8f1925 100644
--- a/aphrodite/task_handler/model_runner.py
+++ b/aphrodite/task_handler/model_runner.py
@@ -900,13 +900,13 @@ def load_model(self) -> None:
             if rank == 0:
                 logger.info(f"Model loaded in {total_time:.2f} seconds.")
                 logger.info(
-                    "Weights memory usage: "
-                    f"{self.model_memory_usage / float(2**30):.2f} GiB x {tp} ="
-                    f" {self.model_memory_usage * tp / float(2**30):.2f} GiB")
+                    "Total model weights memory usage: "
+                    f"{self.model_memory_usage * tp / float(2**30):.2f} GiB")
         else:
             logger.info(f"Model weights loaded in {total_time:.2f} seconds.")
-            logger.info("Weights memory usage: "
-                        f"{self.model_memory_usage / float(2**30):.2f} GiB")
+            logger.info(
+                "Total model weights memory usage: "
+                f"{self.model_memory_usage / float(2**30):.2f} GiB")

         if self.lora_config:
             assert supports_lora(self.model), "Model does not support LoRA"
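For context on the utils.py hunk above, here is a minimal standalone sketch of how the reworked progress bar renders a percentage alongside a completed/total GiB counter. It assumes only the rich and torch packages; the fake_tensors dict, the sleep() stand-in for copying, and the 2**30 divisor are illustrative and not taken from the patch.

# Minimal sketch (not part of the patch) of the GiB-based progress columns.
import time

import torch
from rich.progress import (BarColumn, Progress, SpinnerColumn, TextColumn,
                           TimeElapsedColumn)

# Hypothetical stand-in for a real weights iterator.
fake_tensors = {f"layer.{i}.weight": torch.empty(1024, 1024) for i in range(8)}
total_bytes = sum(t.numel() * t.element_size() for t in fake_tensors.values())
units = float(2**30)  # report progress in GiB, as the patch does

with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn("{task.completed:.2f}/{task.total:.2f} GiB"),
        TimeElapsedColumn(),
) as progress:
    task = progress.add_task("[cyan]Loading model weights...",
                             total=total_bytes / units)
    for name, tensor in fake_tensors.items():
        time.sleep(0.05)  # stand-in for actually materializing the tensor
        progress.update(task,
                        advance=tensor.numel() * tensor.element_size() / units)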