More gardening for open source release (#22)
* removed comments, refactored compilation code

* pinning versions

* fp8 module

* pinned nightlies + fp8 paths

* removing half-implemented profiling

* ruff
daanelson authored Sep 23, 2024
1 parent 5c8c435 commit 99cfbb7
Showing 14 changed files with 71 additions and 148 deletions.
10 changes: 6 additions & 4 deletions cog.yaml.template
@@ -20,13 +20,15 @@ build:
- "tokenizers==0.19.1"
- "protobuf==5.27.2"
- "diffusers==0.29.2"
-- "loguru"
-- "pybase64"
+- "loguru==0.7.2"
+- "pybase64==1.4.0"
+- "pydash==8.0.3"


# commands run after the environment is setup
run:
- curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
- pip uninstall -y torch torchvision torchaudio
-- pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124
-- pip install pydash
+# - pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124
+# pinning to specific nightlies for release
+- pip3 install https://download.pytorch.org/whl/nightly/cu124/torch-2.6.0.dev20240918%2Bcu124-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu124/torchaudio-2.5.0.dev20240918%2Bcu124-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu124/torchvision-0.20.0.dev20240918%2Bcu124-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/pytorch_triton-3.1.0%2B5fe38ffd73-cp311-cp311-linux_x86_64.whl
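A quick way to confirm that the pinned nightlies actually took effect in the built image is to import the three packages and compare their reported versions against the wheel filenames pinned above. A minimal sketch; the expected values are read off the wheel names, not captured from a real build:

# Sanity-check the pinned PyTorch nightlies after the `run` steps complete.
# Expected values are inferred from the wheel filenames pinned above.
import torch
import torchaudio
import torchvision

print(torch.__version__)        # expected to start with "2.6.0.dev20240918+cu124"
print(torchaudio.__version__)   # expected to start with "2.5.0.dev20240918+cu124"
print(torchvision.__version__)  # expected to start with "0.20.0.dev20240918+cu124"
print(torch.version.cuda)       # expected "12.4"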
Empty file added fp8/__init__.py
Empty file.
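The empty __init__.py is what turns fp8/ into an importable package, which the fp8.* import rewrites throughout this commit rely on. A minimal import check, using only names that appear in the hunks below; it assumes the repo root is on PYTHONPATH and the dependencies from cog.yaml.template are installed:

# All symbols below are taken from the import rewrites in this commit.
from fp8.float8_quantize import F8Linear
from fp8.flux_pipeline import FluxPipeline
from fp8.modules.flux_model import Flux, Modulation
from fp8.util import ModelSpec, load_config_from_path

print(F8Linear, FluxPipeline, Flux, Modulation, ModelSpec, load_config_from_path)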
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion float8_quantize.py → fp8/float8_quantize.py
@@ -7,7 +7,7 @@
from torch import __version__
from torch.version import cuda

-from modules.flux_model import Modulation
+from fp8.modules.flux_model import Modulation

IS_TORCH_2_4 = __version__ < (2, 4, 9)
LT_TORCH_2_4 = __version__ < (2, 4)
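The IS_TORCH_2_4 / LT_TORCH_2_4 gates above work because torch exposes its version as a TorchVersion, a str subclass that also compares against tuples. A standalone illustration, assuming torch is installed; the version string is the nightly pinned in cog.yaml.template, and the printed results follow from that string rather than from a captured run:

from torch.torch_version import TorchVersion

v = TorchVersion("2.6.0.dev20240918+cu124")  # version string of the pinned nightly wheel
print(v < (2, 4))     # False: the 2.6 nightly is newer than 2.4
print(v < (2, 4, 9))  # False: also newer than 2.4.9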
19 changes: 7 additions & 12 deletions flux_pipeline.py → fp8/flux_pipeline.py
@@ -15,11 +15,6 @@
import torch
from einops import rearrange, repeat

-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-torch.backends.cudnn.benchmark = True
-torch.backends.cudnn.benchmark_limit = 20
-torch.set_float32_matmul_precision("high")
from pybase64 import standard_b64decode
from torch._dynamo import config
from torch._inductor import config as ind_config
@@ -32,9 +27,9 @@
from torchvision.transforms import functional as TF
from tqdm import tqdm

-import lora_loading
-from image_encoder import ImageEncoder
-from util import (
+import fp8.lora_loading as lora_loading
+from fp8.image_encoder import ImageEncoder
+from fp8.util import (
LoadedModels,
ModelSpec,
ModelVersion,
@@ -51,9 +46,9 @@


if TYPE_CHECKING:
-from modules.autoencoder import AutoEncoder
-from modules.conditioner import HFEmbedder
-from modules.flux_model import Flux
+from fp8.modules.autoencoder import AutoEncoder
+from fp8.modules.conditioner import HFEmbedder
+from fp8.modules.flux_model import Flux


class FluxPipeline:
@@ -738,7 +733,7 @@ def load_pipeline_from_config_path(
def load_pipeline_from_config(
cls, config: ModelSpec, debug: bool = False, shared_models: LoadedModels = None
) -> "FluxPipeline":
-from float8_quantize import quantize_flow_transformer_and_dispatch_float8
+from fp8.float8_quantize import quantize_flow_transformer_and_dispatch_float8

with torch.inference_mode():
if debug:
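The TYPE_CHECKING block above imports the fp8.modules classes for annotations only, so flux_pipeline.py does not pay for (or circularly depend on) those imports at runtime. A self-contained sketch of the same idiom; the describe function and its use of Flux are invented for the example:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated by type checkers only; skipped entirely at runtime.
    from fp8.modules.flux_model import Flux

def describe(model: "Flux") -> str:
    # The quoted annotation means Flux never has to be importable at runtime here.
    return type(model).__name__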
File renamed without changes.
4 changes: 2 additions & 2 deletions lora_loading.py → fp8/lora_loading.py
@@ -7,8 +7,8 @@
from cublas_ops import CublasLinear
except Exception as e:
CublasLinear = type(None)
-from float8_quantize import F8Linear
-from modules.flux_model import Flux
+from fp8.float8_quantize import F8Linear
+from fp8.modules.flux_model import Flux


def swap_scale_shift(weight):
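The try/except above is an optional-dependency fallback: if cublas_ops is missing, CublasLinear is bound to type(None), so later isinstance checks quietly return False instead of raising. A standalone illustration; only the cublas_ops import and the type(None) fallback come from the hunk, the nn.Linear check is invented for the example:

import torch.nn as nn

try:
    from cublas_ops import CublasLinear  # optional dependency, as in the hunk above
except Exception:
    CublasLinear = type(None)            # NoneType; ordinary modules are never instances of it

layer = nn.Linear(4, 4)
print(isinstance(layer, CublasLinear))   # False whenever cublas_ops is not installed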
File renamed without changes.
File renamed without changes.
16 changes: 8 additions & 8 deletions modules/flux_model.py → fp8/modules/flux_model.py
@@ -4,7 +4,7 @@
import torch

if TYPE_CHECKING:
-from util import ModelSpec
+from fp8.util import ModelSpec

DISABLE_COMPILE = os.getenv("DISABLE_COMPILE", "0") == "1"
torch.backends.cuda.matmul.allow_tf32 = True
@@ -118,7 +118,7 @@ class MLPEmbedder(nn.Module):
def __init__(
self, in_dim: int, hidden_dim: int, prequantized: bool = False, quantized=False
):
-from float8_quantize import F8Linear
+from fp8.float8_quantize import F8Linear

super().__init__()
self.in_layer = (
@@ -188,7 +188,7 @@ def __init__(
prequantized: bool = False,
):
super().__init__()
-from float8_quantize import F8Linear
+from fp8.float8_quantize import F8Linear

self.num_heads = num_heads
head_dim = dim // num_heads
@@ -236,7 +236,7 @@ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
class Modulation(nn.Module):
def __init__(self, dim: int, double: bool, quantized_modulation: bool = False):
super().__init__()
-from float8_quantize import F8Linear
+from fp8.float8_quantize import F8Linear

self.is_double = double
self.multiplier = 6 if double else 3
@@ -272,7 +272,7 @@ def __init__(
prequantized: bool = False,
):
super().__init__()
-from float8_quantize import F8Linear
+from fp8.float8_quantize import F8Linear

self.dtype = dtype

@@ -417,7 +417,7 @@ def __init__(
prequantized: bool = False,
):
super().__init__()
-from float8_quantize import F8Linear
+from fp8.float8_quantize import F8Linear

self.dtype = dtype
self.hidden_dim = hidden_size
@@ -515,7 +515,7 @@ def __init__(self, config: "ModelSpec", dtype: torch.dtype = torch.float16):
prequantized_flow = config.prequantized_flow
quantized_embedders = config.quantize_flow_embedder_layers and prequantized_flow
quantized_modulation = config.quantize_modulation and prequantized_flow
-from float8_quantize import F8Linear
+from fp8.float8_quantize import F8Linear

if config.params.hidden_size % config.params.num_heads != 0:
raise ValueError(
@@ -671,7 +671,7 @@ def forward(
def from_pretrained(
cls: "Flux", path: str, dtype: torch.dtype = torch.float16
) -> "Flux":
-from util import load_config_from_path
+from fp8.util import load_config_from_path
from safetensors.torch import load_file

config = load_config_from_path(path)
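Each constructor in flux_model.py imports F8Linear inside the function body rather than at module scope. That is a deferred-import pattern: fp8.float8_quantize imports Modulation from this module at import time (see its hunk above), so a top-level import in the opposite direction would be circular. A two-file sketch of the same situation, with invented module and class names standing in for the real ones:

# widget_mod.py -- plays the role of fp8/modules/flux_model.py
class Widget:                                 # stands in for Modulation
    def __init__(self, quantized: bool = False):
        # Deferred import: quant_mod imports Widget from this module at import
        # time, so importing QuantLinear at module scope here would be circular.
        from quant_mod import QuantLinear     # stands in for F8Linear
        self.layer = QuantLinear() if quantized else None

# quant_mod.py -- plays the role of fp8/float8_quantize.py
from widget_mod import Widget                 # module-level import in one direction only

class QuantLinear:
    pass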
53 changes: 4 additions & 49 deletions util.py → fp8/util.py
@@ -4,9 +4,9 @@
from typing import Any, Literal, Optional

import torch
-from modules.autoencoder import AutoEncoder, AutoEncoderParams
-from modules.conditioner import HFEmbedder
-from modules.flux_model import Flux, FluxParams
+from fp8.modules.autoencoder import AutoEncoder, AutoEncoderParams
+from fp8.modules.conditioner import HFEmbedder
+from fp8.modules.flux_model import Flux, FluxParams
from safetensors.torch import load_file as load_sft

try:
@@ -34,55 +34,10 @@ class QuantizationDtype(StrEnum):
qint8 = "qint8"


-# @dataclass
-# class ModelSpec:
-# version: ModelVersion
-# params: FluxParams
-# ae_params: AutoEncoderParams
-# ckpt_path: str | None
-# ae_path: str | None
-# repo_id: str | None
-# repo_flow: str | None
-# repo_ae: str | None
-# text_enc_max_length: int = 512
-# text_enc_path: str | None = None
-# text_enc_device: str | torch.device | None = "cuda:0"
-# ae_device: str | torch.device | None = "cuda:0"
-# flux_device: str | torch.device | None = "cuda:0"
-# flow_dtype: str = "float16"
-# ae_dtype: str = "bfloat16"
-# text_enc_dtype: str = "bfloat16"
-# num_to_quant: Optional[int] = 20
-# quantize_extras: bool = False
-# compile_extras: bool = False
-# compile_blocks: bool = False
-# flow_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
-# text_enc_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
-# ae_quantization_dtype: Optional[QuantizationDtype] = None
-# clip_quantization_dtype: Optional[QuantizationDtype] = None
-# offload_text_encoder: bool = False
-# offload_vae: bool = False
-# offload_flow: bool = False
-# prequantized_flow: bool = False
-# quantize_modulation: bool = True
-# quantize_flow_embedder_layers: bool = False
-
-# @dataclass
-# class LoadedModels:
-# flow: Flux
-# ae: AutoEncoder
-# clip: HFEmbedder
-# t5: HFEmbedder
-# config: ModelSpec
-
class ModelSpec(BaseModel):
class Config:
arbitrary_types_allowed = True
use_enum_values = True
-# model_config: ConfigDict = {
-# "arbitrary_types_allowed": True,
-# "use_enum_values": True,
-# }
version: ModelVersion
params: FluxParams
ae_params: AutoEncoderParams
@@ -325,7 +280,7 @@ def load_autoencoder(config: ModelSpec) -> AutoEncoder:
print_load_warning(missing, unexpected)
ae.to(device=into_device(config.ae_device), dtype=into_dtype(config.ae_dtype))
if config.ae_quantization_dtype is not None:
-from float8_quantize import recursive_swap_linears
+from fp8.float8_quantize import recursive_swap_linears

recursive_swap_linears(ae)
if config.offload_vae:
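With the commented-out dataclass variants deleted, ModelSpec exists only as a pydantic BaseModel, and its inner Config is what lets it hold torch-flavored field types and enum-valued settings. A minimal sketch of what those two flags do; Holder, CustomDevice, and Color are invented for the example, and it assumes the pydantic v1-style class Config shown above:

from enum import Enum
from pydantic import BaseModel

class Color(str, Enum):
    red = "red"

class CustomDevice:
    """Stands in for non-pydantic field types such as torch devices or params."""

class Holder(BaseModel):
    class Config:
        arbitrary_types_allowed = True  # allow fields typed with plain classes like CustomDevice
        use_enum_values = True          # store the enum's value ("red"), not the member (Color.red)

    device: CustomDevice
    color: Color

h = Holder(device=CustomDevice(), color=Color.red)
print(h.color)  # -> red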
