[UI] Correct "convert_weight_only" to "convert_weights_only"
This is a frequent typo among multiple developers, as "weights" is
typically plural. This commit updates the command-line argument from
`--convert-weight-only` to `--convert-weights-only`. For backwards
compatibility, the original spelling is kept as an equivalent alias.
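As a minimal standalone sketch of the approach (plain argparse, not the project's actual parser-construction code), both spellings can share a single destination, so either flag sets the same attribute:

    import argparse

    parser = argparse.ArgumentParser()
    # Preferred spelling.
    parser.add_argument(
        "--convert-weights-only",
        dest="convert_weights_only",
        action="store_true",
        help="Only convert model weights; do not build the model.",
    )
    # Original spelling, retained as an equivalent alias.
    parser.add_argument(
        "--convert-weight-only",
        dest="convert_weights_only",
        action="store_true",
        help="Equivalent to --convert-weights-only.",
    )

    # Either spelling sets the same attribute.
    assert parser.parse_args(["--convert-weight-only"]).convert_weights_only
    assert parser.parse_args(["--convert-weights-only"]).convert_weights_only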
Lunderberg committed Nov 9, 2023
1 parent 834811f commit 42eb0bb
1 changed file: mlc_llm/core.py (57 additions, 4 deletions)
@@ -31,7 +31,10 @@
stablelm_3b,
)
from mlc_llm.relax_model.commons import create_shard_info_func, create_shard_transformation_func
from mlc_llm.relax_model.param_manager import transform_params_for_each_rank, chain_parameter_transforms
from mlc_llm.relax_model.param_manager import (
transform_params_for_each_rank,
chain_parameter_transforms,
)
from mlc_llm.transform import fuse_split_rotary_embedding, rewrite_attention


@@ -50,75 +53,103 @@ class BuildArgs:
The name of the model to build. If it is ``auto``, we will automatically
set the model name according to ``--model-path``, ``hf-path``, or the model
folders under ``--artifact-path/models``.
hf_path: str
Hugging Face path from which to download params, tokenizer, and config.
quantization: str
The quantization mode we use to compile.
max_seq_len: int
The maximum allowed sequence length for the model.
target: str
The target platform to compile the model for.
db_path: str
Path to log database for all models. Default: ``./log_db/``.
reuse_lib: str
Whether to reuse a previously generated lib.
artifact_path: str
Where to store the output.
use_cache: int
Whether to use previously pickled IRModule and skip trace.
convert_weight_only: bool
convert_weights_only: bool
Whether to only convert model weights and not build the model. If both
``convert_weight_only`` and ``build_model_only`` are set, the behavior is undefined.
build_model_only: bool
Whether to only build the model and not convert model weights.
debug_dump: bool
Whether to dump debugging files during compilation.
debug_load_script: bool
Whether to load the script for debugging.
llvm_mingw: str
``/path/to/llvm-mingw-root``; use llvm-mingw to cross-compile to Windows.
system_lib: bool
A parameter to ``relax.build``.
sep_embed: bool
Build with a separated embedding layer; only applicable to LLaMA. This
feature is in the testing stage and will be formally replaced after a massive
overhaul of the embedding feature for all models and use cases.
sliding_window: int
The sliding window size in sliding window attention (SWA). This optional field
overrides the `sliding_window` in config.json for those models that use SWA.
Currently only useful when compiling Mistral.
sliding_window_chunk_size: int
The chunk size in sliding window attention (SWA) during prefilling. By default,
the chunk size is the same as sliding window. Currently only useful when compiling Mistral.
cc_path: str
``/path/to/cross_compiler_path``; currently only used when cross-compiling
for NVIDIA Jetson devices.
use_safetensors: bool
Specifies whether to use ``.safetensors`` instead of the default ``.bin``
when loading in model weights.
enable_batching: bool
Build the model for batched inference.
This is a temporary flag used to control the model execution flow in
single-sequence and batching settings. The two flows will eventually be
merged, and this flag will then be removed.
no_cutlass_attn: bool
Disable offloading attention operations to CUTLASS.
no_cutlass_norm: bool
Disable offloading layer and RMS norm operations to CUTLASS.
no_cublas: bool
Disable the step that offloads matmul to cuBLAS. Without this flag,
matmul will be offloaded to cuBLAS if quantization mode is ``q0f16`` or
``q0f32``, target is CUDA and TVM has been built with cuBLAS enabled.
use_cuda_graph: bool
Specifies whether to enable CUDA Graph for the decoder. MLP and QKV
projection between two attention layers are put into a graph.
num_shards: int
Number of shards to split the model into for tensor-parallel multi-GPU
inference. Only useful when ``build_model_only`` is set.
use_flash_attn_mqa: bool
Offload multi-query attention workload to Flash Attention.
pdb: bool
If set, drop into a pdb debugger on error.
use_vllm_attention: bool
Use vLLM paged KV cache and attention kernel, only relevant when enable_batching=True.
"""
@@ -159,8 +190,9 @@ class BuildArgs:
default=1,
metadata={"help": "Whether to use previously pickled IRModule and skip trace."},
)
convert_weight_only: bool = field(
convert_weights_only: bool = field(
default=False,
dest="convert_weights_only",
metadata={
"help": "Whether to only convert model weights and not build the model.",
"action": "store_true",
@@ -333,6 +365,11 @@ class BuildArgs:
},
)

@property
def convert_weight_only(self):
"""A backwards-compatibility helper"""
return self.convert_weights_only


def convert_build_args_to_argparser() -> argparse.ArgumentParser:
"""Convert from BuildArgs to an equivalent ArgumentParser."""
@@ -347,6 +384,20 @@ def convert_build_args_to_argparser() -> argparse.ArgumentParser:
args.add_argument(field_name, default=field.default, **kwargs)
else:
args.add_argument(field_name, type=field.type, default=field.default, **kwargs)

# Most models contain more than a single parameter (citation
# needed), so "weights" should be plural. The initial use of
# "--convert-weight-only" caused enough typos that it is worth
# fixing. The old argument spelling is retained for backwards
# compatibility.
args.add_argument(
"--convert-weight-only",
default=False,
dest="convert_weights_only",
action="store_true",
help="Equivalent to --convert-weights-only, retained for backwards compatibility.",
)

return args


@@ -794,7 +845,9 @@ def build_model_from_args(args: argparse.Namespace):
mod_transform = seq(mod_transform)

params = utils.convert_weights(mod_transform, param_manager, params, args)
utils.save_params(params, args.artifact_path, args.num_shards if args.use_presharded_weights else 1)
utils.save_params(
params, args.artifact_path, args.num_shards if args.use_presharded_weights else 1
)

if args.model_category != "minigpt":
utils.copy_tokenizer(args)