diff --git a/.github/.test_durations b/.github/.test_durations index 4a5e8915..e5bbb6f1 100644 --- a/.github/.test_durations +++ b/.github/.test_durations @@ -72,69 +72,74 @@ "tests/dry_test/test_models.py::test_models_dry_run[qwen-turbo-extra_args2-gsm8k]": 22.391773478000232, "tests/dry_test/test_models.py::test_models_dry_run[qwen-turbo-extra_args2-hellaswag]": 29.800566227000445, "tests/dry_test/test_models.py::test_models_dry_run[qwen-turbo-extra_args2-mmlu]": 27.11841242399987, - "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-0-None-None]": 19.47190578499999, - "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-2-least_to_most-None]": 19.036557892000246, - "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-None-None]": 17.31018888100016, - "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-base-None]": 17.836719111999855, - "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-least_to_most-None]": 21.655824718000076, - "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-pal-None]": 18.86650374399983, - "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-generation]": 0.18995946199970604, - "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-ppl]": 0.3936774169999353, - "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-ppl_no_option]": 0.2313289519997852, - "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-prob]": 0.1575275150003108, - "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-5-None-ppl]": 0.675848115000008, - "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-5-None-prob]": 0.2678239950000716, - "tests/utilization/model/test_apply_prompt_template.py::test_base": 0.0056042960002287145, - "tests/utilization/model/test_apply_prompt_template.py::test_final_strip": 0.0014387530000021798, - "tests/utilization/model/test_apply_prompt_template.py::test_llama2": 0.0019011760000466893, - "tests/utilization/model/test_apply_prompt_template.py::test_no_smart_space": 0.0026229310001326667, - "tests/utilization/model/test_apply_prompt_template.py::test_smart_space": 0.0018796639999436593, - "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[generation-False]": 0.0033287819999259227, - "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[generation-True]": 0.002076140999861309, - "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_ppl-False]": 0.002438235000226996, - "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_ppl-True]": 0.0021245209998141945, - "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_prob-False]": 0.0019722209999599727, - "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_prob-True]": 0.002425271000220164, - "tests/utilization/utils/test_batch_sampler.py::test_auto_batch_sampler": 0.001046856999892043, - "tests/utilization/utils/test_batch_sampler.py::test_auto_batch_sampler_auto_batching": 0.0025648010002896626, - "tests/utilization/utils/test_batch_sampler.py::test_dcbs": 0.8340403449999485, - "tests/utilization/utils/test_batch_sampler.py::test_dcbs_auto_batching": 0.06662718100005804, - "tests/utilization/utils/test_batch_sampler.py::test_dcbs_few_shot": 0.0838686949998646, - 
"tests/utilization/utils/test_batch_sampler.py::test_dcbs_few_shot_prefix_caching": 3.93439717199999, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-False-no_split]": 0.002944755367934704, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-False-split]": 0.0031265132129192352, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-True-no_split]": 0.002978229895234108, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-True-split]": 0.003244483843445778, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-False-no_split]": 0.00300593301653862, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-False-split]": 0.003051883541047573, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-True-no_split]": 0.00298389233648777, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-True-split]": 0.011334518902003765, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-False-no_split]": 0.00297668669372797, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-False-split]": 0.003215758129954338, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-True-no_split]": 0.0030325893312692642, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-True-split]": 0.003189575858414173, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-False-no_split]": 0.0030073318630456924, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-False-split]": 0.003115130588412285, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-True-no_split]": 0.002978302538394928, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-True-split]": 0.0031450027599930763, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-False-no_split]": 0.002832619473338127, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-False-split]": 0.0028070490807294846, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-True-no_split]": 0.0029166433960199356, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-True-split]": 0.005571841262280941, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-False-no_split]": 0.0028258198872208595, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-False-split]": 0.0027440031990408897, - "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-True-no_split]": 0.0027814237400889397, - 
"tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-True-split]": 0.0028041303157806396, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-False-no_split]": 0.0029122764244675636, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-False-split]": 0.0029731355607509613, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-True-no_split]": 0.002978702075779438, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-True-split]": 0.0030855443328619003, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-False-no_split]": 0.002977837808430195, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-False-split]": 0.003017907030880451, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-True-no_split]": 0.002957606688141823, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-True-split]": 0.0033153872936964035, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-False-no_split]": 0.002739163115620613, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-False-split]": 0.002798774279654026, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-True-no_split]": 0.0028306040912866592, - "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-True-split]": 0.002850011922419071 + "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-0-None-None]": 16.317220823839307, + "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-2-least_to_most-None]": 16.421796096488833, + "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-None-None]": 15.621955395676196, + "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-base-None]": 13.577086235396564, + "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-least_to_most-None]": 15.5470489859581, + "tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-pal-None]": 14.772390438243747, + "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-generation]": 0.06618671026080847, + "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-ppl]": 0.1658585388213396, + "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-ppl_no_option]": 0.1148089962080121, + "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-0-None-prob]": 0.07302630506455898, + "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-5-None-ppl]": 0.19693499617278576, + "tests/utilization/dataset/test_formatting.py::test_formatting[mmlu:abstract_algebra-5-None-prob]": 0.09083058871328831, + "tests/utilization/dataset/test_formatting.py::test_multi_turn": 5.1760842725634575, + "tests/utilization/model/test_apply_prompt_template.py::test_base": 0.0028072865679860115, + 
"tests/utilization/model/test_apply_prompt_template.py::test_final_strip": 0.0016121016815304756, + "tests/utilization/model/test_apply_prompt_template.py::test_llama2": 0.0018699830397963524, + "tests/utilization/model/test_apply_prompt_template.py::test_no_smart_space": 0.0017823278903961182, + "tests/utilization/model/test_apply_prompt_template.py::test_smart_space": 0.0016713934019207954, + "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[generation-False]": 0.0020645475015044212, + "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[generation-True]": 0.0018382547423243523, + "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_ppl-False]": 0.0018322393298149109, + "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_ppl-True]": 0.0018253298476338387, + "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_prob-False]": 0.0017082132399082184, + "tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_prob-True]": 0.0017740670591592789, + "tests/utilization/utils/test_batch_sampler.py::test_auto_batch_sampler": 0.0006852876394987106, + "tests/utilization/utils/test_batch_sampler.py::test_auto_batch_sampler_auto_batching": 0.0011906828731298447, + "tests/utilization/utils/test_batch_sampler.py::test_dcbs": 24.25295663345605, + "tests/utilization/utils/test_batch_sampler.py::test_dcbs_auto_batching": 0.028026112355291843, + "tests/utilization/utils/test_batch_sampler.py::test_dcbs_few_shot": 0.029421145096421242, + "tests/utilization/utils/test_batch_sampler.py::test_dcbs_few_shot_prefix_caching": 21.832826550118625, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-False-no_split]": 0.002918471582233906, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-False-split]": 0.003132498823106289, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-True-no_split]": 0.0029817093163728714, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:api-True-split]": 0.0031080804765224457, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-False-no_split]": 0.002980290912091732, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-False-split]": 0.0030254973098635674, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-True-no_split]": 0.0028862925246357918, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample1:local-True-split]": 0.003034195862710476, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-False-no_split]": 0.0030438993126153946, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-False-split]": 0.0031886026263237, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-True-no_split]": 0.0029781851917505264, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:api-True-split]": 0.003177349455654621, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-False-no_split]": 
0.002928532660007477, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-False-split]": 0.0030969344079494476, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-True-no_split]": 0.0029563801363110542, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:conv:sample2:local-True-split]": 0.003104960545897484, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-False-no_split]": 0.0027246493846178055, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-False-split]": 0.0027040820568799973, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-True-no_split]": 0.002927420660853386, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample1:local-True-split]": 0.018455663695931435, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-False-no_split]": 0.002770565450191498, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-False-split]": 0.0027120010927319527, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-True-no_split]": 0.0027657533064484596, + "tests/utilization/utils/test_log_results.py::test_log_final_results[generation:no_norm:legacy:sample2:local-True-split]": 0.0028459904715418816, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-False-no_split]": 0.0029102256521582603, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-False-split]": 0.002931329421699047, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-True-no_split]": 0.002965434454381466, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:acc_norm:legacy:sample1:local-True-split]": 0.0031904708594083786, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-False-no_split]": 0.0029605934396386147, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-False-split]": 0.0029690107330679893, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-True-no_split]": 0.0028815260156989098, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_ppl:no_norm:legacy:sample1:local-True-split]": 0.003189605660736561, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-False-no_split]": 0.0028251782059669495, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-False-split]": 0.0027860552072525024, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-True-no_split]": 0.0028046108782291412, + "tests/utilization/utils/test_log_results.py::test_log_final_results[get_prob:no_norm:legacy:sample1:local-True-split]": 0.0028313947841525078, + "tests/utilization/utils/test_parse_arguments.py::test_default_no_efficient": 0.00703858956694603, + 
"tests/utilization/utils/test_parse_arguments.py::test_default_prefix_caching": 0.007218576036393642, + "tests/utilization/utils/test_parse_arguments.py::test_default_vllm": 0.00684856902807951, + "tests/utilization/utils/test_parse_arguments.py::test_no_prefix_caching": 0.007001873105764389 } \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index 75ec2b2e..66cd610c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,28 +2,30 @@ ## Training -[Training](https://github.com/RUCAIBox/LLMBox/tree/main/training) +Tutorial: [Training](https://github.com/RUCAIBox/LLMBox/tree/main/training) ## Utilization -[Utilization](https://github.com/RUCAIBox/LLMBox/tree/main/utilization) +CLI Usage: [Utilization](https://github.com/RUCAIBox/LLMBox/tree/main/utilization) +Reproduction: [test.sh](https://github.com/RUCAIBox/LLMBox/blob/main/test.sh) ### Datasets -- [Supported datasets](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/supported-datasets.md) -- [How to load datasets with subsets](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-load-datasets-with-subsets.md) -- [How to load datasets from HuggingFace](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-load-datasets-from-huggingface.md) -- [How to customize dataset](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-customize-dataset.md) +- [Supported datasets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md) +- [How to load datasets with subsets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-load-datasets-with-subsets.md) +- [How to load datasets from HuggingFace](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-load-datasets-from-huggingface.md) +- [How to customize dataset](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-customize-dataset.md) ### Models -- [How to customize model](https://github.com/RUCAIBox/LLMBox/tree/main/docs/utilization/how-to-customize-model.md) +- [How to customize model](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-customize-model.md) ## Examples -- [Customize dataset](https://github.com/RUCAIBox/LLMBox/tree/main/docs/examples/customize_dataset.py) -- [Customize HuggingFace model](https://github.com/RUCAIBox/LLMBox/tree/main/docs/examples/customize_huggingface_model.py) +- [Customize dataset](https://github.com/RUCAIBox/LLMBox/blob/main/docs/examples/customize_dataset.py) +- [Customize HuggingFace model](https://github.com/RUCAIBox/LLMBox/blob/main/docs/examples/customize_huggingface_model.py) ## Trouble Shooting -- [vLLM no module name packaging](https://github.com/RUCAIBox/LLMBox/tree/main/docs/trouble_shooting/vllm_no_module_name_packaging.md) +- [Debug an evaluation run](https://github.com/RUCAIBox/LLMBox/blob/main/docs/trouble_shooting/debug_evaluation_run.md) +- [vLLM no module name packaging](https://github.com/RUCAIBox/LLMBox/blob/main/docs/trouble_shooting/vllm_no_module_name_packaging.md) diff --git a/docs/trouble_shooting/debug_evaluation_run.md b/docs/trouble_shooting/debug_evaluation_run.md new file mode 100644 index 00000000..4573397a --- /dev/null +++ b/docs/trouble_shooting/debug_evaluation_run.md @@ -0,0 +1,232 @@ +# [Trouble Shooting] How to Debug an Evaluation Run + +This tutorial focuses on debugging an evaluation run that does not reproduce the expected results. 
This can be caused by the model not generating any predictions, the dataset not being formatted correctly, or the metrics not being calculated correctly. This tutorial will guide you through the process of locating the problem and fixing it. + +## Locating the Problem + +Every run of the model should produce an evaluation results file, which contains the input data and the model's predictions. You can find this file in the `evaluation_results` folder. + +If the process ended normally, the file should be a valid JSON file with metrics: + +```json +[ + { + "index":0, + "source":"<|start_header_id|>user<|end_header_id|>\n\nAnswer the following question.\n\nQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nAnswer: <|eot_id|> <|start_header_id|>assistant<|end_header_id|>", + "raw_prediction":[ + "\n\nLet's break this down step by step!\n\nJanet's ducks lay 16 eggs per day. She eats 3 for breakfast, so that leaves:\n\n16 - 3 = 13 eggs\n\nShe bakes muffins with 4 eggs, so that leaves:\n\n13 - 4 = 9 eggs\n\nShe sells the remaining 9 eggs at the farmers' market. Each egg sells for $2, so she makes:\n\n9 eggs x $2 per egg = $18\n\nJanet makes $18 every day at the farmers' market." + ], + "processed_prediction":[ + "18" + ], + "reference":"18", + "metric":{ + "Accuracy":true + }, + "subset":null + }, + ... +] +``` + +Alternatively, if the process ended prematurely, the file will be a valid jsonlines file: + +```json +{"index": 0, "source": ["(\"<|start_header_id|>user<|end_header_id|>\\n\\nAnswer the following question.\\n\\nQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\\nAnswer: <|eot_id|> <|start_header_id|>assistant<|end_header_id|>\",)"], "raw_prediction": "\n\nLet's break this down step by step!\n\nJanet's ducks lay 16 eggs per day.\nShe eats 3 eggs for breakfast, so that leaves 16 - 3 = 13 eggs.\nShe bakes muffins with 4 eggs, so that leaves 13 - 4 = 9 eggs.\nShe sells the remaining 9 eggs at the farmers' market.\n\nEach egg sells for $2, so she makes:\n9 eggs x $2 per egg = $18\n\nJanet makes $18 every day at the farmers' market.", "reference": "18"} +... +``` + +You can inspect the evaluation results file to see whether the model is generating normally. + +1. If the `raw_prediction` field is empty, the model is not generating any predictions. This might be because the model encountered a stop sequence in its output. You can check the `stop` field in the generation arguments and `default_stops` in the chat_templates configuration. + +2. If the `raw_prediction` field seems to be normal, you can check the `processed_prediction` field to see if the answer is being extracted correctly in the `post_processing` step. + +3. If the `raw_prediction` field keeps generating past the end of the expected answer, the stop sequence may not be configured correctly. You can check the `stop` field in the generation arguments and the chat_templates configuration. + +4. If the `reference` field is not formatted as expected, it may be that the dataset is not formatted correctly.
You can check that the `references` property in the dataset class is formatted correctly. + +5. If everything seems to be normal, you can check the `metric` field to see if the metrics are being calculated correctly, especially if the metric is complex. + +## Fixing the Problem + +If you have located the problem, you can try to fix it by following the steps below. + +### Checking the `stop` Generation Argument + +The `stop` argument is a list of strings; the model stops generating once it encounters any of them. You can check the `stop` field in the log to see if the model is configured correctly. + +**HuggingFace Models:** + +```text +2024-06-15 19:30:19 INFO batch_sampler.py:38 Evaluating generation on mt_bench (model_attr={'model_type': 'chat', 'model_backend': 'huggingface', 'model_max_input': 8192, 'model_max_input_and_output': 8192, 'multi_turn': True}, generation_kwargs={'max_new_tokens': 1024, 'stopping_criteria': [KeyWordsCriteria(stop_sequences=[[128009]])], 'pad_token_id': 128000, 'eos_token_id': 128001}, num_shots=0, len=1, num_instances=1, use_cache=False) +``` + +We convert the `stop` field to a list of token ids in the `stopping_criteria` field. In the above example, the stop sequence is `[128009]`, which corresponds to the `<|eot_id|>` token. + +**vLLM Models:** + +```text +2024-06-15 20:10:33 INFO batch_sampler.py:38 Evaluating generation on mt_bench (model_attr={'model_type': 'chat', 'model_backend': 'vllm'}, generation_kwargs=SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=['<|eot_id|>'], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=1024, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None), num_shots=0, len=80, num_instances=80, use_cache=False) +``` + +LLaMA-3's default stops are `['<|eot_id|>']`. + + +**API Models:** + +The following model does not use `stop`: + +```text +2024-06-15 20:35:50 INFO batch_sampler.py:38 Evaluating generation on mt_bench (model_attr={'model_type': 'chat', 'model_backend': 'openai', 'multi_turn': True}, generation_kwargs={'max_tokens': 4096, 'seed': 2023}, num_shots=0, len=1, num_instances=1, use_cache=False) +``` + +While the following one uses `stop`: + +```text +2024-06-15 20:39:37 INFO batch_sampler.py:38 Evaluating generation on drop (model_attr={'model_type': 'chat', 'model_backend': 'openai'}, generation_kwargs={'max_tokens': 64, 'seed': 2023, 'stop': ['\n'], 'temperature': 0}, num_shots=0, len=1, num_instances=1, use_cache=False) +``` + +**`stop` might be set in the following places:** + +1. In the `init_arguments` method or a class variable of the dataset class + +2. In the `stop` command-line argument + +3. In the chat template's `default_stops` + +4. In the `transform` validation of generation arguments (Anthropic models do not support a whitespace stop) + +### Checking the Chat Template Configuration + +If you are using an instruction-tuned model, you need a chat template to prompt the model correctly. Different models may require different chat templates. + +Currently we support 7 chat templates: `base` (default), `llama3`, `chatml`, `llama2`, `zephyr`, `phi3`, and `alpaca`. This offers more fine-grained control over the chat format. For example, the built-in `llama3` template is configured as follows:
+ +```python +"llama3": { + "system_start": "<|start_header_id|>system<|end_header_id|>\n\n", + "system_end": "<|eot_id|>", + "user_start": "<|start_header_id|>user<|end_header_id|>\n\n", + "user_end": "<|eot_id|>", + "assistant_start": "<|start_header_id|>assistant<|end_header_id|>\n\n", + "assistant_end": "<|eot_id|>", + "auto_leading_space": True, + "default_stops": ["<|eot_id|>"], +} +``` + +When loading a chat-based model, i.e. setting `--model_type chat`, we try to match the model with a chat template by the model's name. For example, the `Meta-Llama3-8B-Instruct` model will be matched with the `llama3` chat template. + +You can check that the chat template is correctly loaded in the log: + +```text +2024-06-15 20:39:37 INFO Automatically set chat_template to llama3. +``` + +If the chat template is not loaded correctly, you can set it manually with the `--chat_template` command-line argument: + +```bash +python inference.py -m internlm/internlm2-chat-7b -d gsm8k --chat_template chatml +``` + +If the chat format is not supported by LLMBox, you can create a new chat template by extending the [`chat_templates.py`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/chat_templates.py) file. + +Alternatively, you can pass in a jinja2 template string, which is also compatible with HuggingFace's `tokenizers` library. + +### Checking the `default_stops` in the Chat Template + +In rare cases, you may want to modify the `default_stops` field in the chat template configuration. + +If the `default_stops` field prevents the model from generating output, you can try overriding the `default_stops` argument with an empty string: + +```bash +python inference.py -m Meta-Llama3-8B-Instruct -d gsm8k --default_stops "" +``` + +If you need to extend the `default_stops` field in the chat template configuration, pass the additional stop strings on the command line: + +```bash +python inference.py -m Meta-Llama3-8B-Instruct -d gsm8k --default_stops "<|eot_id|>" "<|start_header_id|>" +``` + +### Checking the `post_processing` Step + +The `post_processing` step extracts the answer from the model's output. If it is not configured correctly, the answer cannot be extracted reliably. + +You can first locate the dataset class in the [`utilization/dataset`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/dataset) folder and check its `post_processing` method: + +```python +class Drop(GenerationDataset): + + ... + + @staticmethod + def post_processing(predictions): + new_predictions = [] + pattern = r"[.!(\n)]" + for pred in predictions: + match = re.search(pattern, pred) + if match: + index = match.start() + pred = pred[:index] + new_predictions.append(pred) + return new_predictions +``` + +### Checking the `references` Property + +The `references` property in the dataset class provides the reference answers that the model outputs are checked against. If it is not formatted correctly, the metrics cannot be calculated correctly. + + +```python +class Drop(GenerationDataset): + + ... + + @cached_property + def references(self): + return [instance["answers_spans"]["spans"] for instance in self.evaluation_data] +``` + +### Checking the Metric Calculation + +```python +class Drop(GenerationDataset): + + metrics = [F1(force_number_match=True, word_tokenize="regex", align_bag="counter"), Em()] + + ...
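+    # the keyword arguments above (force_number_match, word_tokenize, align_bag) correspond to the F1.__init__ parameters shown below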
+``` + +If you found `processed_prediction` matches the `reference` field, but the metric is still not calculated correctly, you can check the metric calculation method in the dataset class. + + +```python +class F1(Metric): + + def __init__( + self, + *, + dataset: Literal["independent"] = "independent", + multiref_strategy: Literal["max", "leave_one_out"] = "max", + word_tokenize: Literal["nltk", "split", "regex"] = "nltk", + normalize_level: Literal["token", "text", "both"] = "both", + align_bag: Literal["counter", "set"] = "counter", + force_number_match=False, + ): + self.dataset = dataset + self.word_tokenize = _TOKENIZER_DICT[word_tokenize] + self.normalize_level = normalize_level + self.multiref_strategy = multiref_strategy + self.align_bag = align_bag + self.force_number_match = force_number_match + ... + +``` + +## In Closing + +If you still have any problems replicating an evaluation run, please feel free to reach out to us by [creating an issue](https://github.com/RUCAIBox/LLMBox/issue). + +You can attach the log file and evaluation results file to the issue, and we will help you locate the problem. diff --git a/docs/utilization/how-to-customize-dataset.md b/docs/utilization/how-to-customize-dataset.md index 7d81dcc7..480dec3b 100644 --- a/docs/utilization/how-to-customize-dataset.md +++ b/docs/utilization/how-to-customize-dataset.md @@ -79,10 +79,34 @@ class MyDataset(Dataset): self.example_data = load_raw_dataset_from_file("examples.json") ``` -## Formating the instances +## Formatting the instances Then, format the instance by implementing the `instruction` attribute and `format_instance` method. The returned instance should be a dictionary with keys accessible in the instruction string. Both `jinja2` and `f-string` are supported for the instruction string. +```mermaid +flowchart LR + subgraph MyDataset + exam[("example\ndata")] + eval[("evaluation\ndata")] + in["instruction"] + fex(["format_instance"]) + fev(["format_instance"]) + end + + fexam["formatted\nexample\ndata"] + feval["formatted\nevaluation\ndata"] + c([construct_instances]) + eval --> fev + in -->|example_idx>=0| fex + in -->|turn_idx>=0| fev + exam --> fex + fex --> fexam + fev --> feval + feval -->|"construct\ninstance"| c + fexam -->|num_shots| c + c --> ei["evaluation\ninstance"] +``` + The following table shows the keys that should be returned from the `format_instance` method and the keys that can be accessed in the instruction string: @@ -115,6 +139,7 @@ The following table shows the keys that should be returned from the `format_inst + @@ -139,11 +164,28 @@ The following table shows the keys that should be returned from the `format_inst - + + + + + + + + + + + + + + + + @@ -165,13 +207,6 @@ The following table shows the keys that should be returned from the `format_inst
Display name of current dataset
-  source_idx | int | Index of correct context (winogrande)
   options | int | No need to return from format_instance
+  num_turns | int | No need to return from format_instance | Total turns of multi-turn conversation (default: 0)
+  turn_idx | int | No need to return from format_instance | evaluation_data: 0..num_turns-1 (inclusive); example_data: -1
+  real_num_shots | int | The num shots formatted as examples
   example_idx | int |
-    The index of examples if greater or equal than 0. Equal to -1 if it is not formatting an example
+    evaluation_data: -1; example_data: 0..real_num_shots-1 (inclusive)
-  real_num_shots | int | The num shots formatted as examples
@@ -244,7 +279,7 @@ And in `get_prob` method, this will generate one evaluation instance: ] ``` -### Formating Multiple-Context Dataset: +### Formatting Multiple-Context Dataset: For datasets with multiple contexts (like `winogrande`), you can format the instance as follows: @@ -270,7 +305,7 @@ which generates two evaluation instances: ] ``` -### Formating Generation Dataset: +### Formatting Generation Dataset: ```python INSTRUCTION = """Answer the following question. @@ -300,7 +335,30 @@ class MyDataset(GenerationDataset): To evaluate a pre-trained model that lacks instruction-following capabilities, you can provide an instruction explicitly by assigning a completion instruction `NO_INSTRUCTION = "{question}"` to the model, or pass an argument `--instruction "{question}"` in the command line. -## Formating the references +### Formatting Multi-Turn Dataset: + +For multi-turn datasets (like `mt_bench`), you can format the instance as follows: + +```python +class MyDataset(GenerationDataset): + + instruction = "{{source[turn_idx]}}" + metrics = [GPTEval(multi_turn=False)] + multi_turn = True + ... + + def format_instance(self, instance): + return dict( + source=[instance["question_1"], instance["question_2"]] + ) + + def post_processing(self, predictions: List[Tuple[str, ...]]): + # By default, model outputs a tuple of predictions for each turn + # You can implement the post_processing method to return the last turn's assistant output + return [pred[-1] for pred in predictions] +``` + +## Formatting the references You can implement the `references` method to return the reference answers for evaluation. diff --git a/docs/utilization/how-to-customize-model.md b/docs/utilization/how-to-customize-model.md index b516d1d7..6f638fb3 100644 --- a/docs/utilization/how-to-customize-model.md +++ b/docs/utilization/how-to-customize-model.md @@ -25,4 +25,12 @@ class NewModel(Model): return results ``` -And then, you should register your model in the [`load`](https://github.com/RUCAIBox/LLMBox/tree/main/utilization/model/load.py) file. 
+And then, you should register your model with `register_model` decorator: + +```python +@register_model(model_backend="new_provider") +def load_new_model(args: "ModelArguments"): + logger.info(f"Loading OpenAI API model `{args.model_name_or_path}`.") + + return NewModel(args) +``` diff --git a/tests/requirements-tests.txt b/tests/requirements-tests.txt index 18438e04..a9fd23f6 100644 --- a/tests/requirements-tests.txt +++ b/tests/requirements-tests.txt @@ -37,3 +37,4 @@ language_data google-api-python-client immutabledict langdetect +scipy diff --git a/tests/utilization/dataset/test_formatting.py b/tests/utilization/dataset/test_formatting.py index decd71c5..8e173ba2 100644 --- a/tests/utilization/dataset/test_formatting.py +++ b/tests/utilization/dataset/test_formatting.py @@ -385,3 +385,41 @@ def test_formatting(get_dataset, dataset_name, num_shots, cot, ranking_type): formatted_instance = "".join(p for p in formatted_instance if isinstance(p, str)) assert formatted_instance == eval_instance + + +def test_multi_turn(get_dataset): + dataset = get_dataset(dataset_name="mt_bench", num_shots=0, ranking_type="generation") + + assert dataset.evaluation_instances[0].messages == [{ + "content": + "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.", + "role": "user", + }] + dataset.evaluation_instances[0].add_multi_turn(assistant="The fake model response.") + assert dataset.evaluation_instances[0].messages == [{ + "content": + "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.", + "role": "user", + }, { + "content": "The fake model response.", + "role": "assistant", + }, { + "content": "Rewrite your previous response. Start every sentence with the letter A.", + "role": "user", + }] + + dataset.evaluation_instances[0].add_multi_turn(assistant="A fake model response.") + assert dataset.evaluation_instances[0].messages == [{ + "content": + "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.", + "role": "user", + }, { + "content": "The fake model response.", + "role": "assistant", + }, { + "content": "Rewrite your previous response. Start every sentence with the letter A.", + "role": "user", + }, { + "content": "A fake model response.", + "role": "assistant", + }] diff --git a/tests/utilization/model/test_apply_prompt_template.py b/tests/utilization/model/test_apply_prompt_template.py index 00d3d708..77ee3170 100644 --- a/tests/utilization/model/test_apply_prompt_template.py +++ b/tests/utilization/model/test_apply_prompt_template.py @@ -39,7 +39,7 @@ def test_no_smart_space(conversation: Conversation): "assistant_start": "", "assistant_end": "", "auto_leading_space": False, - "default_stops": [], + "default_stop": [], } formatter = ConversationFormatter(prompt_config, DEFAULT_CHAT_TEMPLATE) conversation.set_formatter(formatter) @@ -58,7 +58,7 @@ def test_smart_space(conversation: Conversation): "assistant_start": "", "assistant_end": "", "auto_leading_space": True, - "default_stops": [], + "default_stop": [], } formatter = ConversationFormatter(prompt_config, DEFAULT_CHAT_TEMPLATE) conversation[2]["content"] = " This is an assistant message." 
# extra leading space @@ -80,7 +80,7 @@ def test_final_strip(conversation: Conversation): "auto_leading_space": True, "final_lstrip": False, "final_rstrip": False, - "default_stops": [], + "default_stop": [], } formatter = ConversationFormatter(prompt_config, DEFAULT_CHAT_TEMPLATE) conversation.set_formatter(formatter) diff --git a/tests/utilization/utils/test_parse_arguments.py b/tests/utilization/utils/test_parse_arguments.py new file mode 100644 index 00000000..32c0d132 --- /dev/null +++ b/tests/utilization/utils/test_parse_arguments.py @@ -0,0 +1,38 @@ +import sys + +from ..fixtures import * + +sys.path.append('.') +from utilization.utils.arguments import parse_argument + + +def test_default_vllm(): + model_args, dataset_args, evaluation_args = parse_argument(['-m', 'a-random-fake-model', '-d', 'nq', 'quac']) + assert model_args.model_backend == "vllm" + assert model_args.prefix_caching is False + + +def test_no_prefix_caching(): + # currently vllm doesn't support returning logprob for prefix caching + model_args, dataset_args, evaluation_args = parse_argument([ + '-m', 'a-random-fake-model', '-d', 'nq', 'mmlu', '-b', '1' + ]) + assert model_args.model_backend == "huggingface" + assert model_args.prefix_caching is False + + +def test_default_prefix_caching(): + # currently vllm doesn't support returning logprob for prefix caching + model_args, dataset_args, evaluation_args = parse_argument([ + '-m', 'a-random-fake-model', '-d', 'nq', 'mmlu', '-b', '16' + ]) + assert model_args.model_backend == "huggingface" + assert model_args.prefix_caching is True + + +def test_default_no_efficient(): + model_args, dataset_args, evaluation_args = parse_argument([ + '-m', 'a-random-fake-model', '-d', 'nq', '--vllm', 'False', '--prefix_caching', 'False' + ]) + assert model_args.model_backend == "huggingface" + assert model_args.prefix_caching is False diff --git a/utilization/__init__.py b/utilization/__init__.py index d3fe9a57..0ce133ac 100644 --- a/utilization/__init__.py +++ b/utilization/__init__.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional # this file only initializes .utils modules to avoid early import of torch +from .load_model import register_model from .utils import DatasetArguments, EvaluationArguments, ModelArguments, parse_argument if TYPE_CHECKING: @@ -48,5 +49,6 @@ def _register_dataset_class(cls): __all__ = [ - "get_evaluator", "parse_argument", "ModelArguments", "DatasetArguments", "EvaluationArguments", "register_dataset" + "get_evaluator", "parse_argument", "ModelArguments", "DatasetArguments", "EvaluationArguments", "register_dataset", + "register_model" ] diff --git a/utilization/chat_templates.py b/utilization/chat_templates.py index d0605d32..aba8727b 100644 --- a/utilization/chat_templates.py +++ b/utilization/chat_templates.py @@ -1,20 +1,43 @@ -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union __all__ = ["DEFAULT_CHAT_TEMPLATE", "DEFAULT_CHAT_CONFIGS", "add_space", "smart_space"] -def add_space(msg: str, auto_leading_space: bool, context: str) -> str: - if auto_leading_space and msg and context and not context[-1].isspace() and not msg[0].isspace(): +def add_space( + msg: str, + context: str, + auto_leading_space: bool = True, + remove_space_between: bool = True, + starts: Optional[List[str]] = None, + ends: Optional[List[str]] = None +) -> str: + if starts is None or ends is None or remove_space_between is False: + context_ends_special = False + msg_starts_special = False + else: + 
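+        # check whether the context already ends with a role-end token and the next message starts with a role-start token (e.g. '<|eot_id|>' followed by '<|start_header_id|>'); if so, skip the automatic leading space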
context_ends_special = any(context.endswith(e) for e in ends if len(e) > 0) + msg_starts_special = any(msg.startswith(s) for s in starts if len(s) > 0) + if (auto_leading_space and msg and context)\ + and not (context[-1].isspace() or msg[0].isspace())\ + and not (context_ends_special and msg_starts_special): return ' ' + msg return msg -def smart_space(parts: List[str], auto_leading_space) -> str: - +def smart_space(parts: List[str], auto_leading_space: bool, remove_space_between: bool, seq: List[str]) -> str: + starts = [seq[role + "_start"] for role in ["system", "user", "assistant"]] + ends = [seq[role + "_end"] for role in ["system", "user", "assistant"]] rendered = "" for part in parts: if part: - rendered += add_space(part, auto_leading_space, rendered) + rendered += add_space( + part, + rendered, + auto_leading_space=auto_leading_space, + remove_space_between=remove_space_between, + starts=starts, + ends=ends + ) return rendered @@ -37,7 +60,7 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "{%- set data.parts = data.parts + [seq['assistant_start']] -%}" "{%- endif -%}" "" - "{{ data.parts | smart_space(auto_leading_space) }}" + "{{ data.parts | smart_space(auto_leading_space, remove_space_between, seq) }}" ) # Chat configs format: @@ -58,7 +81,7 @@ def smart_space(parts: List[str], auto_leading_space) -> str: # - assistant_end: The string to append to the assistant message. # - auto_leading_space: Whether to add a leading space when concatenating two # strings if the first string does not end with a whitespace. -# - default_stops: A list of strings that indicate the end of a message. +# - default_stop: A list of strings that indicate the end of a message. # DEFAULT_CHAT_CONFIGS: Dict[str, Union[Dict[str, Any], str]] = { "base": { @@ -70,7 +93,8 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "assistant_end": "\n\n", "auto_leading_space": True, "final_rstrip": True, - "default_stops": [], + "remove_space_between": False, + "default_stop": [], }, "llama2": { "all_start": "[INST] ", @@ -82,7 +106,8 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "assistant_end": " [INST] ", "auto_leading_space": True, "final_rstrip": False, - "default_stops": [], + "remove_space_between": True, + "default_stop": [], }, "chatml": { "system_start": "<|im_start|>system\n", @@ -93,7 +118,8 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "assistant_end": "<|im_end|>\n", "auto_leading_space": True, "final_rstrip": False, - "default_stops": ["<|im_end|>"], + "remove_space_between": True, + "default_stop": ["<|im_end|>"], }, "zephyr": { "system_start": "<|system|>\n", @@ -104,7 +130,8 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "assistant_end": "\n", "auto_leading_space": True, "final_rstrip": False, - "default_stops": [""], + "remove_space_between": True, + "default_stop": [""], }, "phi3": { "system_start": "<|system|>\n", @@ -115,7 +142,8 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "assistant_end": "<|end|>\n", "auto_leading_space": True, "final_rstrip": False, - "default_stops": ["<|end|>"], + "remove_space_between": True, + "default_stop": ["<|end|>"], }, "llama3": { "system_start": "<|start_header_id|>system<|end_header_id|>\n\n", @@ -126,7 +154,8 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "assistant_end": "<|eot_id|>", "auto_leading_space": True, "final_rstrip": False, - "default_stops": ["<|eot_id|>"], + "remove_space_between": True, + "default_stop": ["<|eot_id|>"], }, 
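+    # alpaca (like base) delimits roles with plain text rather than special tokens, so remove_space_between stays False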
"alpaca": { "system_start": "### Input:\n", @@ -137,6 +166,7 @@ def smart_space(parts: List[str], auto_leading_space) -> str: "assistant_end": "\n\n", "auto_leading_space": True, "final_rstrip": False, - "default_stops": ["###"], + "remove_space_between": False, + "default_stop": ["###"], } } diff --git a/utilization/dataset/dataset.py b/utilization/dataset/dataset.py index 31d89f4a..f01fea16 100644 --- a/utilization/dataset/dataset.py +++ b/utilization/dataset/dataset.py @@ -27,7 +27,7 @@ # solve the circular import from ..metric.metric import Metric from ..model.model import Model - from ..utils import DatasetArguments + from ..utils import DatasetArguments, EvaluationArguments, ModelArguments _InputsWithOptionNum = Union[List[Tuple[str, int]], List[Tuple[str, str, int]], List[Tuple[str, str, str, int]]] """Instance format for the `get_prob` model evaluation method. The tuple contains the source text and the number of options. If prefix_caching is enabled, the source text will be segmented into prefixes.""" @@ -224,14 +224,11 @@ def __iter__(self): yield from self.evaluation_instances def format_instance(self, instance: dict) -> dict: - r"""Format the dataset instance into task source text, target text, and options (for ranking). + r"""Format the dataset instance into task format. See [docs](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-customize-dataset.md#formating-the-instances) for more details. Notes: The instance should not be mutated since the function might be called for multiple times when formatting examples. - Args: - instance (Dict): an instance dict of multiple key-value pairs. - Returns: A dictionary with the following keys: @@ -313,14 +310,14 @@ def _init_arguments(self): if self.model.args.api_endpoint is not None: model_endpoint = self.model.args.model_backend + "/" + self.model.args.api_endpoint if model_endpoint in ENDPOINT_ARGS: - endpoint_args = ENDPOINT_ARGS[model_endpoint] + endpoint_schema = ENDPOINT_ARGS[model_endpoint] methods = ["get_ppl", "get_prob", "generation"] requireds = [ ("echo", "max_tokens", "logprobs"), ("max_tokens", "temperature", "logit_bias"), ("max_tokens", "temperature"), ] - support = [m for m, r in zip(methods, requireds) if all(a in endpoint_args for a in r)] + support = [m for m, r in zip(methods, requireds) if all(a in endpoint_schema for a in r)] if self.model_evaluation_method not in support: warn_once( logger, @@ -356,11 +353,11 @@ def _init_arguments(self): self._extra_model_args = deepcopy(self.extra_model_args) # apply chat template - if self.conversation_formatter.default_stops: + if self.conversation_formatter.default_stop: if "stop" not in self._extra_model_args: self._extra_model_args["stop"] = [] - self._extra_model_args["stop"].extend(self.conversation_formatter.default_stops) - logger.debug(f"Chat template stops: {self.conversation_formatter.default_stops}") + self._extra_model_args["stop"].extend(self.conversation_formatter.default_stop) + logger.debug(f"Chat template stops: {self.conversation_formatter.default_stop}") # temperature if self.sample_num > 1 and self._extra_model_args.get("temperature", 0) == 0: @@ -369,17 +366,13 @@ def _init_arguments(self): f"Self-consistency only supports generation with temperature > 0, automatically set temperature = 1." ) - if self.extra_model_args.get("multi_turn") and self.model.model_backend == "vllm": - raise ValueError( - f"We do not support multi-turn generation using vllm currently. Please set --vllm to False." 
- ) - if self.use_normalization and self.model_evaluation_method == "get_ppl": logger.warning("Normalization is only supported for PPL evaluation.") if self.multi_turn: assert self.model_evaluation_method == "generation", "Multi-turn is only supported for generation evaluation." + assert "multi_turn" not in self._extra_model_args, "Use `multi_turn` attribute instead of `multi_turn` in `extra_model_args`." self._extra_model_args["multi_turn"] = self.multi_turn logger.info(self.model.args) @@ -416,13 +409,13 @@ def load_raw_dataset( def construct_instances(self): r"""Construct and format all the instances of `evaluation_data`. - 1. Format the example data with `format_instance` - 2. Format the evaluation data with `format_instance` - 3. For each instance - evaluation instance = instruction + examples + instance: - 1. Dynamically construct the instruction and examples with `construct_instance` - 2. Construct final `evaluation_instances` and `option_nums` based on the model evaluation method. - 4. Apply self-consistency if needed. + 1. `format_instance`: Format the example data into dictionaries + 2. `format_instance`: Format the evaluation data into dictionaries + 3. `generate_ape` if needed. + 4. For each evalution instance + 1. `construct_instance`: Construct `Conversation` format for each evaluation instance and examples + 2. Apply normalization if needed. + 5. Apply self-consistency if needed. Returns: List[str]: The list of final formatted instances. @@ -555,6 +548,8 @@ def _format_instance( formatted_instance["source_idx"] = source_idx formatted_instance["target_idx"] = target_idx formatted_instance["example_idx"] = example_idx + formatted_instance["turn_idx"] = 0 + formatted_instance["num_turns"] = 1 dataset_extensions = ["dataset_name", "subset_name", "display_name", "real_num_shots"] for key in dataset_extensions: if key in formatted_instance and formatted_instance[key] != getattr(self, key): @@ -569,6 +564,16 @@ def _format_instance( source = self.instruction_template.render(formatted_instance) else: source = self.instruction.format_map(formatted_instance) + elif self.multi_turn: + formatted_instance["num_turns"] = len(source) + new_source: List[str] = [] + for turn_idx in range(formatted_instance["num_turns"]): + formatted_instance["turn_idx"] = turn_idx + if self.instruction_template.debug_info: + new_source.append(self.instruction_template.render(formatted_instance)) + else: + new_source.append(self.instruction.format_map(formatted_instance)) + source = new_source return {"source": source, "target": target, "options": options} @@ -576,7 +581,7 @@ def construct_instance( self, instance: Dict[str, typing.Any], ) -> Tuple[List[Conversation], int]: - r"""Format one instance with the instruction and demonstration. + r"""Construct the final formatted Conversation instance for evaluation. Args: instance (dict): the pre-formatted source. @@ -585,7 +590,7 @@ def construct_instance( Union[str, List[str]]: The final formatted instance. Return a list of formatted instances if the source is a list (in cases like winogrande). """ - if self.model_type in {"chat"} and self.system_prompt: + if self.model_type == "chat" and self.system_prompt: convers = Conversation([{"role": "system", "content": self.system_prompt}]) else: convers = Conversation() @@ -599,6 +604,7 @@ def construct_instance( else: # FIXME new example format for quac, squad logger.warning(f"{self.display_name} has legacy examples format. 
Skipping the examples.") + option_num = len(instance["options"]) if instance.get("options", None) else 1 if isinstance(instance["source"], list): if self.model_evaluation_method == "get_ppl": @@ -899,6 +905,13 @@ def _split_by_subset( yield {k: v[st:st + dlen] for k, v in obj.items()} st += dlen + def setup_metrics( + self, model_args: "ModelArguments", dataset_args: "DatasetArguments", evaluation_args: "EvaluationArguments" + ): + for d in self._datasets: + for m in d.metrics: + m.setup_metric(model_args, dataset_args, evaluation_args, d) + def log_final_results( self, raw_predictions: List[str], @@ -988,7 +1001,7 @@ def get_batch_sampler(self, reload_tokenizer: bool = False): if reload_tokenizer: self._datasets[0].model._remove_tokenizer() return DatasetCollectionBatchSampler( - self, self.args.batch_size, self._datasets[0].model.model_backend == "vllm", self.args.auto_batch_size + self, self.args.batch_size, self._datasets[0].model.is_vllm_model(), self.args.auto_batch_size ) def step( diff --git a/utilization/dataset/gaokao.py b/utilization/dataset/gaokao.py index 966d763f..233cdd95 100644 --- a/utilization/dataset/gaokao.py +++ b/utilization/dataset/gaokao.py @@ -3,7 +3,7 @@ from logging import getLogger from ..dataset_enum import GAOKAO_TASKS -from ..metric import Gaokao_bench_metric +from ..metric import GaokaoBenchMetric from .generation_dataset import GenerationDataset logger = getLogger(__name__) @@ -59,7 +59,7 @@ class Gaokao(GenerationDataset): example_set = None evaluation_set = "test" load_args = ("RUCAIBox/gaokao-bench",) - metrics = [Gaokao_bench_metric()] + metrics = [GaokaoBenchMetric()] categorized_subsets = None # weighted average score def init_arguments(self): diff --git a/utilization/dataset/gpqa.py b/utilization/dataset/gpqa.py index b5195f68..38eb2514 100644 --- a/utilization/dataset/gpqa.py +++ b/utilization/dataset/gpqa.py @@ -17,9 +17,15 @@ """What is the correct answer to this question: {{ question }}{{ '\n\nChoices:\n' + options if options }}\n\nFormat your response as follows: "The correct answer is (insert answer here)".""", "five_shot_cot": """{{ 'Here are some example questions from experts. An explanation is given before the final answer. Answer the final question yourself, giving your reasoning beforehand.\n' if example_idx == 0 }}Question: {{ question }}{{ '\nChoices:\n' + options if options }}\n{{ 'The correct answer is' if example_idx >= 0 else ('Give step by step reasoning before you answer, and when you' + "'" + 're ready to answer, please use the format "The correct answer is (insert answer here)":\n') }}""", - "zero_shot_cot": - """What is the correct answer to this question: {{ question }}{{ '\n\nChoices:\n' + options if options }}\nLet's think step by step: __SEPARATOR__\n\nBased on the above, what is the single, most likely answer choice? Answer in the format "The correct answer is (insert answer here)".""", - "retrieval": "{{ bing }}\n\nQuestion: {{ question }}{{ '\n\nChoices:\n' + options if options }}" + "zero_shot_cot": ( + "{% if turn_idx == 0 %}" + "What is the correct answer to this question: {{ question }}{{ '\n\nChoices:\n' + options if options }}\nLet's think step by step:" + "{% elif turn_idx == 1 %}" + '\n\nBased on the above, what is the single, most likely answer choice? Answer in the format "The correct answer is (insert answer here)".' 
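+        # turn_idx is provided by the multi-turn pipeline (multi_turn = True is set in init_arguments below): turn 0 elicits step-by-step reasoning, turn 1 asks for the final answer choice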
+ "{% endif %}" + ), + "retrieval": + "{{ bing }}\n\nQuestion: {{ question }}{{ '\n\nChoices:\n' + options if options }}" } @@ -48,7 +54,7 @@ def init_arguments(self): self.max_num_shots = 0 if self.max_num_shots == 0 and self.cot == "base": - self.extra_model_args["multi_turn"] = True + self.multi_turn = True if self.max_num_shots != 0 and self.max_num_shots != 5: logger.warning("Only 0-shot and 5-shot are supported for GPQA, but got %d-shot.", self.max_num_shots) diff --git a/utilization/dataset/humaneval.py b/utilization/dataset/humaneval.py index b937daf6..8ab25b97 100644 --- a/utilization/dataset/humaneval.py +++ b/utilization/dataset/humaneval.py @@ -42,9 +42,6 @@ class Humaneval(GenerationDataset): extra_model_args = dict(max_tokens=512, temperature=0.1) metrics = [PassAtK()] - def init_arguments(self): - self.metrics[0].set_k(k=self.args.pass_at_k) - def format_instance(self, instance): source_text = instance["prompt"].strip() target_text = instance["canonical_solution"] diff --git a/utilization/dataset/lambada.py b/utilization/dataset/lambada.py index 5cc7b6f6..00d95415 100644 --- a/utilization/dataset/lambada.py +++ b/utilization/dataset/lambada.py @@ -1,6 +1,6 @@ from functools import cached_property -from ..metric import Word_Accuracy +from ..metric import WordAccuracy from .generation_dataset import GenerationDataset @@ -20,9 +20,7 @@ class Lambada(GenerationDataset): example_set = None load_args = ("EleutherAI/lambada_openai", "default") extra_model_args = dict(max_tokens=5, temperature=0) - - def init_arguments(self): - self.metrics = [Word_Accuracy(self.tokenizer)] + metrics = [WordAccuracy()] def format_instance(self, instance): diff --git a/utilization/dataset/mbpp.py b/utilization/dataset/mbpp.py index 0f9b143c..89360af4 100644 --- a/utilization/dataset/mbpp.py +++ b/utilization/dataset/mbpp.py @@ -44,9 +44,6 @@ class Mbpp(GenerationDataset): extra_model_args = dict(stop=['\n[DONE]'], temperature=0.1) metrics = [PassAtK()] - def init_arguments(self): - self.metrics[0].set_k(k=self.args.pass_at_k) - def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_set): super().load_raw_dataset(dataset_path, subset_name, evaluation_set, example_set) self.example_data = EXAMPLARS diff --git a/utilization/dataset/mt_bench.py b/utilization/dataset/mt_bench.py index d79c0b9f..fa599bee 100644 --- a/utilization/dataset/mt_bench.py +++ b/utilization/dataset/mt_bench.py @@ -16,16 +16,12 @@ class Mt_bench(GenerationDataset): reference: ["You are in second place.", "Uncertain."] """ - instruction = "{source}" + instruction = "{{source[turn_idx]}}" example_set = None evaluation_set = "train" load_args = ("HuggingFaceH4/mt_bench_prompts",) metrics = [GPTEval(multi_turn=True, type="single")] - extra_model_args = {"multi_turn": True} - - def init_arguments(self): - # TODO add prefix caching - self.prefix_caching = False + multi_turn = True def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_set): super().load_raw_dataset(dataset_path, subset_name, evaluation_set, example_set) @@ -38,12 +34,7 @@ def load_raw_dataset(self, dataset_path, subset_name, evaluation_set, example_se self.evaluation_data = new_evaluation_data def format_instance(self, instance): - # TODO return a list of questions instead of using __SEPARATOR__ - return dict( - source=instance["question_1"].strip() + "__SEPARATOR__" + instance["question_2"].strip(), - target="", - num_turns=2, - ) + return dict(source=[instance["question_1"].strip(), instance["question_2"].strip()]) 
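Aside (not part of the diff): a minimal sketch of how the reworked mt_bench formatting above is expected to behave. `format_instance` now returns `source` as a list of user turns, and the per-turn instruction `"{{source[turn_idx]}}"` is rendered once for each `turn_idx` value set by `_format_instance`. Jinja2 rendering and the placeholder questions below are assumptions for illustration, not code from the repository.

from jinja2 import Template  # assumption: templates are rendered with Jinja2, as the {{ ... }} syntax suggests

instruction = Template("{{source[turn_idx]}}")
formatted_instance = {
    "source": [
        "Hypothetical question for turn 1.",
        "Hypothetical follow-up question for turn 2.",
    ],
}

# Render the instruction once per turn, mirroring the loop added in `_format_instance`.
per_turn_sources = [
    instruction.render(**formatted_instance, turn_idx=turn_idx)
    for turn_idx in range(len(formatted_instance["source"]))
]
assert per_turn_sources == formatted_instance["source"]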
@cached_property def references(self): diff --git a/utilization/dataset/real_toxicity_prompts.py b/utilization/dataset/real_toxicity_prompts.py index 4a0d4c1e..f17e1eb0 100644 --- a/utilization/dataset/real_toxicity_prompts.py +++ b/utilization/dataset/real_toxicity_prompts.py @@ -1,6 +1,6 @@ from functools import cached_property -from ..metric import Perspective_api +from ..metric import PerspectiveApi from .generation_dataset import GenerationDataset @@ -22,9 +22,7 @@ class Real_toxicity_prompts(GenerationDataset): evaluation_set = "train" load_args = ("allenai/real-toxicity-prompts",) extra_model_args = dict(temperature=0, stop='\n\n') - - def init_arguments(self): - self.metrics = [Perspective_api(api_key=self.args.perspective_api_key, proxy_port=self.args.proxy_port)] + metrics = [PerspectiveApi()] def format_instance(self, instance): return instance diff --git a/utilization/evaluator.py b/utilization/evaluator.py index bc1ec407..ff987742 100644 --- a/utilization/evaluator.py +++ b/utilization/evaluator.py @@ -8,7 +8,7 @@ from .utils.catch_error import catch_error from .utils.dynamic_stride_tqdm import dynamic_stride_tqdm from .utils.log_results import PredictionWriter -from .utils.logging import set_logging +from .utils.logging import log_once, set_logging from .utils.random import set_seed logger = getLogger(__name__) @@ -61,6 +61,7 @@ def __init__( evaluation_data=evaluation_data, example_data=example_data, ) + self.dataset.setup_metrics(self.model_args, self.dataset_args, self.evaluation_args) self.writer.write_metainfo(self.model_args, self.dataset_args, self.evaluation_args) @catch_error(True) @@ -107,11 +108,14 @@ def evaluate(self) -> Dict[str, Dict[str, float]]: if self.evaluation_args.continue_from: raw_predictions.extend(self.writer.load_continue()) for batch in dataloader: + log_once(logger.debug, f"batch_size {len(batch)}, first instance in batch:\n{batch[0]}", "fbi") batch_results = call_model(batch) if len(batch) != len(batch_results) and len(batch_results) != 0: raise RuntimeError( f"The number of results {len(batch_results)} should be equal to the number of samples in the batch {len(batch)}." 
) + if len(batch_results) > 0: + log_once(logger.debug, f"first output in batch:\n{batch_results[0]}", "fbo") raw_predictions.extend(batch_results) self.dataset.step(self.writer, dataloader, batch_results) diff --git a/utilization/load_dataset.py b/utilization/load_dataset.py index cb9e52d7..2c8f447d 100644 --- a/utilization/load_dataset.py +++ b/utilization/load_dataset.py @@ -348,8 +348,9 @@ def load_datasets( ) -> DatasetCollection: # batch size for vllm is set after model is loaded - if model.model_backend == "vllm": + if model.is_vllm_model(): args.batch_size = -1 + args.auto_batch_size = False logger.info("Setting batch_size to -1, since vllm can automatically planning the optimal batch and order.") if model.args.prefix_caching and model.model_backend != "huggingface": diff --git a/utilization/load_model.py b/utilization/load_model.py index bcb53279..9434e505 100644 --- a/utilization/load_model.py +++ b/utilization/load_model.py @@ -1,3 +1,4 @@ +from functools import wraps from logging import getLogger from typing import TYPE_CHECKING @@ -10,6 +11,81 @@ logger = getLogger(__name__) +__all__ = ["register_model", "load_model"] + +LOAD_REGISTERY = {} + + +def register_model(model_backend): + + def inner_decrator(fn): + LOAD_REGISTERY[model_backend] = fn + return fn + + return inner_decrator + + +@register_model(model_backend="openai") +def load_openai(args: "ModelArguments"): + logger.info(f"Loading OpenAI API model `{args.model_name_or_path}`.") + from .model.openai_model import Openai + + return Openai(args) + + +@register_model(model_backend="anthropic") +def load_anthropic(args: "ModelArguments"): + logger.info(f"Loading Anthropic API model `{args.model_name_or_path}`.") + from .model.anthropic_model import Anthropic + + return Anthropic(args) + + +@register_model(model_backend="dashscope") +def load_dashscope(args: "ModelArguments"): + logger.info(f"Loading Dashscope (Aliyun) API model `{args.model_name_or_path}`.") + from .model.dashscope_model import Dashscope + + return Dashscope(args) + + +@register_model(model_backend="qianfan") +def load_qianfan(args: "ModelArguments"): + logger.info(f"Loading Qianfan (Baidu) API model `{args.model_name_or_path}`.") + from .model.qianfan_model import Qianfan + + return Qianfan(args) + + +@register_model(model_backend="vllm") +def load_vllm(args: "ModelArguments"): + try: + import vllm + logger.debug(f"vllm version: {vllm.__version__}") + + from .model.vllm_model import vllmModel + + return vllmModel(args) + except ModuleNotFoundError: + logger.warning(f"vllm has not been installed, falling back.") + return None + except ValueError as e: + if "are not supported for now" in str(e): + logger.warning(f"vllm has not supported the architecture of {args.model_name_or_path} for now.") + return None + elif "divisible by tensor parallel size" in str(e): + raise ValueError(f"Set an appropriate tensor parallel size via CUDA_VISIBLE_DEVICES: {e}") + else: + raise e + + +@register_model(model_backend="huggingface") +def load_huggingface(args: "ModelArguments"): + logger.info(f"Loading HuggingFace model `{args.model_name_or_path}`.") + from .model.huggingface_model import HuggingFaceModel + + return HuggingFaceModel(args) + @catch_error() def load_model(args: "ModelArguments") -> "Model": @@ -21,48 +97,18 @@ def load_model(args: "ModelArguments") -> "Model": Returns: Model: Our class for model. 
""" - if args.is_openai_model(): - logger.info(f"Loading OpenAI API model `{args.model_name_or_path}`.") - from .model.openai_model import Openai - - return Openai(args) - elif args.is_anthropic_model(): - logger.info(f"Loading Anthropic API model `{args.model_name_or_path}`.") - from .model.anthropic_model import Anthropic - - return Anthropic(args) - elif args.is_dashscope_model(): - logger.info(f"Loading Dashscope (Aliyun) API model `{args.model_name_or_path}`.") - from .model.dashscope_model import Dashscope - - return Dashscope(args) - elif args.is_qianfan_model(): - logger.info(f"Loading Qianfan (Baidu) API model `{args.model_name_or_path}`.") - from .model.qianfan_model import Qianfan - - return Qianfan(args) + loads = args.model_backend + if loads not in LOAD_REGISTERY: + raise ValueError(f"Model backend `{loads}` is not supported.") + + if loads == "vllm": + loads = ["vllm", "huggingface"] else: - if args.vllm: - try: - import vllm - vllm.__version__ - - from .model.vllm_model import vllmModel - - return vllmModel(args) - except ModuleNotFoundError: - args.vllm = False - args.model_backend = "huggingface" - logger.warning(f"vllm has not been installed, falling back to huggingface.") - except ValueError as e: - if "are not supported for now" in str(e): - args.vllm = False - args.model_backend = "huggingface" - logger.warning(f"vllm has not supported the architecture of {args.model_name_or_path} for now.") - elif "divisible by tensor parallel size" in str(e): - raise ValueError(f"Set an appropriate tensor parallel size via CUDA_VISIBLE_DEVICES: {e}") - else: - raise e - from .model.huggingface_model import HuggingFaceModel - - return HuggingFaceModel(args) + loads = [loads] + + for load in loads: + model = LOAD_REGISTERY[load](args) + if model is not None: + return model + + raise ValueError(f"Model backend `{loads}` is not supported.") diff --git a/utilization/metric/__init__.py b/utilization/metric/__init__.py index 45ac4c0a..ecd91269 100644 --- a/utilization/metric/__init__.py +++ b/utilization/metric/__init__.py @@ -4,8 +4,8 @@ from .metric import Metric __all__ = [ - "avg_metrics", "Accuracy", "Bleu", "F1", "Em", "Gaokao_bench_metric", "GPTEval", "IFEval", "PassAtK", - "Perspective_api", "Rouge", "Word_Accuracy" + "avg_metrics", "Accuracy", "Bleu", "F1", "Em", "GaokaoBenchMetric", "GPTEval", "IFEval", "PassAtK", + "PerspectiveApi", "Rouge", "WordAccuracy" ] from .metric_utils import avg_metrics @@ -15,13 +15,13 @@ from .bleu import Bleu as _Bleu from .em_f1 import F1 as _F1 from .em_f1 import Em as _Em - from .gaokao_bench_metric import Gaokao_bench_metric as _Gaokao_bench_metric + from .gaokao_bench_metric import GaokaoBenchMetric as _GaokaoBenchMetric from .gpteval import GPTEval as _GPTEval from .ifeval import IFEval as _IFEval from .pass_at_k import PassAtK as _PassAtK - from .perspective_api import Perspective_api as _Perspective_api + from .perspective_api import PerspectiveApi as _PerspectiveApi from .rouge import Rouge as _Rouge - from .word_accuracy import Word_Accuracy as _Word_Accuracy + from .word_accuracy import WordAccuracy as _WordAccuracy def lazy_import(module, instance) -> Any: @@ -47,10 +47,10 @@ def __call__(self, *args, **kwargs): Bleu: Type["_Bleu"] = lazy_import("bleu", "Bleu") F1: Type["_F1"] = lazy_import("em_f1", "F1") Em: Type["_Em"] = lazy_import("em_f1", "Em") -Gaokao_bench_metric: Type["_Gaokao_bench_metric"] = lazy_import("gaokao_bench_metric", "Gaokao_bench_metric") +GaokaoBenchMetric: Type["_GaokaoBenchMetric"] = 
lazy_import("gaokao_bench_metric", "GaokaoBenchMetric") GPTEval: Type["_GPTEval"] = lazy_import("gpteval", "GPTEval") IFEval: Type["_IFEval"] = lazy_import("ifeval", "IFEval") PassAtK: Type["_PassAtK"] = lazy_import("pass_at_k", "PassAtK") -Perspective_api: Type["_Perspective_api"] = lazy_import("perspective_api", "Perspective_api") +PerspectiveApi: Type["_PerspectiveApi"] = lazy_import("perspective_api", "PerspectiveApi") Rouge: Type["_Rouge"] = lazy_import("rouge", "Rouge") -Word_Accuracy: Type["_Word_Accuracy"] = lazy_import("word_accuracy", "Word_Accuracy") +WordAccuracy: Type["_WordAccuracy"] = lazy_import("word_accuracy", "WordAccuracy") diff --git a/utilization/metric/em_f1.py b/utilization/metric/em_f1.py index 3fc6d612..aea83c2a 100644 --- a/utilization/metric/em_f1.py +++ b/utilization/metric/em_f1.py @@ -105,10 +105,11 @@ class F1(Metric): Args: `multiref_strategy`: Strategy to aggregate F1 scores for multiple references. - `force_number_match`: If reference contains numbers, prediction must matches all the numbers in the reference. `word_tokenize`: Tokenizer functions for different tokenization methods. Default: nltk.word_tokenize. DROP: https://github.com/EleutherAI/lm-evaluation-harness/blob/3196e907fa195b684470a913c7235ed7f08a4383/lm_eval/tasks/drop/utils.py#L193 SQuAD: https://github.com/huggingface/datasets/blob/f96e74d5c633cd5435dd526adb4a74631eb05c43/metrics/squad_v2/evaluate.py#L80 + `normalize_level`: Where to normalize the text. Default: both. + `align_bag`: How to align the bag of words. Default: counter. Return: "F1": float diff --git a/utilization/metric/gaokao_bench_metric.py b/utilization/metric/gaokao_bench_metric.py index afc8b984..7ea5a607 100644 --- a/utilization/metric/gaokao_bench_metric.py +++ b/utilization/metric/gaokao_bench_metric.py @@ -13,7 +13,7 @@ def multi_ref_aggregation(scores, multiref_strategy): return func(scores) -class Gaokao_bench_metric(Metric): +class GaokaoBenchMetric(Metric): r""" Calculate the Gaokao-Bench score. Return: diff --git a/utilization/metric/gpteval.py b/utilization/metric/gpteval.py index 6a393fb7..8d3c78ad 100644 --- a/utilization/metric/gpteval.py +++ b/utilization/metric/gpteval.py @@ -1,14 +1,21 @@ +import datetime +import os import re -import time from logging import getLogger -from typing import Literal +from typing import TYPE_CHECKING, Dict, List, Literal, Tuple import numpy as np import openai from tqdm import tqdm +from ..utils.log_results import PredictionWriter +from ..utils.logging import DEFAULT_DATETIME_FORMAT from .metric import Metric +if TYPE_CHECKING: + from ..dataset import Dataset + from ..utils.arguments import DatasetArguments, EvaluationArguments, ModelArguments + logger = getLogger(__name__) SINGLE_JUDGE_PROMPT = "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]" @@ -37,8 +44,9 @@ def __init__(self, multi_turn=False, type: Literal["single", "pairwise"] = "sing self.multi_turn = multi_turn self.type = type + self.gpteval_model = "gpt-3.5-turbo" self.model_args = ModelArguments( - model_name_or_path="gpt-3.5-turbo", # use it to judge the model. + model_name_or_path=self.gpteval_model, # use it to judge the model. max_tokens=1024, temperature=0, openai_api_key=openai.api_key @@ -46,13 +54,23 @@ def __init__(self, multi_turn=False, type: Literal["single", "pairwise"] = "sing self.min_scoring = 1 self.max_scoring = 10 - def __call__(self, predictions, references): + def setup_metric( + self, model_args: "ModelArguments", dataset_args: "DatasetArguments", evaluation_args: "EvaluationArguments", + dataset: "Dataset" + ): + execution_time = datetime.datetime.now().strftime(DEFAULT_DATETIME_FORMAT) + log_filename = f"{self.gpteval_model}-gpteval-{execution_time}.json" + gpteval_path = os.path.join(evaluation_args.evaluation_results_dir, log_filename) + self.gpteval_writer = PredictionWriter(gpteval_path) + + def __call__(self, predictions: List[Tuple[str, str]], references: List[Dict[str, str]]): # load gpteval model after the predictions of dataset are generated from ..load_model import load_model self.model = load_model(self.model_args) self.model.set_generation_args() + logger.info(f"GPTEval results will be saved to {self.gpteval_writer.evaluation_path}") if self.type == "single": ratings = self._get_single_ratings(predictions, references) @@ -71,8 +89,9 @@ def _generation(self, prompt): logger.warning(f"Failed to generate GPTEval response: {e}") return [str(self.min_scoring)] - def _get_single_ratings(self, predictions, references): + def _get_single_ratings(self, predictions: List[Tuple[str, str]], references: List[Dict[str, str]]): responses = [] + lines_iter = iter(zip(range(len(references)), predictions, references)) for pred, refer in tqdm(zip(predictions, references), desc="Judging", total=len(predictions)): if "ref_answer_1" not in refer: user_prompt = SINGLE_JUDGE_PROMPT_MT.format( @@ -89,7 +108,9 @@ def _get_single_ratings(self, predictions, references): ) if self.multi_turn else SINGLE_JUDGE_PROMPT_MATH.format( question=refer["turns"][0], ref_answer_1=refer["ref_answer_1"], answer=pred ) - responses.extend(self._generation(user_prompt)) + resp = self._generation(user_prompt) + self.gpteval_writer.log_batch_results([resp], False, lines_iter) + responses.extend(resp) ratings = [] for response in responses: @@ -107,14 +128,16 @@ def _get_single_ratings(self, predictions, references): logger.warning(f"Failed to extract rating from response: {response}") return ratings - def _get_pairwise_ratings(self, predictions, references): + def _get_pairwise_ratings(self, predictions: List[Tuple[str, str]], references: List[Dict[str, str]]): responses = [] + lines_iter = iter(zip(range(len(references)), predictions, references)) for pred, refer in tqdm(zip(predictions, references), desc="Judging", total=len(predictions)): current_prompt = PAIRWISE_JUDGE_PROMPT.format( question=refer["instruction"], answer_a=refer["output"], answer_b=pred ) - responses.extend(self._generation(current_prompt)) - time.sleep(5) + resp = self._generation(current_prompt) + 
self.gpteval_writer.log_batch_results([resp], False, lines_iter) + responses.extend(resp) ratings = [] for response in responses: diff --git a/utilization/metric/metric.py b/utilization/metric/metric.py index 3de655b8..11cc3d89 100644 --- a/utilization/metric/metric.py +++ b/utilization/metric/metric.py @@ -1,8 +1,12 @@ from logging import getLogger -from typing import Dict, List +from typing import TYPE_CHECKING, Dict, List import numpy as np +if TYPE_CHECKING: + from ..dataset import Dataset + from ..utils import DatasetArguments, EvaluationArguments, ModelArguments + logger = getLogger(__name__) @@ -32,6 +36,12 @@ def __call__(self, predictions, references) -> Dict[str, float]: def __repr__(self) -> str: return self.__class__.__name__ + "()" + def setup_metric( + self, model_args: "ModelArguments", dataset_args: "DatasetArguments", evaluation_args: "EvaluationArguments", + dataset: "Dataset" + ): + pass + @property def last_score_lists(self) -> Dict[str, List[float]]: if self._last_score_lists is None: diff --git a/utilization/metric/pass_at_k.py b/utilization/metric/pass_at_k.py index c69dd20a..e880d256 100644 --- a/utilization/metric/pass_at_k.py +++ b/utilization/metric/pass_at_k.py @@ -25,6 +25,9 @@ def set_k(self, k: int): raise ValueError(f"Pass@K expects an integer, but got {type(k)}.") self.k = k + def setup_metric(self, model_args, dataset_args, evaluation_args, dataset): + self.k = dataset_args.pass_at_k + def __call__(self, predictions, references): os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/utilization/metric/perspective_api.py b/utilization/metric/perspective_api.py index ccd307a3..9340085a 100644 --- a/utilization/metric/perspective_api.py +++ b/utilization/metric/perspective_api.py @@ -1,5 +1,6 @@ import time from logging import getLogger +from typing import TYPE_CHECKING import httplib2 import numpy as np @@ -8,13 +9,22 @@ from .metric import Metric +if TYPE_CHECKING: + from ..dataset import Dataset + from ..utils import DatasetArguments, EvaluationArguments, ModelArguments + logger = getLogger(__name__) -class Perspective_api(Metric): +class PerspectiveApi(Metric): r"""Call api to get the perspective toxic score.""" - def __init__(self, api_key: str, proxy_port: int): + def setup_metric( + self, model_args: "ModelArguments", dataset_args: "DatasetArguments", evaluation_args: "EvaluationArguments", + dataset: "Dataset" + ): + api_key = dataset_args.perspective_api_key + proxy_port = evaluation_args.proxy_port if proxy_port is None: http = httplib2.Http(timeout=10, disable_ssl_certificate_validation=False) else: diff --git a/utilization/metric/word_accuracy.py b/utilization/metric/word_accuracy.py index 65856b51..9f8ae9d5 100644 --- a/utilization/metric/word_accuracy.py +++ b/utilization/metric/word_accuracy.py @@ -1,4 +1,4 @@ -from typing import Dict, Union +from typing import TYPE_CHECKING, Dict, Union import numpy as np import tiktoken @@ -6,16 +6,23 @@ from .metric import Metric +if TYPE_CHECKING: + from ..dataset import Dataset + from ..utils.arguments import DatasetArguments, EvaluationArguments, ModelArguments -class Word_Accuracy(Metric): + +class WordAccuracy(Metric): r""" For those tasks only require to predict curtain number words, calculate the Accuracy score. 
Return "Accuracy": float """ - def __init__(self, tokenizer: Union[tiktoken.Encoding, PreTrainedTokenizer, PreTrainedTokenizerFast]): - self.tokenizer = tokenizer + def setup_metric( + self, model_args: "ModelArguments", dataset_args: "DatasetArguments", evaluation_args: "EvaluationArguments", + dataset: "Dataset" + ): + self.tokenizer = dataset.tokenizer def __call__(self, predictions, references) -> Dict[str, float]: if isinstance(self.tokenizer, tiktoken.Encoding): diff --git a/utilization/model/huggingface_model.py b/utilization/model/huggingface_model.py index 8431e6ca..59c4e496 100644 --- a/utilization/model/huggingface_model.py +++ b/utilization/model/huggingface_model.py @@ -10,7 +10,8 @@ from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_utils_fast import PreTrainedTokenizerFast -from ..utils import ModelArguments +from ..model_enum import HUGGINGFACE_ARGS +from ..utils import GenerationArg, ModelArguments, resolve_generation_args from .model import Model from .model_utils.conversation import Conversation from .model_utils.keywords_criteria import KeyWordsCriteria @@ -524,69 +525,36 @@ def get_ppl( return ppls def set_generation_args(self, **extra_model_args): - self.stop_id_sequences = [] - - generation_kwargs = {} - for key in [ - "temperature", - "top_p", - "top_k", - "max_tokens", - "best_of", - "repetition_penalty", - "length_penalty", - "early_stopping", - "no_repeat_ngram_size", - "stop", - ]: - # ModelArguments (cmd) > extra_model_args > ModelArguments (default) - if not self.args.passed_in_commandline(key): - value = extra_model_args.pop(key, None) - if value is None: - value = getattr(self.args, key, None) - - if key == "max_tokens": - if value is None: - value = 1024 - else: - # if `max_tokens` is provided, ensure the maximum length of input - self.model_max_input = self.model_max_input_and_output - value - - if value is not None: - if key == "max_tokens": - generation_kwargs["max_new_tokens"] = value - elif key == "best_of": - generation_kwargs["num_beams"] = value - elif key == "temperature": - if value > 0: - generation_kwargs["temperature"] = value - generation_kwargs["do_sample"] = True - else: - generation_kwargs["do_sample"] = False - elif key == "stop": - self.stop_id_sequences.extend( - self._tokenize_postfix( - value, # type: ignore - add_dummy_prefix=True, - padding=False, - ) - ) - generation_kwargs["stopping_criteria"] = [KeyWordsCriteria(self.stop_id_sequences)] - else: - generation_kwargs[key] = value - - generation_kwargs["pad_token_id"] = self.tokenizer.pad_token_id - generation_kwargs["eos_token_id"] = self.tokenizer.eos_token_id self.multi_turn = extra_model_args.pop("multi_turn", False) + if self.model_type != "chat" and self.multi_turn: + raise ValueError( + "The multi_turn is only available for chat-based model. Please use a chat model and set `--model_type chat`." + ) - if self.model_type != "chat": - if self.multi_turn: - raise ValueError( - "The multi_turn is only available for chat-based model. Please use a chat model and set `--model_type chat`." 
+ self.stop_id_sequences = [] + + def add_stop(value, details: GenerationArg): + self.stop_id_sequences.extend( + self._tokenize_postfix( + value, # type: ignore + add_dummy_prefix=True, + padding=False, ) + ) + return {"stopping_criteria": [KeyWordsCriteria(self.stop_id_sequences)]} + + self.generation_kwargs = resolve_generation_args( + self.args, + extra_model_args, + HUGGINGFACE_ARGS, + extra_generation_args={ + "stop": add_stop, + "pad_token_id": self.tokenizer.pad_token_id, + "eos_token_id": self.tokenizer.eos_token_id, + }, + ) - self.generation_kwargs = generation_kwargs if len(extra_model_args) > 0: logger.warning(f"Unused generation arguments: {extra_model_args}") return self.generation_kwargs @@ -595,7 +563,7 @@ def generation_with_cache( self, batched_inputs: List[str], prefix_cache: SequenceCache, - ) -> List[str]: + ) -> Tuple[List[str], SequenceCache]: caches = self.get_cache(batched_inputs, prefix_cache, save_token_ids=True, save_next_logits=True) prefix_cache = SequenceCache.pad_and_stack(caches) @@ -607,6 +575,7 @@ def generation_with_cache( generation_kwargs = self.generation_kwargs.copy() if "max_new_tokens" in generation_kwargs: generation_kwargs["max_new_tokens"] -= 1 + batch_outputs = self.model.generate( inputs=inputs, attention_mask=attention_mask, @@ -628,18 +597,15 @@ def generation(self, Returns: List[str]: The list of generation results. """ - + # batched_inputs: List[Conversation], batched_prompts: List[str] or List[List[str]] batched_prompts = [i.to_model_prompt() for i in batched_inputs] - if isinstance(batched_prompts[0], str): - prompts: List[str] = batched_prompts - else: + num_turns = batched_inputs[0].num_turns + assert all(conv.num_turns == num_turns for conv in batched_inputs) + if not isinstance(batched_prompts[0], str): grouped_prompts = list(map(list, zip(*batched_prompts))) - prompts: List[str] = ["".join(pg[i] for pg in grouped_prompts) for i in range(len(grouped_prompts[0]))] cache_level = len(grouped_prompts) - if self.model_type == "chat" and self.multi_turn: - return self._multi_turn_generation(batched_inputs) - elif use_cache and self.use_cache: + if use_cache and self.use_cache: # if cache is available, generation_with_cache prefix_cache, cached_num = self.cacher.get_cache() if prefix_cache is not None and cached_num == cache_level - 1: @@ -652,42 +618,10 @@ def generation(self, self.cacher.step() return [] - results = self._generation(prompts) - results = [conv[-1]["content"] if isinstance(conv, Conversation) else conv for conv in results] - return results - - def _multi_turn_generation(self, batched_mt_inputs: List[Conversation]) -> List[Tuple[str, ...]]: - """Multi-turn generation - - Args: - batched_mt_inputs (List[Conversation]): Batched multi-turn inputs. 
- - Returns: - List[Tuple[str, ...]]: [Batch Turn] - """ - batch_size = len(batched_mt_inputs) - max_turn = max(conv.num_turns for conv in batched_mt_inputs) - - answers = self._generation(history_conversations) - responses = [[conv[-1]["content"]] for conv in answers] - - for turn_idx in range(1, max_turn): - cur_turn = [mt_input[turn_idx] if len(mt_input) > turn_idx else None for mt_input in batched_mt_inputs] - next_batch = [] - for conv, turn in zip(history_conversations, cur_turn): - if turn is not None: - conv.add_message({"role": "user", "content": turn}) - next_batch.append(conv) - - answers = self._generation(next_batch) - - for idx, (r, turn) in enumerate(zip(responses, cur_turn)): - if turn is not None: - conv = answers.pop(0) - r.append(conv[-1]["content"]) - history_conversations[idx] = conv + for turn_idx in range(num_turns): + batched_inputs = self._generation(batched_inputs, turn_idx + 1) - return [tuple(r) for r in responses] + return [c.get_generation_results() for c in batched_inputs] def _generation( self, diff --git a/utilization/model/model.py b/utilization/model/model.py index 4cc0614a..52ff0824 100644 --- a/utilization/model/model.py +++ b/utilization/model/model.py @@ -7,7 +7,7 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast from ..model_enum import API_MODELS, ENDPOINT_ARGS, ERROR_OVERVIEW -from ..utils import ModelArguments +from ..utils import ModelArguments, resolve_generation_args from ..utils.arguments import ModelBackendMixin from .model_utils.conversation import Conversation from .model_utils.prefix_caching import Cacher @@ -203,68 +203,15 @@ def _get_error_type(response: Any) -> Optional[Type[Exception]]: def set_generation_args(self, **extra_model_args): r"""Set the configurations for open-ended generation. 
This is useful because different datasets may have different requirements for generation.""" if self.name in API_MODELS and "args" in API_MODELS[self.name]: - endpoint_args = API_MODELS[self.name]["args"] + endpoint_schema = API_MODELS[self.name]["args"] else: endpoint_name = self.model_backend + "/" + self.endpoint if endpoint_name not in ENDPOINT_ARGS: raise ValueError(f"Endpoint {endpoint_name} is not supported.") - endpoint_args = ENDPOINT_ARGS[endpoint_name] + endpoint_schema = ENDPOINT_ARGS[endpoint_name] - generation_kwargs = {} - - def set_args(key, value, extra_body): - if extra_body: - if "extra_body" not in generation_kwargs: - generation_kwargs["extra_body"] = {} - - if key in generation_kwargs["extra_body"] and generation_kwargs["extra_body"][key] != value: - raise ValueError(f"Conflict value for {key}: {generation_kwargs['extra_body'][key]} vs {value}") - - generation_kwargs["extra_body"][key] = value - else: - - if key in generation_kwargs and generation_kwargs[key] != value: - raise ValueError(f"Conflict value for {key}: {generation_kwargs[key]} vs {value}") - - generation_kwargs[key] = value - - for key, details in endpoint_args.items(): - # ModelArguments (cmd) > extra_model_args > ModelArguments (default) - if not self.args.passed_in_commandline(key): - value = extra_model_args.pop(key, None) - else: - value = None - if value is None: - value = getattr(self.args, key, None) - - # set default values - if value is None and details.default is not None: - value = details.default - - # set alias after default values - if details.transform_key is not None: - key = details.transform_key - - # type casting - if details._type is not None and value is not None: - value = details._type(value) - - # transform - if details.transform_value is not None and value is not None: - value = details.transform_value(value) - - # skip if no value - if value is None and not details.nullable: - continue - - if details.needs is not None: - for need, need_value in details.needs.items(): - set_args(need, need_value, endpoint_args[need].extra_body) - - set_args(key, value, details.extra_body) - - self.generation_kwargs = generation_kwargs self.multi_turn = extra_model_args.pop("multi_turn", False) + self.generation_kwargs = resolve_generation_args(self.args, extra_model_args, endpoint_schema) if len(extra_model_args) > 0: logger.warning(f"Unused generation arguments: {extra_model_args}") diff --git a/utilization/model/model_utils/batch_sampler.py b/utilization/model/model_utils/batch_sampler.py index ef9ce331..8bb19bde 100644 --- a/utilization/model/model_utils/batch_sampler.py +++ b/utilization/model/model_utils/batch_sampler.py @@ -85,7 +85,7 @@ def check_new_batch(self, queries: List[int], next_data: int) -> bool: batch_size = available_space // max_len batch_size = round_down(batch_size) - print("!!!", queries, current_batch, batch_size, available_space, max_len, self.first_max_len) + # print("!!!", queries, current_batch, batch_size, available_space, max_len, self.first_max_len) return current_batch >= batch_size def __iter__(self) -> Iterator[List[int]]: @@ -183,7 +183,7 @@ def __iter__(self) -> Iterator[List[int]]: yield from AutoBatchSizeSampler( iterator, self.batch_size if not self.vllm else total, - self.auto_batch_size, + self.auto_batch_size and not self.vllm, start_from=accumulative ) accumulative += total diff --git a/utilization/model/model_utils/conversation.py b/utilization/model/model_utils/conversation.py index cbc47c87..94c34ed0 100644 --- 
a/utilization/model/model_utils/conversation.py +++ b/utilization/model/model_utils/conversation.py @@ -49,7 +49,7 @@ def __init__( chat_config: Dict[str, str], chat_template: str, ): - self.default_stops = chat_config.pop("default_stops", []) + self.default_stop = chat_config.pop("default_stop", []) self.auto_leading_space = chat_config.pop("auto_leading_space", True) self.final_lstrip = chat_config.pop("final_lstrip", True) self.final_rstrip = chat_config.pop("final_rstrip", True) @@ -159,7 +159,7 @@ def _get_segs(self, conversations: List["Conversation"], max_turns: int = 1) -> for seg in (system, examples, source, target): if len(seg) > 0: if len(result) > 0: - seg = add_space(seg, True, result[-1]) + seg = add_space(seg, result[-1]) elif self.final_lstrip: seg = seg.lstrip() result += (seg,) @@ -300,9 +300,6 @@ def get_segs( seg: Optional[Literal["system", "examples", "source", "target"]] = None, ) -> Union[List[Dict[str, str]], Dict[str, List[Dict[str, str]]]]: """Get splitted segments of the conversation to cache the KV of them.""" - assert len( - self.mt_users - ) == self.num_turns - 1, "`get_segs` is only available before adding assistant responses." # system example_st = 0 @@ -312,8 +309,8 @@ def get_segs( else: system = [] - # few-shots example - example_ed = example_st + self.num_shots * 2 + # few-shots example (and previous turns of conversation) + example_ed = example_st + self.num_shots * 2 + (self.num_turns - len(self.mt_users) - 1) * 2 examples = self.messages[example_st:example_ed] assert all(msg["role"] == "user" for msg in examples[::2]) and all( msg["role"] == "assistant" for msg in examples[1::2] @@ -336,6 +333,17 @@ def get_segs( return results[seg] return results + def get_generation_results(self) -> Union[str, Tuple[str, ...]]: + if self.num_turns > 1: + multi_turn_st = int(self.messages[0]["role"] == "system") + self.num_shots * 2 + # transform to tuples for hashability + results = tuple(msg["content"] for msg in self.messages[multi_turn_st:] if msg["role"] == "assistant") + assert len(results) == self.num_turns + return results + else: + assert self.messages[-1]["role"] == "assistant" + return self.messages[-1]["content"] + def set_formatter( self, formatter: ConversationFormatter, @@ -344,7 +352,7 @@ def set_formatter( ): self.formatter = formatter self.model_evaluation_method = model_evaluation_method - self.split = split + self.split = split and self.get_segs_num() > 1 def to_model_prompt( self, diff --git a/utilization/model/model_utils/prefix_caching.py b/utilization/model/model_utils/prefix_caching.py index 51d157a0..b4bbcbc3 100644 --- a/utilization/model/model_utils/prefix_caching.py +++ b/utilization/model/model_utils/prefix_caching.py @@ -246,7 +246,7 @@ def pad_and_stack(cls, seq_caches: Sequence["SequenceCache"], pad_token_id: int def to_legacy_cache(self) -> Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]: """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format.""" - if any(s == 0 for s in self.key_cache[0].shape): + if len(self.key_cache) == 0 or any(s == 0 for s in self.key_cache[0].shape): return None legacy_cache = () diff --git a/utilization/model/vllm_model.py b/utilization/model/vllm_model.py index 5ef9c891..2713314f 100644 --- a/utilization/model/vllm_model.py +++ b/utilization/model/vllm_model.py @@ -5,6 +5,8 @@ import torch from packaging import version +from ..model_enum import VLLM_ARGS +from ..utils import resolve_generation_args from .model import Model from .model_utils.conversation import 
Conversation @@ -120,40 +122,26 @@ def get_ppl(self, batched_inputs): return ppls def set_generation_args(self, **extra_model_args): - generation_kwargs = {} - for key in [ - "temperature", - "top_p", - "top_k", - "max_tokens", - "best_of", - "frequency_penalty", - "presence_penalty", - "repetition_penalty", - "length_penalty", - "early_stopping", - "stop", - ]: - # ModelArguments > extra_model_args - value = getattr(self.args, key, None) - if value is None: - value = extra_model_args.pop(key, None) - - if key == "max_tokens" and value is None: - value = 1024 - if value is not None: - generation_kwargs[key] = value - if generation_kwargs.get("best_of", 0) > 1: - generation_kwargs["use_beam_search"] = True + + self.multi_turn = extra_model_args.pop("multi_turn", False) + generation_kwargs = resolve_generation_args(self.args, extra_model_args, VLLM_ARGS) self.generation_kwargs = SamplingParams(**generation_kwargs) + if len(extra_model_args) > 0: logger.warning(f"Unused generation arguments: {extra_model_args}") return self.generation_kwargs def generation(self, batched_inputs: List[Conversation]) -> List[str]: - batched_inputs = [conv.to_model_prompt() for conv in batched_inputs] - results = self.model.generate(batched_inputs, sampling_params=self.generation_kwargs) - return [r.outputs[0].text for r in results] + num_turns = batched_inputs[0].num_turns + assert all(conv.num_turns == num_turns for conv in batched_inputs) + + for turn_idx in range(num_turns): + batched_prompts = [conv.to_model_prompt() for conv in batched_inputs] + results = self.model.generate(batched_prompts, sampling_params=self.generation_kwargs) + for i, result in enumerate(results): + batched_inputs[i].add_multi_turn(assistant=result.outputs[0].text) + + return [c.get_generation_results() for c in batched_inputs] def set_prob_args(self, **extra_model_args): self.prob_kwargs = SamplingParams(max_tokens=1, temperature=0) diff --git a/utilization/model_enum.py b/utilization/model_enum.py index 8906438a..19e075be 100644 --- a/utilization/model_enum.py +++ b/utilization/model_enum.py @@ -7,8 +7,8 @@ "temperature": generation_arg(), "top_p": generation_arg(), "top_k": generation_arg(), - "max_tokens": generation_arg(), - "best_of": generation_arg(), + "max_tokens": generation_arg(default=1024), + "best_of": generation_arg(needs=lambda b, _: {"use_beam_search": b > 1}), "frequency_penalty": generation_arg(), "presence_penalty": generation_arg(), "repetition_penalty": generation_arg(), @@ -18,11 +18,11 @@ } HUGGINGFACE_ARGS = { - "temperature": generation_arg(), + "temperature": generation_arg(needs=lambda t, _: {"do_sample": t > 0}), "top_p": generation_arg(), "top_k": generation_arg(), - "max_tokens": generation_arg(), - "best_of": generation_arg(), + "max_tokens": generation_arg(default=1024, transform_key="max_new_tokens"), + "best_of": generation_arg(transform_key="num_beams"), "repetition_penalty": generation_arg(), "length_penalty": generation_arg(), "early_stopping": generation_arg(), @@ -89,22 +89,15 @@ } QIANFAN_CHAT_COMPLETIONS_ARGS = { - "temperature": - generation_arg(transform_value=lambda x: min(max(0.0001, x), 1.0), _type=float), - "top_p": - generation_arg(), - "top_k": - generation_arg(), - "penalty_score": - generation_arg(), - "stop": - generation_arg(), - "disable_search": - generation_arg(), - "enable_citation": - generation_arg(), + "temperature": generation_arg(transform_value=lambda x: min(max(0.0001, float(x)), 1.0)), + "top_p": generation_arg(), + "top_k": generation_arg(), + "penalty_score": 
generation_arg(), + "stop": generation_arg(), + "disable_search": generation_arg(), + "enable_citation": generation_arg(), "max_tokens": - generation_arg(default=1024, transform_key="max_output_tokens", transform_value=lambda x: max(2, x), _type=int), + generation_arg(default=1024, transform_key="max_output_tokens", transform_value=lambda x: max(2, int(x))), } diff --git a/utilization/utils/__init__.py b/utilization/utils/__init__.py index 45409e77..02b10d5a 100644 --- a/utilization/utils/__init__.py +++ b/utilization/utils/__init__.py @@ -1,5 +1,6 @@ from .arguments import DatasetArguments, EvaluationArguments, ModelArguments, parse_argument from .catch_error import catch_error from .dynamic_stride_tqdm import dynamic_stride_tqdm +from .generation_args import GenerationArg, resolve_generation_args from .log_results import PredictionWriter, log_final_results from .logging import getFileLogger diff --git a/utilization/utils/arguments.py b/utilization/utils/arguments.py index a103e8a0..40f85df1 100644 --- a/utilization/utils/arguments.py +++ b/utilization/utils/arguments.py @@ -256,17 +256,21 @@ class ModelArguments(ModelBackendMixin): } def __post_init__(self): - if self.vllm is None: - self.vllm = not self.prefix_caching - # set _model_impl first + # ============= Set model_backend ============= if self.model_backend is None: if self.model_name_or_path in API_MODELS: self.model_backend = API_MODELS[self.model_name_or_path]["model_backend"] - elif self.vllm: - self.model_backend = "vllm" - else: + elif self.vllm is not None: + self.model_backend = "vllm" if self.vllm else "huggingface" + elif self.prefix_caching is not None: + # unless explicitly set backend to vllm, prefix_caching uses huggingface backend self.model_backend = "huggingface" + else: + # try to load with vllm first + self.model_backend = "vllm" + + # ============= Init api keys and tokenizers ============= # set `self.openai_api_key` and `openai.api_key` from environment variables if "OPENAI_API_KEY" in os.environ and self.openai_api_key is None: @@ -282,10 +286,12 @@ def __post_init__(self): if self.tokenizer_name_or_path is None: try: self.tokenizer_name_or_path = tiktoken.encoding_for_model(self.model_name_or_path).name - except (KeyError, AttributeError) as e: + except AttributeError as e: raise RuntimeError( "Unsupported tiktoken library version. Please update the tiktoken library to the latest version or manually specify the tokenizer.\n\n pip install tiktoken --upgrade" ) from e + except KeyError as e: + self.tokenizer_name_or_path = "cl100k_base" # set `self.anthropic_api_key` from environment variables if "ANTHROPIC_API_KEY" in os.environ and self.anthropic_api_key is None: @@ -322,35 +328,30 @@ def __post_init__(self): if self.tokenizer_name_or_path is None: self.tokenizer_name_or_path = "cl100k_base" - if self.is_local_model(): - if self.model_type == "chat": - if not re.search(r"chat|instruct", self.model_name_or_path.lower()): - logger.warning( - f"Model {self.model_name_or_path} seems to be a base model, you can set --model_type to `base` or `instruction` to use base format." - ) - else: - if re.search(r"chat|instruct", self.model_name_or_path.lower()): - logger.warning( - f"Model {self.model_name_or_path} seems to be a chat-based model, you can set --model_type to `chat` to use chat format." 
- ) - if self.tokenizer_name_or_path is None: self.tokenizer_name_or_path = self.model_name_or_path + # ============= Init model type ============= + if self.model_name_or_path in API_MODELS: auto_model_type = API_MODELS[self.model_name_or_path]["model_type"] + elif self.is_local_model(): + auto_model_type = "chat" if re.search(r"chat|instruct", self.model_name_or_path.lower()) else "base" else: auto_model_type = None + # set auto_model_type if self.model_type is None and auto_model_type is not None: self.model_type = auto_model_type elif self.model_type is None and auto_model_type is None: - self.model_type = "chat" + self.model_type = "chat" # default model_type is "chat" elif auto_model_type is not None and self.model_type != auto_model_type: logger.warning( f"Model {self.model_name_or_path} seems to be a {auto_model_type} model, but get model_type {self.model_type}." ) + # ============= Init api endpoint ============= + if self.model_name_or_path in API_MODELS: auto_endpoint = API_MODELS[self.model_name_or_path]["endpoint"] elif not self.is_local_model(): @@ -361,19 +362,33 @@ def __post_init__(self): if self.api_endpoint is None: self.api_endpoint = auto_endpoint + # ============= Resolve vLLM and local inference backend ============= + # try to load as vllm model. If failed, fallback to huggingface model. # See `model/load.py` for details. - if not self.is_local_model(): - self.vllm = False - elif self.is_vllm_model(): - self.vllm = True - if self.vllm: + if self.is_vllm_model(): self.vllm_gpu_memory_utilization = 0.9 + if self.prefix_caching is None: + # prefix_caching is still experimental + self.prefix_caching = False - if self.prefix_caching is None: - # prefix caching of vllm is still experimental - self.prefix_caching = not self.vllm + elif self.is_huggingface_model(): + if self.prefix_caching is None: + self.prefix_caching = True + + # argparse encodes string with unicode_escape, decode it to normal string, e.g., "\\n" -> "\n" + if self.stop is not None: + if isinstance(self.stop, str): + self.stop = [self.stop] + for idx in range(len(self.stop)): + self.stop[idx] = self.stop[idx].encode('utf-8').decode('unicode_escape') + if self.system_prompt is not None: + self.system_prompt = self.system_prompt.encode('utf-8').decode('unicode_escape') + if self.chat_template is not None: + self.chat_template = self.chat_template.encode('utf-8').decode('unicode_escape') + + # ============= Set chat model and chat-templates ============= if self.model_type != "chat": if self.system_prompt: @@ -393,15 +408,6 @@ def __post_init__(self): logger.info(f"Automatically set chat_template to {config_name}.") break - # argparse encodes string with unicode_escape, decode it to normal string, e.g., "\\n" -> "\n" - if self.stop is not None: - if isinstance(self.stop, str): - self.stop = [self.stop] - for idx in range(len(self.stop)): - self.stop[idx] = self.stop[idx].encode('utf-8').decode('unicode_escape') - if self.system_prompt is not None: - self.system_prompt = self.system_prompt.encode('utf-8').decode('unicode_escape') - @dataclass class DatasetArguments: @@ -414,7 +420,7 @@ class DatasetArguments: metadata={"metavar": "DATASET"}, ) batch_size: batch_size_type = HfArg( - default=1, + default="16:auto", aliases=["-bsz", "-b"], help= "The evaluation batch size. Specify an integer (e.g., '10') to use a fixed batch size for all iterations. 
Alternatively, append ':auto' (e.g., '10:auto') to start with the specified batch size and automatically adjust it in subsequent iterations to maintain constant CUDA memory usage", @@ -493,7 +499,6 @@ class DatasetArguments: ) continue_from: ClassVar[int] = 0 - proxy_port: ClassVar[int] = None # set in `set_logging` with format "{evaluation_results_dir}/{log_filename}.json" evaluation_results_path: ClassVar[Optional[str]] = None @@ -600,14 +605,12 @@ def check_args(model_args: ModelArguments, dataset_args: DatasetArguments, evalu os.environ.pop("CUDA_VISIBLE_DEVICES") # vllm still has some bugs in ranking task - if model_args.is_local_model() and all(d not in DEFAULT_VLLM_DATASETS for d in dataset_args.dataset_names - ) and not model_args.passed_in_commandline("vllm"): - model_args.vllm = False + if model_args.is_vllm_model() and not model_args.passed_in_commandline("vllm") and any( + d not in DEFAULT_VLLM_DATASETS for d in dataset_args.dataset_names + ): model_args.model_backend = "huggingface" - - # copy arguments - if evaluation_args.proxy_port: - dataset_args.proxy_port = evaluation_args.proxy_port + if not model_args.passed_in_commandline("prefix_caching"): + model_args.prefix_caching = True model_args.seed = int(evaluation_args.seed) diff --git a/utilization/utils/generation_args.py b/utilization/utils/generation_args.py index 1f5e0580..3de81a8a 100644 --- a/utilization/utils/generation_args.py +++ b/utilization/utils/generation_args.py @@ -1,5 +1,15 @@ +from copy import deepcopy from dataclasses import dataclass -from typing import Any, Callable, Optional +from logging import getLogger +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union + +if TYPE_CHECKING: + from .arguments import ModelArguments + +logger = getLogger(__name__) + +Val = Any +ValMap = Callable[[Val, "GenerationArg"], Dict[str, Val]] @dataclass @@ -10,7 +20,7 @@ class GenerationArg: transform_value: Optional[Callable[[Any], Any]] transform_key: Optional[str] nullable: bool - needs: Optional[dict] + needs: Optional[Union[dict, ValMap]] extra_body: bool @@ -21,7 +31,109 @@ def generation_arg( transform_value: Optional[Callable[[Any], Any]] = None, transform_key: Optional[str] = None, nullable: bool = False, - needs: Optional[dict] = None, + needs: Optional[Union[dict, ValMap]] = None, extra_body: bool = False, ) -> GenerationArg: + assert _type is None or transform_value is None, "Cannot have both _type and transform_value" return GenerationArg(default, _type, transform_value, transform_key, nullable, needs, extra_body) + + +def set_args( + generation_kwargs: Dict[str, Val], + key: str, + value: Val, + details: Optional[GenerationArg] = None, + extra_body=False, +): + + if details is not None: + + # type casting + if details._type is not None and value is not None: + value = details._type(value) + + # transform + if details.transform_value is not None and value is not None: + value = details.transform_value(value) + + extra_body = details.extra_body + + display_key = f"extra_body.{key}" if extra_body else key + logger.debug(f"Setting {display_key} to {value}") + if extra_body: + if "extra_body" not in generation_kwargs: + generation_kwargs["extra_body"] = {} + + if key in generation_kwargs["extra_body"] and generation_kwargs["extra_body"][key] != value: + raise ValueError(f"Conflict value for {key}: {generation_kwargs['extra_body'][key]} vs {value}") + + generation_kwargs["extra_body"][key] = value + else: + + if key in generation_kwargs and generation_kwargs[key] != value: + raise 
ValueError(f"Conflict value for {key}: {generation_kwargs[key]} vs {value}") + + generation_kwargs[key] = value + + +def resolve_generation_args( + model_args: "ModelArguments", + extra_model_args: Dict[str, Any], + endpoint_schema: Dict[str, GenerationArg], + extra_generation_args: Optional[Dict[str, Union[Val, ValMap]]] = None, +) -> Dict[str, Any]: + generation_kwargs = {} + if extra_generation_args is None: + extra_generation_args = {} + + for key, details in deepcopy(endpoint_schema).items(): + # ModelArguments (cmd) > extra_model_args > ModelArguments (default) + if not model_args.passed_in_commandline(key): + value = extra_model_args.pop(key, None) + else: + value = None + if value is None: + value = getattr(model_args, key, None) + + # overrides + if key in extra_generation_args: + extra = extra_generation_args.pop(key) + if value is None and not details.nullable: + continue + if callable(extra): + overrided = extra(value, details) + for new_key, new_value in overrided.items(): + set_args(generation_kwargs, new_key, new_value, details) + else: + set_args(generation_kwargs, key, extra, details) + continue + + # set default values + if value is None and details.default is not None: + value = details.default + + # set alias after default values + if details.transform_key is not None: + key = details.transform_key + + # skip if no value + if value is None and not details.nullable: + continue + + if isinstance(details.needs, dict): + for need, need_value in details.needs.items(): + set_args(generation_kwargs, need, need_value, endpoint_schema.get(need, None)) + elif callable(details.needs): + need_dict = details.needs(value, model_args) + for need, need_value in need_dict.items(): + set_args(generation_kwargs, need, need_value, endpoint_schema.get(need, None)) + + set_args(generation_kwargs, key, value, details) + + if extra_generation_args: + if any(callable(v) for v in extra_generation_args.values()): + raise ValueError("Extra model args must be resolved before this point") + generation_kwargs.update(extra_generation_args) + extra_generation_args.clear() + + return generation_kwargs diff --git a/utilization/utils/logging.py b/utilization/utils/logging.py index 1481f174..fee8d5f7 100644 --- a/utilization/utils/logging.py +++ b/utilization/utils/logging.py @@ -5,7 +5,7 @@ import sys from dataclasses import fields from functools import lru_cache -from typing import TYPE_CHECKING, Callable, List, Optional +from typing import TYPE_CHECKING, List, Optional import coloredlogs @@ -37,13 +37,17 @@ "translation_dataset", "warn_once" } -WARNED = set() +LOGGED = set() -def warn_once(logger: logging.Logger, msg: str, identifier: str) -> Callable[[str], None]: - if identifier not in WARNED: - logger.warning(msg) - WARNED.add(identifier) +def log_once(call_log: callable, msg: str, identifier: str, stacklevel=2): + if identifier not in LOGGED: + call_log(msg, stacklevel=stacklevel) + LOGGED.add(identifier) + + +def warn_once(logger: logging.Logger, msg: str, identifier: str): + log_once(logger.warning, msg, identifier, stacklevel=3) @lru_cache