# config_large.yml
# Model info
# Mixtral 8x7B (MoE, ~46B params): mistralai/Mixtral-8x7B-Instruct-v0.1
# LLaMA 3 70B: meta-llama/Meta-Llama-3-70B-Instruct
model_name : "meta-llama/Meta-Llama-3-70B-Instruct"
# Dataset info
# English podcasts: shuyuej/English-Pretraining-Dataset
# Spanish podcasts: shuyuej/Spanish-Pretraining-Dataset
# French podcasts: shuyuej/French-Pretraining-Dataset
# Multilingual podcasts: shuyuej/Multilingual-Pretraining-Dataset
dataset_hf : "shuyuej/MedPodGPT-Demo-Data"
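# For reference: this dataset is presumably loaded with the `datasets` library, e.g.
# load_dataset("shuyuej/MedPodGPT-Demo-Data"), authenticated with the `read` token below.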
# Placeholders for your Hugging Face `read` and `write` tokens. Please replace them with your own.
# `read` token: for downloading models
# `write` token: for uploading your models to Hugging Face
# For more information: https://huggingface.co/settings/tokens
hf_read_token : "YOUR_HUGGING_FACE_READ_TOKEN" # Hugging Face `read` Token
hf_write_token : "YOUR_HUGGING_FACE_WRITE_TOKEN" # Hugging Face `write` Token
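# Tip (optional): rather than hard-coding tokens in this file, the standard Hugging Face
# alternatives are `huggingface-cli login` or exporting the HF_TOKEN environment variable.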
# Evaluate the original pre-trained model's performance
eval_pretrain : False
# LoRA
# LoRA attention dimension
lora_r : 16
# Alpha parameter for LoRA scaling
lora_alpha : 32
# Dropout probability for LoRA layers
lora_dropout : 0.1
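# For reference: in standard LoRA, the low-rank update is scaled by lora_alpha / lora_r,
# so the values above (32 / 16) apply the adapter with an effective scaling factor of 2.0.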
# Dataset path
# English Benchmarks
english_medqa : "./benchmark/english_medqa/MedQA_USMLE_test.jsonl"
english_pubmedqa : "./benchmark/english_pubmedqa/PubMedQA_test.json"
english_medmcqa : "./benchmark/english_medmcqa/MedMCQA_test.json"
english_usmle_step1 : "./benchmark/english_usmle/USMLE_STEP_1.json"
english_usmle_step2 : "./benchmark/english_usmle/USMLE_STEP_2.json"
english_usmle_step3 : "./benchmark/english_usmle/USMLE_STEP_3.json"
english_usmle_ethics : "./benchmark/english_usmle/USMLE_ethics.json"
english_mmlu_anatomy : "./benchmark/english_mmlu/anatomy_test.csv"
english_mmlu_clinical_knowledge : "./benchmark/english_mmlu/clinical_knowledge_test.csv"
english_mmlu_college_biology : "./benchmark/english_mmlu/college_biology_test.csv"
english_mmlu_college_medicine : "./benchmark/english_mmlu/college_medicine_test.csv"
english_mmlu_medical_genetics : "./benchmark/english_mmlu/medical_genetics_test.csv"
english_mmlu_professional_medicine : "./benchmark/english_mmlu/professional_medicine_test.csv"
english_medexpqa : "./benchmark/english_medexpqa/test.en.casimedicos.rag.jsonl"
# Chinese Benchmarks
chinese_mcmle : "./benchmark/chinese_mcmle/MedQA-MCMLE.jsonl"
chinese_cmmlu_anatomy : "./benchmark/chinese_cmmlu/anatomy.csv"
chinese_cmmlu_clinical_knowledge : "./benchmark/chinese_cmmlu/clinical_knowledge.csv"
chinese_cmmlu_college_medicine : "./benchmark/chinese_cmmlu/college_medicine.csv"
chinese_cmmlu_genetics : "./benchmark/chinese_cmmlu/genetics.csv"
chinese_cmmlu_nutrition : "./benchmark/chinese_cmmlu/nutrition.csv"
chinese_cmmlu_tcm : "./benchmark/chinese_cmmlu/traditional_chinese_medicine.csv"
chinese_cmmlu_virology : "./benchmark/chinese_cmmlu/virology.csv"
# French Benchmarks
french_medmcqa : "./benchmark/french_medmcqa/FrenchMedMCQA-test.json"
french_mmlu_anatomy : "./benchmark/french_mmlu/mmlu_French_test_anatomy_test.csv"
french_mmlu_clinical_knowledge : "./benchmark/french_mmlu/mmlu_French_test_clinical_knowledge_test.csv"
french_mmlu_college_biology : "./benchmark/french_mmlu/mmlu_French_test_college_biology_test.csv"
french_mmlu_college_medicine : "./benchmark/french_mmlu/mmlu_French_test_college_medicine_test.csv"
french_mmlu_medical_genetics : "./benchmark/french_mmlu/mmlu_French_test_medical_genetics_test.csv"
french_mmlu_professional_medicine : "./benchmark/french_mmlu/mmlu_French_test_professional_medicine_test.csv"
french_medexpqa : "./benchmark/french_medexpqa/test.fr.casimedicos.rag.jsonl"
# Spanish Benchmarks
spanish_headqa : "./benchmark/spanish_headqa/HEAD-QA-test.json"
spanish_mmlu_anatomy : "./benchmark/spanish_mmlu/mmlu_Spanish_test_anatomy_test.csv"
spanish_mmlu_clinical_knowledge : "./benchmark/spanish_mmlu/mmlu_Spanish_test_clinical_knowledge_test.csv"
spanish_mmlu_college_biology : "./benchmark/spanish_mmlu/mmlu_Spanish_test_college_biology_test.csv"
spanish_mmlu_college_medicine : "./benchmark/spanish_mmlu/mmlu_Spanish_test_college_medicine_test.csv"
spanish_mmlu_medical_genetics : "./benchmark/spanish_mmlu/mmlu_Spanish_test_medical_genetics_test.csv"
spanish_mmlu_professional_medicine : "./benchmark/spanish_mmlu/mmlu_Spanish_test_professional_medicine_test.csv"
spanish_medexpqa : "./benchmark/spanish_medexpqa/test.es.casimedicos.rag.jsonl"
# Hindi Benchmarks
hindi_mmlu_anatomy : "./benchmark/hindi_mmlu/mmlu_Hindi_test_anatomy_test.csv"
hindi_mmlu_clinical_knowledge : "./benchmark/hindi_mmlu/mmlu_Hindi_test_clinical_knowledge_test.csv"
hindi_mmlu_college_biology : "./benchmark/hindi_mmlu/mmlu_Hindi_test_college_biology_test.csv"
hindi_mmlu_college_medicine : "./benchmark/hindi_mmlu/mmlu_Hindi_test_college_medicine_test.csv"
hindi_mmlu_medical_genetics : "./benchmark/hindi_mmlu/mmlu_Hindi_test_medical_genetics_test.csv"
hindi_mmlu_professional_medicine : "./benchmark/hindi_mmlu/mmlu_Hindi_test_professional_medicine_test.csv"
# Saving paths
result_dir : "./results"
save_dir : "./save_folder"
data_save_dir : "./save_folder/data"
# Maximum training sequence length and number of generated tokens
train_max_len : 2048
max_new_tokens : 1024
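# Note: train_max_len presumably caps the tokenized sequence length during fine-tuning,
# while max_new_tokens bounds the generation length at inference/evaluation time.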
# Batch size
train_batch_size : 1
# Number of training epochs
epochs : 5
# Optimizer, learning-rate schedule, and warm-up ratio
# paged_adamw_8bit, adamw_torch
optim : "paged_adamw_8bit"
lr_scheduler_type : "cosine"
warmup_ratio : 0.03
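# For reference: with a 0.03 warm-up ratio, the learning rate ramps up linearly over the
# first 3% of optimizer steps (e.g. 300 of 10,000) and then follows a cosine decay.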
# Activation (gradient) checkpointing
# When enabled, a lot of memory can be freed at the cost of a small decrease in training speed,
# due to recomputing parts of the graph during back-propagation.
gradient_checkpointing : True
# Number of update steps to accumulate the gradients
gradient_accumulation_steps : 1
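# For reference: the effective batch size is train_batch_size x gradient_accumulation_steps
# x number of devices; with the settings above, that is 1 x 1 per device.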
# Initial learning rate (0.000005 = 5e-6)
learning_rate : 0.000005
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay : 0.01
# Checkpoint saving strategy: per `epoch` or every N `steps`
save_strategy : "epoch"
# Whether to save only the model, without the optimizer and scheduler states
save_only_model : True
# Number of total saved checkpoints
save_total_limit : 10
# Log every X update steps
logging_steps : 1
# Logging platform
log_save_platform : "tensorboard"
# Enable fp16/bf16 training (set bf16 to True with an A100)
# Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
fp16 : False
# Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
# NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.
bf16 : True
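# Tip (optional): bf16 availability can be checked at runtime in PyTorch with
# torch.cuda.is_bf16_supported(); it returns True on Ampere (e.g. A100) and newer GPUs.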
# Device placement: `auto` automatically spreads the model across available GPUs
device_map : "auto"
# The number of GPUs and GPU utilization for the vLLM Engine
# https://docs.vllm.ai/en/latest/serving/distributed_serving.html
num_gpus_vllm : 4
gpu_utilization_vllm : 0.95
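# These presumably correspond to vLLM's `tensor_parallel_size` and `gpu_memory_utilization`
# arguments, e.g. LLM(model=model_name, tensor_parallel_size=4, gpu_memory_utilization=0.95).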