# config_large.yml
# Model info
# Mixtral 8x7B (MoE, ~46B params): mistralai/Mixtral-8x7B-Instruct-v0.1
# LLaMA 3 70B: meta-llama/Meta-Llama-3-70B-Instruct
model_name : "meta-llama/Meta-Llama-3-70B-Instruct"
# Dataset info
# English podcasts: shuyuej/English-Pretraining-Dataset
# Spanish podcasts: shuyuej/Spanish-Pretraining-Dataset
# French podcasts: shuyuej/French-Pretraining-Dataset
# Multilingual podcasts: shuyuej/Multilingual-Pretraining-Dataset
dataset_hf : "shuyuej/MedPodGPT-Demo-Data"
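# For reference: this dataset is presumably loaded with the `datasets` library, e.g.
# load_dataset("shuyuej/MedPodGPT-Demo-Data"), authenticated with the `read` token below.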
# Placeholders for your Hugging Face `read` and `write` tokens. Please replace them with your own.
# `read` token: for downloading models
# `write` token: for uploading your models to Hugging Face
# For more information: https://huggingface.co/settings/tokens
hf_read_token : "YOUR_HUGGING_FACE_READ_TOKEN" # Hugging Face `read` Token
hf_write_token : "YOUR_HUGGING_FACE_WRITE_TOKEN" # Hugging Face `write` Token
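# Tip (optional): rather than hard-coding tokens in this file, the standard Hugging Face
# alternatives are `huggingface-cli login` or exporting the HF_TOKEN environment variable.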
# Evaluate the original pre-trained model's performance
eval_pretrain : False
# LoRA
# LoRA attention dimension
lora_r : 16
# Alpha parameter for LoRA scaling
lora_alpha : 32
# Dropout probability for LoRA layers
lora_dropout : 0.1
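# For reference: in standard LoRA, the low-rank update is scaled by lora_alpha / lora_r,
# so the values above (32 / 16) apply the adapter with an effective scaling factor of 2.0.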
# Dataset path
# English Benchmarks
english_medqa : "./benchmark/english_medqa/MedQA_USMLE_test.jsonl"
english_pubmedqa : "./benchmark/english_pubmedqa/PubMedQA_test.json"
english_medmcqa : "./benchmark/english_medmcqa/MedMCQA_test.json"
english_usmle_step1 : "./benchmark/english_usmle/USMLE_STEP_1.json"
english_usmle_step2 : "./benchmark/english_usmle/USMLE_STEP_2.json"
english_usmle_step3 : "./benchmark/english_usmle/USMLE_STEP_3.json"
english_usmle_ethics : "./benchmark/english_usmle/USMLE_ethics.json"
english_mmlu_anatomy : "./benchmark/english_mmlu/anatomy_test.csv"
english_mmlu_clinical_knowledge : "./benchmark/english_mmlu/clinical_knowledge_test.csv"
english_mmlu_college_biology : "./benchmark/english_mmlu/college_biology_test.csv"
english_mmlu_college_medicine : "./benchmark/english_mmlu/college_medicine_test.csv"
english_mmlu_medical_genetics : "./benchmark/english_mmlu/medical_genetics_test.csv"
english_mmlu_professional_medicine : "./benchmark/english_mmlu/professional_medicine_test.csv"
english_medexpqa : "./benchmark/english_medexpqa/test.en.casimedicos.rag.jsonl"
# Chinese Benchmarks
chinese_mcmle : "./benchmark/chinese_mcmle/MedQA-MCMLE.jsonl"
chinese_cmmlu_anatomy : "./benchmark/chinese_cmmlu/anatomy.csv"
chinese_cmmlu_clinical_knowledge : "./benchmark/chinese_cmmlu/clinical_knowledge.csv"
chinese_cmmlu_college_medicine : "./benchmark/chinese_cmmlu/college_medicine.csv"
chinese_cmmlu_genetics : "./benchmark/chinese_cmmlu/genetics.csv"
chinese_cmmlu_nutrition : "./benchmark/chinese_cmmlu/nutrition.csv"
chinese_cmmlu_tcm : "./benchmark/chinese_cmmlu/traditional_chinese_medicine.csv"
chinese_cmmlu_virology : "./benchmark/chinese_cmmlu/virology.csv"
# French Benchmarks
french_medmcqa : "./benchmark/french_medmcqa/FrenchMedMCQA-test.json"
french_mmlu_anatomy : "./benchmark/french_mmlu/mmlu_French_test_anatomy_test.csv"
french_mmlu_clinical_knowledge : "./benchmark/french_mmlu/mmlu_French_test_clinical_knowledge_test.csv"
french_mmlu_college_biology : "./benchmark/french_mmlu/mmlu_French_test_college_biology_test.csv"
french_mmlu_college_medicine : "./benchmark/french_mmlu/mmlu_French_test_college_medicine_test.csv"
french_mmlu_medical_genetics : "./benchmark/french_mmlu/mmlu_French_test_medical_genetics_test.csv"
french_mmlu_professional_medicine : "./benchmark/french_mmlu/mmlu_French_test_professional_medicine_test.csv"
french_medexpqa : "./benchmark/french_medexpqa/test.fr.casimedicos.rag.jsonl"
# Spanish Benchmarks
spanish_headqa : "./benchmark/spanish_headqa/HEAD-QA-test.json"
spanish_mmlu_anatomy : "./benchmark/spanish_mmlu/mmlu_Spanish_test_anatomy_test.csv"
spanish_mmlu_clinical_knowledge : "./benchmark/spanish_mmlu/mmlu_Spanish_test_clinical_knowledge_test.csv"
spanish_mmlu_college_biology : "./benchmark/spanish_mmlu/mmlu_Spanish_test_college_biology_test.csv"
spanish_mmlu_college_medicine : "./benchmark/spanish_mmlu/mmlu_Spanish_test_college_medicine_test.csv"
spanish_mmlu_medical_genetics : "./benchmark/spanish_mmlu/mmlu_Spanish_test_medical_genetics_test.csv"
spanish_mmlu_professional_medicine : "./benchmark/spanish_mmlu/mmlu_Spanish_test_professional_medicine_test.csv"
spanish_medexpqa : "./benchmark/spanish_medexpqa/test.es.casimedicos.rag.jsonl"
# Hindi Benchmarks
hindi_mmlu_anatomy : "./benchmark/hindi_mmlu/mmlu_Hindi_test_anatomy_test.csv"
hindi_mmlu_clinical_knowledge : "./benchmark/hindi_mmlu/mmlu_Hindi_test_clinical_knowledge_test.csv"
hindi_mmlu_college_biology : "./benchmark/hindi_mmlu/mmlu_Hindi_test_college_biology_test.csv"
hindi_mmlu_college_medicine : "./benchmark/hindi_mmlu/mmlu_Hindi_test_college_medicine_test.csv"
hindi_mmlu_medical_genetics : "./benchmark/hindi_mmlu/mmlu_Hindi_test_medical_genetics_test.csv"
hindi_mmlu_professional_medicine : "./benchmark/hindi_mmlu/mmlu_Hindi_test_professional_medicine_test.csv"
# Saving paths
result_dir : "./results"
save_dir : "./save_folder"
data_save_dir : "./save_folder/data"
# Maximum training sequence length and number of generated tokens
train_max_len : 2048
max_new_tokens : 1024
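# Note: train_max_len presumably caps the tokenized sequence length during fine-tuning,
# while max_new_tokens bounds the generation length at inference/evaluation time.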
# Batch size
train_batch_size : 1
# Number of training epochs
epochs : 5
# Optimizer, learning-rate schedule, and warm-up ratio
# paged_adamw_8bit, adamw_torch
optim : "paged_adamw_8bit"
lr_scheduler_type : "cosine"
warmup_ratio : 0.03
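# For reference: with a 0.03 warm-up ratio, the learning rate ramps up linearly over the
# first 3% of optimizer steps (e.g. 300 of 10,000) and then follows a cosine decay.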
# Activation (gradient) checkpointing
# When enabled, a lot of memory can be freed at the cost of a small decrease in training speed,
# due to recomputing parts of the graph during back-propagation.
gradient_checkpointing : True
# Number of update steps to accumulate the gradients
gradient_accumulation_steps : 1
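# For reference: the effective batch size is train_batch_size x gradient_accumulation_steps
# x number of devices; with the settings above, that is 1 x 1 per device.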
# Initial learning rate (0.000005 = 5e-6)
learning_rate : 0.000005
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay : 0.01
# Checkpoint saving strategy: per `epoch` or every N `steps`
save_strategy : "epoch"
# Whether to save only the model, without the optimizer and scheduler states
save_only_model : True
# Number of total saved checkpoints
save_total_limit : 10
# Log every X update steps
logging_steps : 1
# Logging platform
log_save_platform : "tensorboard"
# Enable fp16/bf16 training (set bf16 to True with an A100)
# Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
fp16 : False
# Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
# NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.
bf16 : True
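# Tip (optional): bf16 availability can be checked at runtime in PyTorch with
# torch.cuda.is_bf16_supported(); it returns True on Ampere (e.g. A100) and newer GPUs.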
# Device placement: `auto` automatically spreads the model across available GPUs
device_map : "auto"
# The number of GPUs and GPU utilization for the vLLM Engine
# https://docs.vllm.ai/en/latest/serving/distributed_serving.html
num_gpus_vllm : 4
gpu_utilization_vllm : 0.95
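# These presumably correspond to vLLM's `tensor_parallel_size` and `gpu_memory_utilization`
# arguments, e.g. LLM(model=model_name, tensor_parallel_size=4, gpu_memory_utilization=0.95).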