More systematic and reproducible conversion of SFT datasets (#387)
* Reorganize the data preparation scripts for tulu v1 and v2.

* Minor improvement

* Remove open_platypus_commercial subset from Daring-Anteater

* Use hard-coded examples repo.

* Fix some bugs.

* Add OpenMathInstruct.

* Add a few more v3.5.x SFT mix ablations for the cleaner datasets.

* More experiments on mixes.

* help merge

* prep for merge

* reapply changes

* fix naming

---------

Co-authored-by: Nathan Lambert <[email protected]>
yizhongw and natolambert authored Nov 11, 2024
1 parent 8de53e6 commit f43d69a
Showing 45 changed files with 3,985 additions and 7 deletions.
@@ -0,0 +1,114 @@
model_name_or_path: meta-llama/Llama-3.1-8B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-8B
use_slow_tokenizer: true
dataset_mixer:
# For this mix, we try to replace the datasets in v3.4 with
# the filtered and cleaner versions, processed by us.
# ------------------------------------------------------
# no_robots dataset, human written, for general chat.
# License: CC-By-NC-4.0
# Total: 9500
ai2-adapt-dev/no_robots_converted: 9500
# ------------------------------------------------------
# OpenAssistant dataset, human written, for general chat.
# Here, only the highest rated paths are extracted.
# We use the version 2.
# License: Apache-2.0
# Total: 9091
ai2-adapt-dev/oasst2_converted: 9091
# ------------------------------------------------------
# Aya dataset, human written, for general chat (multilingual).
# License: Apache-2.0
# Total: 202285
ai2-adapt-dev/aya_dataset_converted: 202285
# ------------------------------------------------------
# Tulu hard-coded examples, human written, for identity-related questions.
# License: TBD
# Total: 240
ai2-adapt-dev/tulu_hard_coded_repeated_10: 240
# ------------------------------------------------------
# SciRIFF dataset, human (researchers) converted from existing datasets, for scientific literature understanding.
# Here, we use the subset extracted by the authors in building allenai/SciRIFF-train-mix.
# License: ODC-By (the underlying datasets have individual licenses)
# Total: 35357
ai2-adapt-dev/sciriff_converted: 10000
# ------------------------------------------------------
# SlimOrca dataset, gpt4 generated, for general chat.
# License: MIT
# Total: 517980
ai2-adapt-dev/slim_orca_converted: 100000
# ------------------------------------------------------
# WizardLM evol instruct dataset, gpt4 generated, for general chat.
# License: Academic-only disclaimer (https://github.com/nlpxucan/WizardLM?tab=readme-ov-file#disclaimer)
# Total: 142432
ai2-adapt-dev/wizardlm_converted: 30000
# ------------------------------------------------------
# WildChat dataset, real user queries + gpt4 responses, for general chat.
# License: ODC-BY
# Total: 241307 (significantly more if including the gpt3.5 instances)
ai2-adapt-dev/wildchat_gpt4_converted: 241307
# ------------------------------------------------------
# Daring-Anteater, a mix of existing datasets, for general chat.
# Here, we removed the `open_platypus_commercial` subset due to test set contamination.
# License: CC-By-4.0
# Total: 93486
ai2-adapt-dev/daring_anteater_converted: 93486
# ------------------------------------------------------
# MetaMathQA dataset, augmented using gpt4, for math capability.
# License: MIT
# Total: 394996
ai2-adapt-dev/metamath_converted: 100000
# ------------------------------------------------------
# WebInstruct dataset, extracted & rewritten using gpt4, (mainly) for math/science-related questions.
# We processed the Stack Exchange and Socratic subsets separately due to their different licenses.
# Stack Exchange subset
# License: Apache-2.0
# Total: 1801802
ai2-adapt-dev/webinstruct_se_sub_converted: 75000
# Socratic subset
# License: CC BY-NC 4.0
# Total: 533382
ai2-adapt-dev/webinstruct_socratic_sub_converted: 25000
# ------------------------------------------------------
# Codefeedback Filtered Instruction, a mix of existing datasets, for coding
# The data mix includes:
# Magicoder-OSS-Instruct
# Python code subset of ShareGPT
# Magicoder-Evol-Instruct
# Evol-Instruct-Code
# Total: 156361
# License: listed as Apache 2.0, but the individual subsets have discrepancies
ai2-adapt-dev/codefeedback_filtered_instructions_converted: 156361
# ------------------------------------------------------
# Table-GPT dataset, converted & synthesized, for table understanding and operations
# Total: 13222
# License: MIT
ai2-adapt-dev/table_gpt_converted: 3000
# ------------------------------------------------------
# Coconot dataset, generated by gpt4, for non-compliance
# Total: 10983
# License: ODC-BY
ai2-adapt-dev/coconot_converted: 10983
# ------------------------------------------------------
# NuminaMath-TIR, extracted and generated by gpt4, for tool-integrated math reasoning
# Total: 72441
# License: CC-By-NC-4.0
ai2-adapt-dev/numinamath_tir_converted: 72441
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
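
A minimal sketch (not the repository's actual mixing code) of how a dataset_mixer mapping like the one above could be materialized with the Hugging Face datasets library. It assumes each key is a Hub dataset with a "train" split sharing a common schema, each value is the number of examples to sample, and the config filename is hypothetical:

import yaml
from datasets import load_dataset, concatenate_datasets

# Hypothetical path to a config like the one above.
with open("llama_3_1_8b_sft_mix.yaml") as f:
    config = yaml.safe_load(f)

subsets = []
for repo_id, num_examples in config["dataset_mixer"].items():
    ds = load_dataset(repo_id, split="train")
    # Take at most `num_examples`; smaller datasets are used in full.
    n = min(int(num_examples), len(ds))
    subsets.append(ds.shuffle(seed=42).select(range(n)))

# Assumes all converted datasets share the same columns (e.g. a `messages` field).
mixed = concatenate_datasets(subsets).shuffle(seed=42)
print(f"Mixed SFT set: {len(mixed)} examples from {len(subsets)} datasets")
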
@@ -0,0 +1,123 @@
model_name_or_path: meta-llama/Llama-3.1-8B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-8B
use_slow_tokenizer: true
dataset_mixer:
# This mix adds back SlimOrca to v3.5.9
# ------------------------------------------------------
# OpenAssistant dataset, human written, for general chat.
# Here, only the highest rated paths are extracted.
# We use the version 2.
# License: Apache-2.0
# Total: 9091
ai2-adapt-dev/oasst2_converted: 9091
# ------------------------------------------------------
# Aya dataset, human written, for general chat (multilingual).
# License: Apache-2.0
# Total: 202285
ai2-adapt-dev/aya_dataset_converted: 202285
# ------------------------------------------------------
# Tulu hard-coded examples, human written, for identity-related questions.
# License: TBD
# Total: 240
ai2-adapt-dev/tulu_hard_coded_repeated_10: 240
# ------------------------------------------------------
# FLAN v2, human (researchers) converted from existing datasets, for reasoning.
# Here, we downsampled the subsets with very different proportions based on our intuitions.
# License: Codebase is licensed under Apache-2.0.
# Total: 89982
ai2-adapt-dev/flan_v2_converted: 89982
# ------------------------------------------------------
# SlimOrca, GPT4 generated based on FLAN.
# License: MIT
# Total: 517980
ai2-adapt-dev/slim_orca_converted: 100000
# ------------------------------------------------------
# SciRIFF dataset, human (researchers) converted from existing datasets, for scientific literature understanding.
# Here, we use the subset extracted by the authors in building allenai/SciRIFF-train-mix.
# License: ODC-By (the underlying datasets have individual licenses)
# Total: 35357
ai2-adapt-dev/sciriff_converted: 10000
# ------------------------------------------------------
# WildChat dataset, real user queries + gpt4 responses, for general chat.
# License: ODC-BY
# Total: 241307 (significantly more if including the gpt3.5 instances)
ai2-adapt-dev/wildchat_gpt4_converted: 241307
# ------------------------------------------------------
# Daring-Anteater, a mix of existing datasets, for general chat.
# Here, we removed the `open_platypus_commercial` subset due to test set contamination.
# License: CC-By-4.0
# Total: 93486
# ai2-adapt-dev/daring_anteater_converted: 93486
# ------------------------------------------------------
# Codefeedback Filtered Instruction, a mix of existing datasets, for coding
# The data mix includes:
# Magicoder-OSS-Instruct
# Python code subset of ShareGPT
# Magicoder-Evol-Instruct
# Evol-Instruct-Code
# Total: 156361
# License: listed as Apache 2.0, but the individual subsets have discrepancies
# ai2-adapt-dev/codefeedback_filtered_instructions_converted: 156361
# ------------------------------------------------------
# As a replacement for codefeedback, we use Magicoder-Evol-Instruct,
# which is a decontaminated version of evol-codealpaca-v1.
# License: Apache 2.0
# Total: 110999
ai2-adapt-dev/evol_codealpaca_converted: 110999
# ------------------------------------------------------
# Table-GPT dataset, converted & synthesized, for table understanding and operations
# Total: 13222
# License: MIT
ai2-adapt-dev/table_gpt_converted: 3000
# ------------------------------------------------------
# Coconot dataset, generated by gpt4, for non-compliance
# Total: 10983
# License: ODC-BY
ai2-adapt-dev/coconot_converted: 10983
# ------------------------------------------------------
# NuminaMath-TIR, extracted and generated by gpt4, for tool-integrated math reasoning
# Total: 72441
# License: CC-By-NC-4.0
ai2-adapt-dev/numinamath_tir_converted: 72441
# ------------------------------------------------------
# Math dataset generated by Faeze using PersonaHub
# License: TBD
# Total: 149975
ai2-adapt-dev/personahub_math_v4_149975: 149975
# ------------------------------------------------------
# Instruction-following dataset generated by Faeze using PersonaHub
# License: TBD
# Total: 29980
ai2-adapt-dev/personahub_ifdata_v1_29980: 29980
# ------------------------------------------------------
# OpenMathInstruct-2 dataset
# Here, we take their gsm8k subset
# License: CC-By-4.0
# Total: 2570505
ai2-adapt-dev/openmath-2-gsm8k: 50000
# ------------------------------------------------------
# WildGuardMix dataset reprocessed by Nouha, targeting safety
# License: TBD
# Total: 86759
ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 86759
# ------------------------------------------------------
# WildJailbreak dataset reprocessed by Nouha, targeting safety
# License: TBD
# Total: 261559
ai2-adapt-dev/processed-wildjailbreak: 261559
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
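
A quick back-of-the-envelope check of the effective batch size implied by the hyperparameters above, assuming the 8 GPUs per node and 4 nodes mentioned in the config comments:

per_device_train_batch_size = 1
gradient_accumulation_steps = 4
gpus_per_node = 8   # per the "set up for 8 GPUs" comment
num_nodes = 4       # per the "with 4 nodes" comment

effective_batch_size = (
    per_device_train_batch_size
    * gradient_accumulation_steps
    * gpus_per_node
    * num_nodes
)
print(effective_batch_size)  # 128, matching the comment in the config
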
@@ -0,0 +1,123 @@
model_name_or_path: meta-llama/Llama-3.1-8B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-8B
use_slow_tokenizer: true
dataset_mixer:
# This mix adds back WizardLM to v3.5.9
# ------------------------------------------------------
# OpenAssistant dataset, human written, for general chat.
# Here, only the highest rated paths are extracted.
# We use the version 2.
# License: Apache-2.0
# Total: 9091
ai2-adapt-dev/oasst2_converted: 9091
# ------------------------------------------------------
# Aya dataset, human written, for general chat (multilingual).
# License: Apache-2.0
# Total: 202285
ai2-adapt-dev/aya_dataset_converted: 202285
# ------------------------------------------------------
# Tulu hard-coded examples, human written, for identity-related questions.
# License: TBD
# Total: 240
ai2-adapt-dev/tulu_hard_coded_repeated_10: 240
# ------------------------------------------------------
# FLAN v2, human (researchers) converted from existing datasets, for reasoning.
# Here, we downsampled the subsets with very different proportions based on our intuitions.
# License: Codebase is licensed under Apache-2.0.
# Total: 89982
ai2-adapt-dev/flan_v2_converted: 89982
# ------------------------------------------------------
# SciRIFF dataset, human (researchers) converted from existing datasets, for scientific literature understanding.
# Here, we use the subset extracted by the authors in building allenai/SciRIFF-train-mix.
# License: ODC-By (the underlying datasets have individual licenses)
# Total: 35357
ai2-adapt-dev/sciriff_converted: 10000
# ------------------------------------------------------
# WildChat dataset, real user queries + gpt4 responses, for general chat.
# License: ODC-BY
# Total: 241307 (significantly more if including the gpt3.5 instances)
ai2-adapt-dev/wildchat_gpt4_converted: 241307
# ------------------------------------------------------
# WizardLM dataset, GPT4 generated using the evol-instruct method
# License: Academic-only disclaimer (https://github.com/nlpxucan/WizardLM?tab=readme-ov-file#disclaimer)
# Total: 142432
ai2-adapt-dev/wizardlm_converted: 100000
# ------------------------------------------------------
# Daring-Anteater, a mix of existing datasets, for general chat.
# Here, we removed the `open_platypus_commercial` subset due to test set contamination.
# License: CC-By-4.0
# Total: 93486
# ai2-adapt-dev/daring_anteater_converted: 93486
# ------------------------------------------------------
# Codefeedback Filtered Instruction, a mix of existing datasets, for coding
# The data mix includes:
# Magicoder-OSS-Instruct
# Python code subset of ShareGPT
# Magicoder-Evol-Instruct
# Evol-Instruct-Code
# Total: 156361
# License: listed as Apache 2.0, but the individual subsets have discrepancies
# ai2-adapt-dev/codefeedback_filtered_instructions_converted: 156361
# ------------------------------------------------------
# As a replacement for codefeedback, we use Magicoder-Evol-Instruct,
# which is a decontaminated version of evol-codealpaca-v1.
# License: Apache 2.0
# Total: 110999
ai2-adapt-dev/evol_codealpaca_converted: 110999
# ------------------------------------------------------
# Table-GPT dataset, converted & synthesized, for table understanding and operations
# Total: 13222
# License: MIT
ai2-adapt-dev/table_gpt_converted: 3000
# ------------------------------------------------------
# Coconot dataset, generated by gpt4, for non-compliance
# Total: 10983
# License: ODC-BY
ai2-adapt-dev/coconot_converted: 10983
# ------------------------------------------------------
# NuminaMath-TIR, extracted and generated by gpt4, for tool-integrated math reasoning
# Total: 72441
# License: CC-By-NC-4.0
ai2-adapt-dev/numinamath_tir_converted: 72441
# ------------------------------------------------------
# Math dataset generated by Faeze using PersonaHub
# License: TBD
# Total: 149975
ai2-adapt-dev/personahub_math_v4_149975: 149975
# ------------------------------------------------------
# Instruction-following dataset generated by Faeze using PersonaHub
# License: TBD
# Total: 29980
ai2-adapt-dev/personahub_ifdata_v1_29980: 29980
# ------------------------------------------------------
# OpenMathInstruct-2 dataset
# Here, we take their gsm8k subset
# License: CC-By-4.0
# Total: 2570505
ai2-adapt-dev/openmath-2-gsm8k: 50000
# ------------------------------------------------------
# WildGuardMix dataset reprocessed by Nouha, targeting safety
# License: TBD
# Total: 86759
ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 86759
# ------------------------------------------------------
# WildJailbreak dataset reprocessed by Nouha, targeting safety
# License: TBD
# Total: 261559
ai2-adapt-dev/processed-wildjailbreak: 261559
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
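
A small, hedged sanity-check sketch for the "Total" figures quoted in the comments: it loads a few of the converted datasets and compares their train-split sizes against the expected counts. It assumes the datasets are accessible on the Hugging Face Hub and expose a "train" split:

from datasets import load_dataset

# Expected sizes copied from the "Total" comments above.
expected_totals = {
    "ai2-adapt-dev/oasst2_converted": 9091,
    "ai2-adapt-dev/coconot_converted": 10983,
    "ai2-adapt-dev/numinamath_tir_converted": 72441,
}

for repo_id, expected in expected_totals.items():
    actual = len(load_dataset(repo_id, split="train"))
    status = "OK" if actual == expected else f"MISMATCH (found {actual})"
    print(f"{repo_id}: expected {expected} -> {status}")
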