1-parts.yml
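# Each top-level key below appears to define one "part": a Hugging Face base
# model plus the hyperparameters used to train an adapter on top of it
# (LoRA, LoKr, LoHa, or prefix tuning, presumably applied via PEFT).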
src:
  info: of all creation
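# genus: Prodigy is an adaptive optimizer that estimates its own step size,
# so learning_rate is conventionally left at 1.0; target_modules covers every
# attention and MLP projection of the Qwen1.5 decoder blocks.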
genus:
  model: Qwen/Qwen1.5-0.5B-Chat
  info: it's in the DNA
  precision: b16
  training:
    type: lora
    r: 8
    lora_alpha: 16
    lora_dropout: 0
    use_rslora: False
    use_dora: False
    bias: "none"
    target_modules:
      - q_proj
      - k_proj
      - v_proj
      - o_proj
      - gate_proj
      - up_proj
      - down_proj
    gradient_checkpointing: False
    optimizer: Prodigy
    learning_rate: 1.0
    block_size: 2048
    stride: 128
    num_steps: 1000
    warmup_steps: 10
    batch_size: 1
    gradient_accumulation_steps: 16
    weight_decay: 0.1
    gradient_clip_val: 1.0
    scheduler: cosine
    val_split: 0.01
    val_interval: 100
    generate_every: 10
    save_every: 250
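# mind: rank-32 LoRA with use_rslora enabled, which scales updates by
# lora_alpha / sqrt(r) instead of lora_alpha / r (rank-stabilized LoRA).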
mind:
  model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
  info: use your heads
  context_length: 2048
  precision: b16
  generation_profile: lowpenalty
  training:
    type: lora
    r: 32
    lora_alpha: 32
    lora_dropout: 0
    use_rslora: True
    bias: "none"
    target_modules:
      - q_proj
      - k_proj
      - v_proj
      - o_proj
      - gate_proj
      - up_proj
      - down_proj
    gradient_checkpointing: True
    optimizer: AdamW
    learning_rate: 0.00022
    block_size: 1536
    stride: 128
    num_steps: 100000
    warmup_steps: 100
    batch_size: 1
    gradient_accumulation_steps: 6
    weight_decay: 0.1
    gradient_clip_val: 1.0
    scheduler: cosine
    val_split: 0.01
    val_interval: 2500
    generate_every: 10
    save_every: 250
    checkpoint_every: 250
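# heart: a LoKr (low-rank Kronecker product) adapter over RWKV's receptance,
# value, and output projections; decompose_factor: -1 presumably leaves the
# Kronecker factorization to be chosen automatically, and mode: transformer
# likely selects RWKV's parallel (GPT-style) formulation rather than its RNN mode.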
heart:
  model: RWKV/rwkv-4-430m-pile
  info: with everything you got
  mode: transformer
  adapters:
    - base
  training:
    name: base
    datasets:
      streaming:
        - c4
    type: lokr
    alpha: 32
    rank_dropout: 0.1
    module_dropout: 0.1
    decompose_both: True
    decompose_factor: -1
    target_modules:
      - receptance
      - value
      - output
    gradient_checkpointing: False
    optimizer: AdamW
    learning_rate: 0.000666
    block_size: 512
    stride: 256
    num_steps: 25000
    warmup_steps: 100
    weight_decay: 0.01
    gradient_clip_val: 1.0
    scheduler: cosine
    batch_size: 1
    gradient_accumulation_steps: 16
    val_split: 0.01
    val_interval: 1000
    generate_every: 25
    save_every: 100
    checkpoint_every: 100
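# soul: prefix tuning (128 trainable virtual tokens prepended at each layer)
# on bloom-560m; petals: True presumably runs the base model over the Petals
# distributed swarm instead of locally.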
soul:
  model: bigscience/bloom-560m
  info: because nobody has one
  petals: True
  training:
    datasets:
      streaming:
        - redpajama2
    generate_every: 50
    save_every: 100
    padding_side: left
    model_max_length: 256
    type: "prefix"
    num_virtual_tokens: 128
    learning_rate: 0.001
    block_size: 256
    num_steps: 33333
    warmup_steps: 250
    weight_decay: 0.01
    gradient_clip_val: 1.23
    scheduler: cosine
    batch_size: 6
    regen: False
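# wisdom: LoRA over Mamba's SSM projections (in_proj, x_proj, dt_proj,
# out_proj), with both rsLoRA scaling and DoRA (weight-decomposed LoRA)
# enabled; the commented-out "embeddings" entry would also adapt the
# embedding layer.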
wisdom:
  info: of the masses
  model: state-spaces/mamba-130m-hf
  training:
    type: lora
    r: 8
    lora_alpha: 16
    lora_dropout: 0
    use_rslora: True
    use_dora: True
    bias: "none"
    target_modules:
      # - embeddings
      - in_proj
      - x_proj
      - dt_proj
      - out_proj
    gradient_checkpointing: True
    optimizer: AdamW
    generate_every: 100
    learning_rate: 0.001
    block_size: 2048
    stride: 0
    num_steps: 1000
    warmup_steps: 10
    weight_decay: 0.0001
    gradient_clip_val: 1.0
    scheduler: cosine
    batch_size: 1
    gradient_accumulation_steps: 16
    val_split: 0.05
    val_interval: 100
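# envy: OPT names its attention output projection "out_proj" and its MLP
# layers "fc1"/"fc2"; bias: "all" makes bias terms trainable alongside the
# LoRA weights, and swa_learning_rate suggests stochastic weight averaging
# is applied during training.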
envy:
  model: facebook/opt-350m
  info: a simple transformer
  generation_profile: lowpenalty
  training:
    regen: False
    type: lora
    r: 16
    lora_alpha: 24
    lora_dropout: 0.01
    bias: "all"
    use_rslora: False
    use_dora: False
    target_modules:
      - q_proj
      - k_proj
      - v_proj
      - out_proj
      - fc1
      - fc2
    gradient_checkpointing: True
    optimizer: AdamW
    generate_every: 250
    learning_rate: 0.001
    swa_learning_rate: 0.01
    block_size: 2048
    stride: 256
    num_steps: 10000
    warmup_steps: 100
    weight_decay: 0.001
    gradient_clip_val: 1.0
    scheduler: cosine
    batch_size: 1
    gradient_accumulation_steps: 27
    val_split: 0.05
    val_interval: 10000
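# chaos: Pythia (GPT-NeoX) block projections plus the embed_out LM head; the
# model is held at precision 32 while training runs at 16. safeguard_warmup
# and bias_correction are presumably passed through to the Prodigy optimizer.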
chaos:
  model: EleutherAI/pythia-410m-deduped-v0
  info: a monstrosity
  generation_profile: chaotic
  precision: 32
  training:
    precision: 16
    type: lora
    r: 64
    lora_alpha: 16
    lora_dropout: 0.1
    use_rslora: True
    use_dora: True
    bias: "all"
    target_modules:
      - query_key_value
      - dense
      - dense_h_to_4h
      - dense_4h_to_h
      - embed_out
    gradient_checkpointing: True
    optimizer: Prodigy
    learning_rate: 1.0
    weight_decay: 0.001
    safeguard_warmup: True
    bias_correction: True
    block_size: 1024
    stride: 0
    num_steps: 20000
    warmup_steps: 100
    gradient_clip_val: 1.0
    scheduler: cosine
    batch_size: 1
    gradient_accumulation_steps: 16
    val_split: 0.001
    val_interval: 1000
    generate_every: 10
    save_every: 100
    checkpoint_every: 100
    datasets:
      streaming:
        - instruct
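# malice: only the adapter type is set here; the remaining hyperparameters
# presumably fall back to the project's defaults.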
malice:
  model: cerebras/Cerebras-GPT-111M
  info: a GPT-2 clone
  generation_profile: lowpenalty
  training:
    type: lora
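# pain: init_lora_weights: olora requests OLoRA initialization of the adapter;
# use_gc and adanorm look like options of the Lion optimizer implementation in
# use (gradient centralization and adaptive norm), and token_map pins both
# bos_token and eos_token to id 0.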
pain:
  model: HuggingFaceTB/SmolLM-360M-Instruct
  info: a derelict herd
  precision: 32
  generation_profile: predictable
  token_map:
    bos_token: 0
    eos_token: 0
  training:
    precision: b16
    type: lora
    r: 64
    lora_alpha: 12
    lora_dropout: 0.1
    use_rslora: True
    use_dora: True
    bias: "none"
    init_lora_weights: olora
    target_modules:
      - q_proj
      - k_proj
      - v_proj
      - o_proj
      - up_proj
      - gate_proj
      - down_proj
    gradient_checkpointing: True
    optimizer: Lion
    scheduler: cosine
    learning_rate: 0.0001
    weight_decay: 0.001
    use_gc: True
    adanorm: True
    block_size: 512
    stride: 0
    num_steps: 40000
    warmup_steps: 10
    gradient_clip_val: 1.0
    batch_size: 1
    gradient_accumulation_steps: 16
    val_split: 0.01
    val_interval: 2500
    generate_every: 25
    save_every: 500
    checkpoint_every: 500
    datasets:
      streaming:
        - instruct
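# rot: OpenELM checkpoints ship without a tokenizer, so a Llama-2 tokenizer is
# pointed to explicitly (here via the NousResearch mirror), both at the part
# level and inside training.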
rot:
  model: apple/OpenELM-270M-Instruct
  info: in the core
  precision: 32
  tokenizer: "NousResearch/Llama-2-7b-hf"
  training:
    type: lora
    r: 64
    lora_alpha: 12
    lora_dropout: 0.1
    use_rslora: True
    use_dora: True
    bias: "none"
    target_modules:
      - qkv_proj
      - out_proj
      - proj_1
      - proj_2
    tokenizer: "NousResearch/Llama-2-7b-hf"
    gradient_checkpointing: True
    optimizer: Lion
    scheduler: cosine
    learning_rate: 0.0001
    weight_decay: 0.001
    block_size: 512
    stride: 0
    num_steps: 40000
    warmup_steps: 0
    batch_size: 1
    gradient_accumulation_steps: 16
    gradient_clip_val: 1.0
    val_split: 0.01
    val_interval: 2500
    generate_every: 25
    save_every: 500
    checkpoint_every: 500
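# sick: type: standard presumably means full fine-tuning rather than an
# adapter; GrokFastAdamW applies the GrokFast slow-gradient filter, here
# presumably engaged only after grokfast_after_step optimization steps.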
sick:
  model: UNSAFE/Mixtress-135M
  info: and tired
  generation_profile: predictable
  training:
    type: standard
    gradient_checkpointing: True
    optimizer: GrokFastAdamW
    scheduler: cosine
    learning_rate: 0.001
    weight_decay: 0.1
    block_size: 768
    stride: 0
    num_steps: 2000
    warmup_steps: 200
    grokfast_after_step: 100
    batch_size: 1
    gradient_accumulation_steps: 16
    gradient_clip_val: 1.0
    val_split: 0.01
    val_interval: 250
    generate_every: 25
    save_every: 500
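# toe: a LoHa (low-rank Hadamard product) adapter, with everything else left
# at defaults.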
toe:
  info: nailed to the foot
  model: EleutherAI/gpt-neo-125M
  training:
    type: "loha"