# config.yaml

# Configuration file for pipeline

# List of pre-built matUtils mutation-annotated trees, as `name: URL` pairs. The analysis
# is run for each tree and its results are put in a subdirectory called `results_{mat}`,
# where `{mat}` is the name of that tree.
# The most recent public trees are at
# http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/
mat_trees:
  public_2024-11-06: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2024/11/06/public-2024-11-06.all.masked.pb.gz
  public_2024-04-19: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2024/04/19/public-2024-04-19.all.masked.pb.gz
  public_2023-10-01: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2023/10/01/public-2023-10-01.all.masked.nextclade.pangolin.pb.gz
  public_2023-06-21: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2023/06/21/public-2023-06-21.all.masked.nextclade.pangolin.pb.gz
  public_2023-05-11: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2023/05/11/public-2023-05-11.all.masked.nextclade.pangolin.pb.gz
  public_2023-03-27: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2023/03/27/public-2023-03-27.all.masked.nextclade.pangolin.pb.gz
  public_2022-12-18: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2022/12/18/public-2022-12-18.all.masked.nextclade.pangolin.pb.gz
  public_2022-01-31: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2022/01/31/public-2022-01-31.all.masked.nextclade.pangolin.pb.gz

# Specify one of the mutation-annotated trees in `mat_trees` as the "current" one, which
# is linked to by default as the best current estimates. Must be a key of `mat_trees`.
current_mat: public_2024-11-06

# URLs of the reference FASTA and GTF.
# NOTE(review): this comment used to also mention the location of the spike coding
# sequence, but no such key is defined here -- confirm none is needed.
ref_fasta: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/wuhCor1.fa.gz
ref_gtf: http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/genes/ncbiGenes.gtf.gz

# Gene annotations to add to the reference GTF, which is missing ORF9b, see:
# https://github.com/jbloomlab/SARS2-mut-fitness/issues/21
# Each entry is `gene: [start, end]` (presumably 1-based inclusive nucleotide
# coordinates in the reference -- TODO confirm against the code that reads this).
add_to_ref_gtf:
  ORF9b: [28284, 28577]

# Only keep Nextstrain clades with at least this many samples in the mutation-annotated tree
min_clade_samples: 10000

# Named sample subsets: a sample is in a subset if it starts with a match to that
# subset's regex. Values are quoted so they are always read as literal strings.
sample_subsets:
  all: '.'  # regex to match anything
  USA: 'USA'
  England: 'England'

# Clade founder genotypes. For backwards-compatibility, we use the ones from Richard Neher
# when available, and those from Cornelius Roemer when the Neher file does not have a
# founder for a clade, which is the case for newer clades as he is not updating that file.
# Note that `neher_json` is pinned to a specific commit, whereas `roemer_json` points at
# the `main` branch, so the latter's contents can change over time.
clade_founders:
  neher_json: https://raw.githubusercontent.com/neherlab/SC2_variant_rates/7e738194a8c6592082f1caa9a6ca70cb68289790/data/clade_gts.json
  roemer_json: https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json
# Map Nextstrain clades (keys) to the Pango lineages (values) in `roemer_json`, from here:
# https://github.com/nextstrain/ncov/blob/master/defaults/clade_display_names.yml
roemer_nextstrain_to_pango:
  23A: XBB.1.5
  23B: XBB.1.16
  23C: CH.1.1
  23D: XBB.1.9
  23E: XBB.2.3
  23F: EG.5.1
  23G: XBB.1.5.70
  23H: HK.3
  23I: BA.2.86
  24A: JN.1
  24B: JN.1.11.1
  24C: KP.3
  24D: XDV.1
  24E: KP.3.1.1
  24F: XEC
  24G: KP.2.3
# Exclude these Nextstrain clades, for instance if no clade founder is defined for them
clades_to_exclude:
  - 21A

# For counting mutations, exclude any branch with more than this many nucleotide
# mutations, or with more than this many reversions to the reference or to the clade
# founder. These filters are applied only to the coding nucleotide changes, not to any
# non-coding ones.
max_nt_mutations: 4
max_reversions_to_ref: 1
max_reversions_to_clade_founder: 1

# Exclude nucleotide mutations from reference to clade founder and their reversions.
# These sites have higher than normal error rates because missing bases get called
# to the reference.
exclude_ref_to_founder_muts: true

# Only include sites in this range (inclusive). Currently set to exclude sites in the UTRs
# before/after the coding sequences, as these may not be well covered in the UShER tree:
site_include_range:
  start: 266  # start of ORF1ab
  end: 29674  # end of ORF10

# Exclude the following sites for all clades (set to null for no exclusions)
sites_to_exclude:
  # Sites in Table S1 of https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1009175
  - 153
  - 1149
  - 2198
  - 3145
  - 3564
  - 3778
  - 4050
  - 6255
  - 8022
  - 8790
  - 13402
  - 13947
  - 22802
  - 24389
  - 24390
  - 24933
  # Sites specified for exclusion because they have an extremely high mutation
  # frequency in some clade
  - 5629  # T5629G is much higher (~5% of all) in clade 20A than any other mutation.
  - 6851  # C6851A and its reversion are top two mutations in 20C at ~5% and ~3% of all mutations
  - 7328  # ~6% of all mutations in clade 21I, also highly mutated (~4% of all) in several other clades
  - 28095  # ~11% of all mutations in clade 20I
  - 29362  # ~30% of all mutations in clade 21C

# URL of VCF with sites masked across the entire tree
site_mask_vcf: https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/subset_vcf/problematic_sites_sarsCov2.mask.vcf

# Path to YAML file with UShER masked sites. These are sites masked per clade
# in the pre-built UShER tree.
usher_masked_sites: data/usher_masked_sites.yaml

# For analysis of 4-fold synonymous mutation spectra/rates, only keep clade subsets with
# at least this many non-excluded mutation counts
synonymous_spectra_min_counts: 5000

# ORF1ab to nsp numbering (amino-acid start and end positions in ORF1ab) from
# https://github.com/theosanderson/Codon2Nucleotide/blob/main/src/App.js
# Each entry is `name: [start, end]`; consecutive ranges abut (e.g. 180 then 181), so
# both ends appear to be inclusive. nsp11 is not listed: nsp12 starts right after nsp10.
orf1ab_to_nsps:
  nsp1: [1, 180]
  nsp2: [181, 818]
  nsp3: [819, 2763]
  nsp4: [2764, 3263]
  nsp5 (Mpro): [3264, 3569]
  nsp6: [3570, 3859]
  nsp7: [3860, 3942]
  nsp8: [3943, 4140]
  nsp9: [4141, 4253]
  nsp10: [4254, 4392]
  nsp12 (RdRp): [4393, 5324]
  nsp13: [5325, 5925]
  nsp14: [5926, 6452]
  nsp15: [6453, 6798]
  nsp16: [6799, 7096]

# How to handle nucleotide sites that overlap among genes: exclude them from
# amino-acid fitness estimates or retain those sites in the estimates. If retained, the
# estimates can be confounded by not knowing which gene the selection is acting on.
# Each entry is a pair of overlapping genes.
gene_overlaps:
  exclude:
    - [ORF1a, ORF1ab]  # 11 sites of overlap near ribosomal frameshifting location
    - [ORF7a, ORF7b]  # 1 site of overlap corresponding to ORF7a stop codon
  retain:
    - [N, ORF9b]  # ORF9b is a 291-nt out-of-frame reading frame overlapping N
  
# Pseudocount for calculating amino-acid fitnesses
fitness_pseudocount: 0.5

# Initial cutoff for the minimum expected count required to show fitness values
min_expected_count: 10

# Only plot correlation among clades that have at least this many expected counts
clade_corr_min_count: 500000

# Initial clade founder to use as "wildtype" when plotting aa fitnesses
aa_fitness_init_ref_clade: 19A

# Amino-acid fitness heatmap color scale spans **at least** this [min, max] domain
# (expanded if needed by the data)
aa_fitness_heatmap_minimal_domain: [-6, 2]

# Define common names of Nextstrain clades (Nextstrain clade -> Pango lineage or
# variant name). Entries for 23A onwards should agree with `roemer_nextstrain_to_pango`.
clade_synonyms:
  19A: B
  20A: B.1
  20B: B.1.1
  20C: B.1.367
  20E: B.1.177
  20F: D.2
  20G: B.1.2
  20I: Alpha
  20J: Gamma
  21C: Epsilon
  21F: Iota
  21I: Delta
  21J: Delta
  21K: Omicron BA.1
  21L: Omicron BA.2
  22A: Omicron BA.4
  22B: Omicron BA.5
  22C: Omicron BA.2.12.1
  22D: Omicron BA.2.75
  22E: Omicron BQ.1
  22F: XBB
  23A: XBB.1.5
  23B: XBB.1.16
  23C: CH.1.1
  23D: XBB.1.9
  23E: XBB.2.3
  23F: EG.5.1
  23G: XBB.1.5.70
  23H: HK.3
  23I: BA.2.86
  24A: JN.1
  24B: JN.1.11.1
  24C: KP.3
  24D: XDV.1
  24E: KP.3.1.1
  24F: XEC
  24G: KP.2.3

# Deep mutational scanning (DMS) datasets to correlate fitness estimates with.
# Each entry has:
#   study: link to the publication or preprint
#   description: short description of the dataset
#   gene: gene the dataset is for (the Mpro entries use the `nsp5 (Mpro)` name from
#     `orf1ab_to_nsps`)
#   url: location of the data file
#   filter_cols (optional): presumably minimum values required in the named columns
#     -- TODO confirm against the code that reads this
#   wt_seq (optional): path to a FASTA file, presumably the wildtype sequence -- TODO confirm
dms_datasets:
  dadonaite_ba1_spike:
    study: https://doi.org/10.1016/j.cell.2023.02.001
    description: spike (Dadonaite et al, 2023)
    gene: S
    url: https://raw.githubusercontent.com/dms-vep/SARS-CoV-2_Omicron_BA.1_spike_DMS_mAbs/main/results/muteffects_functional/muteffects_observed.csv
    filter_cols:
      times_seen: 3
  starr_rbd:
    study: https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1010951
    description: RBD (Starr et al, 2022)
    gene: S
    url: https://media.githubusercontent.com/media/jbloomlab/SARS-CoV-2-RBD_DMS_Omicron/f76ba2b2bec18ede9fa9da18c9ccc52389b1ba3a/results/final_variant_scores/final_variant_scores.csv
  iketani_mpro:
    study: https://www.sciencedirect.com/science/article/pii/S1931312822004024
    description: Mpro (Iketani et al, 2022)
    gene: nsp5 (Mpro)
    url: https://raw.githubusercontent.com/alejandrochavezlab/3CL_protease_DMS/dc802ee3ea9d43005ca97a28af078767fd66777c/outputs/results/normalized_to_wt_and_stop/activity_scores.csv
    wt_seq: data/Mpro.fa
  flynn_mpro_2022:
    study: https://elifesciences.org/articles/77433
    description: Mpro (Flynn et al, 2022)
    gene: nsp5 (Mpro)
    url: https://cdn.elifesciences.org/articles/77433/elife-77433-fig2-data1-v2.xlsx
    wt_seq: data/Mpro.fa
  # NOTE(review): unlike the other two Mpro datasets, this one has no `wt_seq` -- confirm
  # that is intentional
  flynn_mpro_2023:
    study: https://doi.org/10.1101/2023.03.02.530652
    description: Mpro (Flynn et al, 2023)
    gene: nsp5 (Mpro)
    url: https://www.biorxiv.org/content/biorxiv/early/2023/03/02/2023.03.02.530652/DC1/embed/media-1.xlsx

# Path to CSV of spike fitness estimates from Richard Neher, based on the approach in
# his paper: https://academic.oup.com/ve/advance-article/doi/10.1093/ve/veac113/6887176
# These are designed as an independent cross-check of the fitness estimates here.
neher_fitness: data/Neher_aa_fitness.csv

# For analysis of fixed mutations in each clade, define the mutations relative
# to the founder of this clade as the "reference"
clade_fixed_muts_ref: 19A  # this is Wuhan-Hu-1

# For analysis of mutation fitness versus terminal / non-terminal counts,
# specify the pseudocount and the initial minimum actual count
terminal_pseudocount: 0.5
terminal_min_actual_count: 5

# URL of CSV used for the dN/dS comparative analysis
dnds: https://raw.githubusercontent.com/spond/SARS-CoV-2-variation/master/windowed-sites-fel-all.csv

# Other studies to compare to. Each entry gives the URL of the study's data (`data`)
# and the name used to refer to the study (`name`).
comparator_studies:
  rodriguez_rivas_dca:
    data: https://raw.githubusercontent.com/GiancarloCroce/DCA_SARS-CoV-2/main/data/data_dca_proteome.csv
    name: Rodriguez-Rivas et al (2022)
  maher_drivers:
    # NOTE(review): unlike the other entries, this URL has no file extension; it may be
    # a supplement landing page rather than a direct download -- confirm
    data: https://www.science.org/doi/suppl/10.1126/scitranslmed.abk3445
    name: Maher et al (2022)
  thadani_learning:
    data: https://www.biorxiv.org/content/biorxiv/early/2023/04/12/2022.07.21.501023/DC4/embed/media-4.zip
    name: Thadani et al (2023)

# Settings for the GitHub Pages docs that show the plots
docs_url: https://jbloomlab.github.io/SARS2-mut-fitness  # base URL for GitHub pages site
docs_plot_annotations: data/docs_plot_annotations.yaml  # lists plots and their annotations

# Metadata fields for the aa_fitness JSON
citation: https://academic.oup.com/ve/article/9/2/vead055/7265011
authors:
  - Jesse Bloom
  - Richard Neher
source: https://github.com/jbloomlab/SARS2-mut-fitness/tree/main
# folded scalar: the lines below are joined with single spaces into one string
description: >-
  Estimated fitness for amino acid mutations in each SARS-CoV-2 gene
  from natural occurrences.