Skip to content

Generation of simulated data

Luca Santuari edited this page Jul 24, 2020 · 2 revisions
# Download the gzipped hs37d5 reference assembly from the 1000 Genomes FTP site.
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
# Print the MD5 checksum of the download; the expected output is the line below.
md5sum hs37d5.fa.gz
d10eebe06c0dbbcb04253e3294d63efc  hs37d5.fa.gz

Select chromosomes 10 and 12:

# Write the wanted sequence IDs, one per line, to seq.list.
printf '%s\n' 10 12 > seq.list
# Extract the listed sequences from the reference into a new FASTA file.
seqtk subseq hs37d5.fa.gz seq.list > 10_12_hs37d5.fa
# Print the MD5 checksum of the subset; the expected output is the line below.
md5sum 10_12_hs37d5.fa
b80f2327fc52323bf4e0986524edfd79  10_12_hs37d5.fa
# Fetch the sv-gen workflow and pin it to release v1.0.0.
git clone https://github.com/GooglingTheCancerGenome/sv-gen.git
cd sv-gen
git checkout v1.0.0
# Link the two-chromosome reference into the workflow's data directory under
# the name the config refers to (data/10_12_hs37d5.fasta). The link target is
# a site-specific absolute path; point it at wherever 10_12_hs37d5.fa lives.
cd snakemake/data
ln -s /project/gcg/Data/simulated-data/test_chromosomes/10_12_hs37d5.fa 10_12_hs37d5.fasta
# Go up ONE level, to sv-gen/snakemake, so that the relative path
# data/10_12_hs37d5.fasta resolves to the link created above.
# (Going up two levels would land in sv-gen/, where that path does not exist.)
cd ..
# Dry run: list the planned jobs and their shell commands without executing.
snakemake -np
# Real run: jobs are submitted to SLURM through the Xenon CLI (one job per
# rule, named smk.<rule>). Snakemake itself runs in the background, with its
# stdout and stderr both collected in smk.log.
snakemake \
  --use-conda \
  --latency-wait 30 \
  --jobs \
  --cluster 'xenon scheduler slurm --location local:// submit --name smk.{rule} --inherit-env --max-run-time 4500 --working-directory . --stderr stderr-%j.log --stdout stdout-%j.log' \
  &> smk.log &
# I/O files
input:
  # filepath of ref. genome (haploid)
  fasta: data/10_12_hs37d5.fasta
  # zero or more SeqIDs (e.g. chromosomes)
  seqids:
    - 10
    - 12

coverage:

simulation:
  # SURVIVOR parameters
  config: survivor.cfg
  # every SV type below is given as [count, min_len, max_len]
  svtype:
    dup: [5000, 50, 50000]     # duplication
    inv: [5000, 50, 50000]     # inversion
    tra: [5000, 50, 50000]     # translocation
    indel: [10000, 50, 50000]  # insertion+deletion
    invdel: [0, 600, 800]      # inversion+deletion
    invdup: [0, 600, 800]      # inversion+duplication
  # ART parameters
  seed: 1000
  profile: HSXt
  # one entry per coverage value: [cov1, cov2, ...]
  coverage: [30]
  read:
    # one entry per read length: [len1, len2, ...]
    length: [150]
  insert:
    # standard deviation of the fragment length (bp)
    stdev: 50
    # one entry per fragment length: [len1, len2, ...]
    length: [500]

read length:

simulation:
  # SURVIVOR parameters
  config: survivor.cfg
  # every SV type below is given as [count, min_len, max_len]
  svtype:
    dup: [5000, 50, 50000]     # duplication
    inv: [5000, 50, 50000]     # inversion
    tra: [5000, 50, 50000]     # translocation
    indel: [10000, 50, 50000]  # insertion+deletion
    invdel: [0, 600, 800]      # inversion+deletion
    invdup: [0, 600, 800]      # inversion+duplication
  # ART parameters
  seed: 1000
  profile: HSXt
  # one entry per coverage value: [cov1, cov2, ...]
  coverage: [30]
  read:
    # one entry per read length: [len1, len2, ...]
    length: [36, 50, 75, 100, 150, 250]
  insert:
    # standard deviation of the fragment length (bp)
    stdev: 50
    # one entry per fragment length: [len1, len2, ...]
    length: [500]

insert size:

simulation:
  # SURVIVOR parameters
  config: survivor.cfg
  svtype:
    dup: [5000, 50, 50000]     # duplication: [count, min_len, max_len]
    inv: [5000, 50, 50000]     # inversion: [count, min_len, max_len]
    tra: [5000, 50, 50000]     # translocation: [count, min_len, max_len]
    indel: [10000, 50, 50000]  # insertion+deletion: [count, min_len, max_len]
    invdel: [0, 600, 800]      # inversion+deletion: [count, min_len, max_len]
    invdup: [0, 600, 800]      # inversion+duplication: [count, min_len, max_len]
  # ART parameters
  seed: 1000
  profile: HSXt
  coverage: [30]  # [cov1, cov2, ...]
  read:
    length: [100]  # [len1, len2, ...]
  # Insert-size series: six (stdev, length) settings. A YAML mapping may hold
  # only ONE `insert` key; repeating the key is invalid, and most parsers
  # silently keep just the last occurrence. Because `stdev` is a single value
  # while it changes together with `length` here, the settings are presumably
  # meant to be applied one at a time: keep exactly one `insert` block active
  # per run and swap in the next commented alternative for the following run.
  insert:
    stdev: 15      # standard deviation of the fragment length (bp)
    length: [150]  # [len1, len2, ...]

  # insert:
  #   stdev: 20      # standard deviation of the fragment length (bp)
  #   length: [200]  # [len1, len2, ...]

  # insert:
  #   stdev: 25      # standard deviation of the fragment length (bp)
  #   length: [250]  # [len1, len2, ...]

  # insert:
  #   stdev: 30      # standard deviation of the fragment length (bp)
  #   length: [300]  # [len1, len2, ...]

  # insert:
  #   stdev: 40      # standard deviation of the fragment length (bp)
  #   length: [400]  # [len1, len2, ...]

  # insert:
  #   stdev: 50      # standard deviation of the fragment length (bp)
  #   length: [500]  # [len1, len2, ...]