-
Notifications
You must be signed in to change notification settings - Fork 6
Generation of simulated data
Luca Santuari edited this page Jul 24, 2020
·
2 revisions
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
md5sum hs37d5.fa.gz
d10eebe06c0dbbcb04253e3294d63efc hs37d5.fa.gz
Select chromosomes 10 and 12 from the reference:
echo -e '10\n12' > seq.list
seqtk subseq hs37d5.fa.gz seq.list > 10_12_hs37d5.fa
md5sum 10_12_hs37d5.fa
b80f2327fc52323bf4e0986524edfd79 10_12_hs37d5.fa
# Fetch the sv-gen workflow and pin the working copy to the v1.0.0 tag.
git clone https://github.com/GooglingTheCancerGenome/sv-gen.git
cd sv-gen
git checkout v1.0.0
# Link the two-chromosome reference into snakemake/data under the file name
# that the config refers to (data/10_12_hs37d5.fasta).
cd snakemake/data
ln -s /project/gcg/Data/simulated-data/test_chromosomes/10_12_hs37d5.fa 10_12_hs37d5.fasta
# Go up ONE level, back into snakemake/: the config path data/10_12_hs37d5.fasta
# is relative to that directory, whereas `cd ../..` would end in the repository
# root. NOTE(review): assumes the Snakefile lives in snakemake/ - confirm.
cd ..
# Dry run (-n) that also prints the shell commands (-p); nothing is executed.
snakemake -np
# Actual run: jobs are submitted to SLURM through xenon; stdout and stderr of
# snakemake itself go to smk.log and the process is sent to the background.
snakemake --use-conda --latency-wait 30 --jobs \
--cluster 'xenon scheduler slurm --location local:// submit --name smk.{rule} --inherit-env --max-run-time 4500 --working-directory . --stderr stderr-%j.log --stdout stdout-%j.log' &>smk.log&
# I/O files
# `fasta` and `seqids` are nested under `input`; without the indentation
# `input` would be null and both keys would become top-level entries.
input:
  fasta: data/10_12_hs37d5.fasta  # filepath of ref. genome (haploid)
  seqids: [10, 12]  # zero or more SeqIDs (e.g. chromosomes)
Simulation settings: coverage
# Simulation settings. Nesting restored: `read.length` and `insert.length`
# are distinct keys and must not collapse into one top-level `length`.
simulation:
  # SURVIVOR parameters
  config: survivor.cfg
  svtype:
    dup: [5000, 50, 50000]  # duplication: [count, min_len, max_len]
    inv: [5000, 50, 50000]  # inversion: ""
    tra: [5000, 50, 50000]  # translocation: ""
    indel: [10000, 50, 50000]  # insertion+deletion: ""
    invdel: [0, 600, 800]  # inversion+deletion: ""
    invdup: [0, 600, 800]  # inversion+duplication: ""
  # ART parameters
  seed: 1000
  profile: HSXt
  coverage: [30]  # [cov1, cov2, ...]
  read:
    length: [150]  # [len1, len2, ...]
  insert:
    stdev: 50  # standard deviation of the fragment length (bp)
    length: [500]  # [len1, len2, ...]
Simulation settings: read length
# Simulation settings with six read lengths. Nesting restored: `read.length`
# and `insert.length` are distinct keys and must not collapse into one
# top-level `length`.
simulation:
  # SURVIVOR parameters
  config: survivor.cfg
  svtype:
    dup: [5000, 50, 50000]  # duplication: [count, min_len, max_len]
    inv: [5000, 50, 50000]  # inversion: ""
    tra: [5000, 50, 50000]  # translocation: ""
    indel: [10000, 50, 50000]  # insertion+deletion: ""
    invdel: [0, 600, 800]  # inversion+deletion: ""
    invdup: [0, 600, 800]  # inversion+duplication: ""
  # ART parameters
  seed: 1000
  profile: HSXt
  coverage: [30]  # [cov1, cov2, ...]
  read:
    length: [36, 50, 75, 100, 150, 250]  # [len1, len2, ...]
  insert:
    stdev: 50  # standard deviation of the fragment length (bp)
    length: [500]  # [len1, len2, ...]
Simulation settings: insert size
# Simulation settings with a fixed read length and several insert sizes.
# Nesting restored: `read.length` and `insert.length` are distinct keys.
simulation:
  # SURVIVOR parameters
  config: survivor.cfg
  svtype:
    dup: [5000, 50, 50000]  # duplication: [count, min_len, max_len]
    inv: [5000, 50, 50000]  # inversion: ""
    tra: [5000, 50, 50000]  # translocation: ""
    indel: [10000, 50, 50000]  # insertion+deletion: ""
    invdel: [0, 600, 800]  # inversion+deletion: ""
    invdup: [0, 600, 800]  # inversion+duplication: ""
  # ART parameters
  seed: 1000
  profile: HSXt
  coverage: [30]  # [cov1, cov2, ...]
  read:
    length: [100]  # [len1, len2, ...]
  # Six stdev/length pairs were listed as six consecutive `insert:` mappings.
  # A mapping may not repeat a key (most parsers silently keep the last one),
  # so only one pair is active at a time; the others are kept below as
  # commented alternatives.
  # NOTE(review): presumably one run per pair, because `stdev` is a single
  # value while `length` is a list - confirm against the workflow.
  insert:
    stdev: 15  # standard deviation of the fragment length (bp)
    length: [150]  # [len1, len2, ...]
  # insert:
  #   stdev: 20  # standard deviation of the fragment length (bp)
  #   length: [200]  # [len1, len2, ...]
  # insert:
  #   stdev: 25  # standard deviation of the fragment length (bp)
  #   length: [250]  # [len1, len2, ...]
  # insert:
  #   stdev: 30  # standard deviation of the fragment length (bp)
  #   length: [300]  # [len1, len2, ...]
  # insert:
  #   stdev: 40  # standard deviation of the fragment length (bp)
  #   length: [400]  # [len1, len2, ...]
  # insert:
  #   stdev: 50  # standard deviation of the fragment length (bp)
  #   length: [500]  # [len1, len2, ...]