From e128c74a0465bf12e9959b3334f0c272263553e7 Mon Sep 17 00:00:00 2001
From: qbicStefanC <stefan.czemmel@uni-tuebingen.de>
Date: Thu, 27 Sep 2018 13:05:32 +0200
Subject: [PATCH] Update to Snakefile

Update to Snakefile. Workflow was tested with 1000Genomes datasets and in house datasets and performed well.
---
 Snakefile | 99 ++++++++++++++-----------------------------------------
 1 file changed, 24 insertions(+), 75 deletions(-)

diff --git a/Snakefile b/Snakefile
index 96b4b6d..ec5d8e0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -65,28 +65,28 @@ def read_design():
     pool = {}
     pools['all'] = pool
 
-    files = os.listdir(DATA)
+    files = sorted(os.listdir(DATA))
 
     for barcode in test_samples.Identifier:
-        sample_files = [file.split('.')[0] for file in files if (file.startswith(barcode) & file.endswith('.gz'))]
+        sample_files = [file.split('.')[0] for file in files if (file.startswith(barcode) & file.endswith('.fastq'))]
         if len(sample_files) == 2:
             pool[barcode] = sample_files
     return pools
 
 pools = read_design()
 
-print(pools)
+##print(pools)
 
 def fastq_per_group(wildcards):
     pool = wildcards['pool']
     group = wildcards['group']
-    print(group)
+    ##print(group)
     if pool not in pools:
         return ['/i_do_not_exist']
     if group not in pools[pool]:
         return ['/i_do_not_exist']
     names = pools[pool][group]
-    return expand(data("{name}.fastq.gz"), name=names)
+    return expand(data("{name}.fastq"), name=names)
 
 all_files = []
 for pool in pools.values():
@@ -124,26 +124,26 @@ ID="merged"
 
 rule all:
     input:
-        expand("checksums/{name}.txt", name=all_files),
+        #expand("checksums/{name}.txt", name=all_files),
         #expand("mapqc/{pool}.qcML", pool=pools.keys()),
-        expand_pools_groups(result("{pool}_{group}_readqc.qcML")),
+        #expand_pools_groups(result("{pool}_{group}_readqc.qcML")),
         expand("fastqc/{name}", name=all_files),
         expand_pools_groups("trim_fastqc/{pool}_{group}"),
         #expand_pools_groups("stampy/{pool}_{group}.sorted.bam.flagstat"),
         expand_pools_groups("map_bwa/{pool}_{group}.sorted.bam.flagstat"),
         expand_pools_groups("duplicates/{pool}_{group}.sorted.bam"),
         expand_pools_groups("clipped/{pool}_{group}.sorted.bam.flagstat"),
-        expand_pools_groups("mapqc/{pool}_{group}.qcML"),
+        #expand_pools_groups("mapqc/{pool}_{group}.qcML"),
         expand_pools_groups("fastqc_clipped/{pool}_{group}"),
         #expand("fastqc_clipped/{pool}", pool=pools.keys())
         #expand("map_fastqc/{pool}.qcML", pool=pools.keys())
         #expand("result/{name}.qcML", name=all_files)
         expand("merged_bam/{id}_sorted.bam",id=ID),
         expand("merged_bam/{id}_sorted.bam.bai",id=ID),
-        expand("annovar/annovar_in/{id}_sorted.vcf.avinput",id=ID),
-        expand("annovar/annovar_out/{id}_sorted.vcf.avoutput",id=ID),
-        expand("snpeff/{id}_sorted.snpeff.vcf",id=ID)
-
+        #expand("annovar/annovar_in/{id}_sorted.vcf.avinput",id=ID),
+        #expand("annovar/annovar_out/{id}_sorted.vcf.avoutput",id=ID),
+	#expand("snpeff/{id}_sorted.snpeff.vcf",id=ID),
+        expand("vcf_sorted/{id}_sorted.vcf",id=ID)
 
 
 rule verify_checksums:
@@ -180,7 +180,7 @@ rule read_qc:
 
 
 rule fastqc:
-    input: data("{name}.fastq.gz")
+    input: data("{name}.fastq")
     output: "fastqc/{name}"
     run:
         try:
@@ -248,6 +248,7 @@ rule sam_index:
     run:
         shell("samtools index {input}")
 
+
 rule sam_index_unsorted:
     input: "{file}.unsorted.bam"
     output: "{file}.unsorted.bam.bai"
@@ -261,7 +262,7 @@ rule map_stampy:
     output: "stampy/{pool}_{group}.unsorted.sam"
     run:
         shell("stampy.py -g {fasta} -h {fasta} "
-              "--readgroup=ID:{group} --bamkeepgoodreads -M "
+              "--readgroup=ID:{group} -t8 --bamkeepgoodreads -M "
               "{input} -o {output}".format(fasta=indexed_genome,
                                            input=input,
                                            output=output,
@@ -329,15 +330,16 @@ rule clip_overlap:
     input: bam="realigned/{name}.unsorted.bam", \
            bai="realigned/{name}.unsorted.bam.bai"
     output: "clipped/{name}.unsorted.bam"
-    run:
-        shell("BamClipOverlap -in {input.bam} -out {output}")
+    #resources: mem=200000
+    shell:
+         "BamClipOverlap -in {input.bam} -out {output}"
 
 
-rule mapqc:
-    input: bam="clipped/{name}.sorted.bam",\
-           bai="clipped/{name}.sorted.bam.bai"
-    output: "mapqc/{name}.qcML"
-    shell: "MappingQC -in {input.bam} -out {output} -wgs hg19"
+#rule mapqc:
+#    input: bam="clipped/{name}.sorted.bam",\
+#           bai="clipped/{name}.sorted.bam.bai"
+#    output: "mapqc/{name}.qcML"
+#    shell: "MappingQC -in {input.bam} -out {output} -wgs hg19"
 
 rule fastqc_clipped:
     input: "clipped/{pool}.sorted.bam"
@@ -380,6 +382,7 @@ rule freebayes:
     input: bam="merged_bam/merged_sorted.bam", \
            bai="merged_bam/merged_sorted.bam.bai"
     output: "vcf_base/merged_base.vcf"
+    #resources: mem=200000
     run:
         shell("/lustre_cfc/software/qbic/freebayes/bin/freebayes --min-base-quality 20 --min-alternate-qsum 90 -f %s.fa -b {input.bam} > {output}" % indexed_genome)
 
@@ -454,64 +457,10 @@ rule sort_variants:
         shell("cat {input} | /lustre_cfc/software/qbic/vcftools_0.1.13/bin/vcf-sort > {output}")
 
 
-#####
-## some stats on base calling, filtered calling and final output which is the coordinate sorted vcf file
-#####
-
-
 
 
 
-### annotation
 
-## 1) annovar annotation
-## module is installed by Chris
-## in /lustre_cfc/qbic/reference_genomes/ I downloaded database for annovar via annotate_variation.pl -buildver mm10 -downdb refGene mm10db_annovar/
-## then check the log file in mm10db_annovar/
-## #NOTICE: the FASTA file http://www.openbioinformatics.org/annovar/download mm10_refGeneMrna.fa.gz is not available to download but can be generated by the ANNOVAR software. PLEASE RUN THE FOLLOWING TWO COMMANDS CONSECUTIVELY TO GENERATE THE FASTA FILES:
-## annotate_variation.pl --buildver mm10 --downdb seq mm10db_annovar/mm10_seq
-## retrieve_seq_from_fasta.pl mm10db_annovar/mm10_refGene.txt -seqdir mm10db_annovar/mm10_seq -format refGene -outfile mm10db_annovar/mm10_refGeneMrna.fa
-## check under /lustre_cfc/qbic/reference_genomes/mm10db_annovar/ how it eventually looks like
 
-rule annovar_convert:
-    input: "vcf_sorted/merged_sorted.vcf"
-    output: "annovar/annovar_in/merged_sorted.vcf.avinput"
-    run:
-        try:
-            shell("convert2annovar.pl -format vcf4 --allsample --withfreq --includeinfo {input} --outfile {output}")
-        except subprocess.CalledProcessError:
-            pass
-        shell("convert2annovar.pl -format vcf4old --withfreq --includeinfo {input} --outfile {output}")
-    #### the option --allsample is supported only if --format is 'vcf4'
 
 
-rule annovar_annotate:
-    input: "annovar/annovar_in/merged_sorted.vcf.avinput"
-    output: "annovar/annovar_out/merged_sorted.vcf.avoutput"
-    run:
-        shell("annotate_variation.pl -buildver hg19 --otherinfo --infoasscore {input} > {output} --outfile {output} /lustre_cfc/qbic/reference_genomes/humandb_annovar/")
-
-
-
-
-## 2) snpeff annotation
-## installed snpeff under /qbic/software but not as module yet
-## to check what databases are available you can do:
-## java -Xmx4g -jar /lustre_cfc/software/qbic/snpEff/snpEff.jar databases | less
-## #to find mouse specific stuff:
-## java -Xmx4g -jar /lustre_cfc/software/qbic/snpEff/snpEff.jar databases | grep -i musculus
-## Note: If you are running SnpEff from a directory different than the one it was installed, you will have to specify where the config file is. This is done using the '-c' command line option:
-## in snpeff.config file changed data.dir: data.dir = /lustre_cfc/qbic/reference_genomes/snpeff
-## run example
-## java -Xmx4g -jar /lustre_cfc/software/qbic/snpEff/snpEff.jar eff -c /lustre_cfc/software/qbic/snpEff/snpEff.config -noStats -noLog -v GRCm38.79 sample_sorted.vcf > sample_sorted.snpeff.ann.vcf
-## as pointed out in the command above direct to config with -c, set a version of the genome (should be found within databases available, see above command)
-## also make sure a directory exists in data.dir named here GRCm38.79, then the db will be stored there.
-## from Marc Sturm's commands leave out -spliceRegionIntronMax as not clear how to set this here
-## #updated also the config file to add a line for Mitochondrial DNA for my genome: GRCm38.79.MT.codonTable : Vertebrate_Mitochondrial
-
-rule snpeff:
-    input: vcf="vcf_sorted/merged_sorted.vcf",\
-           file="../etc/header_add.1.txt"
-    output: "snpeff/merged_sorted.snpeff.vcf"
-    run:
-        shell("java -Xmx4g -jar /lustre_cfc/software/qbic/snpEff/snpEff.jar eff -c /lustre_cfc/software/qbic/snpEff/snpEff.config -v -cancer -cancerSamples {input.file} -noLog hg19 {input.vcf} > {output}")