diff --git a/config_hpf.yaml b/GRCh37/config_hpf.yaml
similarity index 92%
rename from config_hpf.yaml
rename to GRCh37/config_hpf.yaml
index f70358c..ea53b2d 100644
--- a/config_hpf.yaml
+++ b/GRCh37/config_hpf.yaml
@@ -7,7 +7,7 @@ run:
   hpo: "" # five-column TSV with HPO terms; leave this string empty if there are no hpo terms
   flank: 100000
   gatk: "gatk"
-  pipeline: "wes" #either wes (exomes) or wgs (genomes) or annot (to annotate and produce reports for an input vcf) or mity (to generate mitochondrial reports)
+  pipeline: "wes" #either wes (exomes) or wgs (genomes) or annot (to annotate and produce reports for an input vcf)
   minio: ""
   PT_credentials: ""
@@ -82,11 +82,11 @@ qc:
 annotation:
   cre.vcfanno:
-    conf: "~/crg2/vcfanno/cre.vcfanno.conf"
+    conf: "~/crg2/vcfanno/cre.vcfanno_GRCh37.conf"
     lua_script: "~/crg2/vcfanno/cre.vcfanno.lua"
-    base_path: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/variation/"
+    base_path: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37"
   vcfanno:
-    conf: "~/crg2/vcfanno/crg.vcfanno.conf"
+    conf: "~/crg2/vcfanno/crg.vcfanno_GRCh37.conf"
     lua_script: "~/crg2/vcfanno/crg.vcfanno.lua"
     base_path: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/variation/"
   mt.vcfanno:
@@ -102,6 +102,7 @@ annotation:
     intron_bed: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/SVScore/refGene.introns.bed.gz"
     cadd: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/SVScore/whole_genome_SNVs.tsv.gz"
   svreport:
+    reference: GRCh37
     hgmd: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/HGMD_2018/hgmd_pro.db"
     protein_coding_genes: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/grch37.p13.ensembl.sorted.protein.coding.genes.bed"
     exon_bed: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/exons/hg19_UCSC_exons_canonical.bed"
@@ -121,7 +122,7 @@ annotation:
     database_path: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/variation/"
 
 validation:
-  benchmark: "benchmark_hpf.tsv"
+  benchmark: "benchmark.tsv"
 
 params:
   bwa:
@@ -135,9 +136,6 @@ params:
     #BaseRecalibrator: "--interval-set-rule INTERSECTION -U LENIENT-VCF-PROCESSING --read-filter BadCigar --read-filter NotzPrimaryAlignment"
     GenotypeGVCFs: ""
     VariantRecalibrator: ""
-    Mutect2:
-      gnomad_germline: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/Mutect2/af-only-gnomad.raw.sites.vcf"
-      FilterMutectCalls: ""
   gatk3:
     java_opts: "-Xms500m -Xmx9555m"
     HaplotypeCaller: "-drf DuplicateRead --interval_set_rule INTERSECTION --pair_hmm_implementation VECTOR_LOGLESS_CACHING -ploidy 2 -U LENIENT_VCF_PROCESSING --read_filter BadCigar --read_filter NotPrimaryAlignment"
@@ -173,14 +171,10 @@ params:
   bcftools:
     mpileup: "-a DP -a AD "
     call: " -m -v "
-    # -F x sets the output filter to PASS if any of the variant filters is PASS in sample VCFs to be merged
-    merge: "-F x"
-  freebayes:
-    call: " --genotype-qualities --strict-vcf --ploidy 2 --no-partial-observations --min-repeat-entropy 1 "
+  freebayes: " --genotype-qualities --strict-vcf --ploidy 2 --no-partial-observations --min-repeat-entropy 1 "
   platypus: "--filterDuplicates=0"
   rtg-tools:
     java_opts: "-Xmx20g"
     vcfeval:
       sdf: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/rtg-tools/GRch37_SDF/"
-  vcfsubset:
-    java_opts: "-Xmx2048m"
+
diff --git a/README.md b/README.md
index 0968bff..78d92f3 100644
--- a/README.md
+++ b/README.md
@@ -5,14 +5,16 @@ Clinical research pipeline for exploring variants in whole genome (WGS) and exom
 
-crg2 is a research pipeline aimed at discovering clinically relevant variants (SNVs, indels, SVs, STRs) in whole genome and exome sequencing data.
+crg2 is a research pipeline aimed at discovering clinically relevant variants in whole genome and exome sequencing data. It aims to provide reproducible results, to be computationally efficient, and to be transparent in its workflow.
 
 crg2 uses Snakemake and Conda to manage jobs and software dependencies.
 
 ## Installation instructions
 
-1. A note about the config files: the values in config_hpf.yaml refer to tool/filepaths on SickKid's HPC4Health tenancy, while the values in config_cheo_ri.yaml refer to tool/filepaths on CHEO's HPC4Health tenancy. For simplicity, we refer below only to config_hpf.yaml; if you are running crg2 on CHEO's tenancy, use config_cheo_ri.yaml instead.
+A note about the config files: the values in config_hpf.yaml refer to tool/filepaths on SickKids' HPC4Health tenancy (hpf), while the values in config_cheo_ri.yaml refer to tool/filepaths on CHEO's HPC4Health tenancy. For simplicity, we refer below only to config_hpf.yaml; if you are running crg2 on CHEO's tenancy, use config_cheo_ri.yaml instead.
+
+1. If you are running crg2 on the hpf, skip to the 'Running the pipeline' section.
 2. Download and setup Anaconda: https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html
 3. Install Snakemake 5.10.0 via Conda: https://snakemake.readthedocs.io/en/stable/getting_started/installation.html
 4. Git clone this repo, [crg](https://github.com/ccmbioinfo/crg), and [cre](https://github.com/ccmbioinfo/cre). crg2 uses various scripts from the other two repos to generate final reports.
@@ -64,22 +66,124 @@ Make sure to replace ```~/crg2-conda``` with the path made in step 4. This will
 > install.packages("dbscan")
 > install.packages("doSNOW")
 ```
+13. To generate a mobile element insertion report (WGS only), MELT installation is required and some paths must be added to config_hpf.yaml:
+- Download MELT from https://melt.igs.umaryland.edu/downloads.php.
+- Unpack the .tar.gz file: ```tar zxf MELTvX.X.tar.gz ```
+  This should create a MELTvX.X directory in your current directory.
+- In config_hpf.yaml, add the path to the MELTvX.X directory to `config["tools"]["melt"]`.
+- Generate a transposon reference text file containing a list of full paths to the mobile element references. For example:
+  ```ls /MELTv2.2.2/me_refs/1KGP_Hg19/*_MELT.zip > transposon_file_list.txt ```
+- In config_hpf.yaml, add the full path to the transposon reference text file to `config["ref"]["melt_element_ref"]`.
+- In config_hpf.yaml, add the full path to hg19.genes.bed, which contains the gene annotation for the FASTA reference, to `config["annotation"]["melt"]["genes"]`. The file can be found at:
+  ```/MELTv2.2.2/add_bed_files/1KGP_Hg19/hg19.genes.bed ```
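+
+Taken together, the MELT-related entries in config_hpf.yaml would look roughly like the sketch below; the MELT version and install paths are illustrative, not canonical:
+```
+# hypothetical excerpt of config_hpf.yaml; adjust paths to your MELT install
+tools:
+  melt: "/path/to/MELTv2.2.2"
+ref:
+  melt_element_ref: "/path/to/transposon_file_list.txt"
+annotation:
+  melt:
+    genes: "/path/to/MELTv2.2.2/add_bed_files/1KGP_Hg19/hg19.genes.bed"
+```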
+
+## Pipeline details
+
+### Pre-calling steps
+1. Map fastqs to the human decoy genome GRCh37d5
+
+2. Picard MarkDuplicates, but don't remove reads
+
+3. GATK4 base recalibration
+
+4. Remove reads mapped to decoy chromosomes
+
+### WGS: SNV
+1. Call SNVs and generate gVCFs
+
+2. Merge gVCFs and perform joint genotyping
+
+3. Filter against GATK best practices filters
+
+4. Decompose multiallelics, sort and uniq the filtered VCF using vt (see the sketch after this list)
+
+5. Annotate using vcfanno and VEP
+
+6. Generate a gemini db using vcf2db.py
+
+7. Generate a cre report using cre.sh
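+
+A minimal sketch of the vt step (step 4), assuming the GATK-filtered VCF is named filtered.vcf.gz; the exact file names and flags live in crg2's rules:
+```
+# decompose multiallelics, then sort and uniq the resulting VCF
+vt decompose -s filtered.vcf.gz \
+    | vt sort - \
+    | vt uniq - -o filtered.decomposed.sorted.uniq.vcf.gz
+```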
+
+### WES: SNV
+1. Call variants using GATK, Freebayes, Platypus, and SAMTools.
+
+2. Apply caller-specific filters and retain PASS variants
+
+3. Decompose multiallelics, sort and uniq the filtered VCF using vt
+
+4. Retain variants called by GATK or by 2 other callers; annotate caller info in the VCF with INFO/CALLER and INFO/NUMCALLS (see the sketch after this list)
+
+5. Annotate using vcfanno and VEP
+
+6. Generate a gemini db using vcf2db.py
+
+7. Generate a cre report using cre.sh
+
+8. Repeat steps 1-7 using GATK Mutect2 variant calls to generate a mosaic variant report
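+
+To make the retention rule in step 4 concrete, an equivalent filter written with bcftools would look like the sketch below; this is illustrative only -- crg2 implements the step in its own rules, and the exact CALLER tag values may differ:
+```
+# keep variants called by GATK, or by at least two callers overall
+bcftools view -i 'INFO/CALLER ~ "gatk" || INFO/NUMCALLS >= 2' \
+    annotated.vcf.gz -Oz -o consensus.vcf.gz
+```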
+
+### WGS: SV (filtered and unfiltered)
+1. Call SVs using Manta, Smoove and Wham
+
+2. Merge calls using MetaSV
+
+3. Annotate VCF using snpEff and SVScore
+
+4. Split multi-sample VCF into individual sample VCFs
+
+5. Generate an annotated report (produces filtered, i.e. SVs called by at least two callers, and unfiltered reports)
+
+### WGS: BND
+1. Filter Manta SV calls to exclude all variants but BNDs
+
+2. Annotate VCF using snpEff
+
+3. Generate an annotated BND report
+
+
+### WGS: STR
+
+A. ExpansionHunter: known repeat locations
+
+    1. Identify repeat expansions in sample BAM/CRAMs
+    2. Annotate repeats with disease threshold, gene name, and repeat sizes from 1000 Genomes (mean & median)
+    3. Generate per-family report as an Excel file
+
+B. ExpansionHunterDenovo: de novo repeats
+
+    1. Identify de novo repeats in sample BAM/CRAMs
+    2. Combine individual JSONs from the current family and 1000 Genomes into a multi-sample TSV
+    3. Run DBSCAN clustering to identify outlier repeats
+    4. Annotate with gnomAD, OMIM, ANNOVAR
+    5. Generate per-family report as an Excel file
+
+### WGS: Mitochondrial variants
+
+  1. Call MT variants using mity call
+  2. Normalize MT variants using mity normalize
+  3. Annotate MT variants using vcfanno
+  4. Generate mitochondrial variant report
+
+### WGS: MELT mobile element insertions (MEIs)
+  1. Preprocess CRAMs
+  2. Call MEIs in individuals
+  3. Group and genotype MEIs across a family
+  4. Filter out ac0 calls
+  5. Annotate variants
+  6. Generate MELT report
+
 ## Running the pipeline
-1. Make a folder in a directory with sufficient space. Copy over the template files samples.tsv, units.tsv, config_hpf.yaml, crg2/dnaseq_cluster.pbs, pbs_profile/pbs_config.yaml .
-You may need to re-copy config_hpf.yaml and pbs_config.yaml if the files were recently updated in the repo from previous crg2 runs. Note that 'pbs_config.yaml' is for submitting each rule as cluster jobs, so ignore this if not running on cluster.
+1. Make a folder in a directory with sufficient space. Copy over the template files crg2/samples.tsv, crg2/units.tsv, crg2/config_hpf.yaml, crg2/dnaseq_slurm_hpf.sh, and crg2/slurm_profile/slurm-config.yaml.
+You may need to re-copy config_hpf.yaml and slurm-config.yaml if the files were recently updated in the repo from previous crg2 runs. Note that 'slurm-config.yaml' is for submitting each rule as a cluster job, so ignore it if not running on a cluster.
 ```
 mkdir NA12878
-cp crg2/samples.tsv crg2/units.tsv crg2/config_hpf.yaml crg2/dnaseq_cluster.pbs crg2/pbs_profile/pbs_config.yaml NA12878
+cp crg2/samples.tsv crg2/units.tsv crg2/config_hpf.yaml crg2/dnaseq_slurm_hpf.sh crg2/slurm_profile/slurm-config.yaml NA12878
 cd NA12878
 ```
 2. Set up pipeline run
   * Reconfigure 'samples.tsv', 'units.tsv' to reflect sample names and input files.
   * If you are using cram files as input, make sure that you are specifying the correct cram references in 'config_hpf.yaml' - `old_cram_ref` refers to the original reference the cram was aligned to, and `new_cram_ref` refers to the new reference used to convert the cram to fastq. You can get the `old_cram_ref` from the cram file header by running `samtools view -H file_name.cram` (see the sketch after this list). If there are multiple fastqs per read end, these must be comma-delimited within the units.tsv file; an example follows the units.tsv template below.
-  * Modify 'dnaseq_cluster.pbs' to reflect `configfile` location (working directory path, in this case NA12878).
   * Modify 'config_hpf.yaml':
     * change `project` to refer to the family ID (here, NA12878).
-    * set `pipeline` to `wes`, `wgs` or `annot` for exome sequences, whole genome sequences, or to simply annotate a VCF respectively.
+    * set `pipeline` to `wes`, `wgs`, `annot`, or `mity` to analyze exomes, analyze whole genomes, simply annotate an input VCF, or generate mitochondrial reports, respectively.
     * inclusion of a panel bed file (`panel`) or HPO file (`hpo`) will generate 2 SNV reports with all variants falling within these regions, one of which includes variants in flanking regions as specified by `flank`.
     * inclusion of a `ped` file with parents and a proband(s) will allow generation of a genome-wide de novo report if `pipeline` is `wgs`.
     * `minio` refers to the path of the file system mount that backs MinIO in the CHEO HPC4Health tenancy. Exome reports (and coverage reports, if duplication percentage is >20%) will be copied to this path if specified.
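+
+As a sketch of the cram header check mentioned above, the original reference is recorded in the UR field of the @SQ header lines (the file name and header values here are illustrative):
+```
+samtools view -H NA12878.cram | grep '^@SQ' | head -n 1
+# @SQ  SN:1  LN:249250621  ...  UR:/mnt/hnas/reference/hg19/hg19.fa
+# the UR path is the value to use for old_cram_ref
+```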
@@ -93,7 +197,7 @@ NA12878
 units.tsv
 ```
 sample	platform	fq1	fq2	bam	cram
-NA12878	ILLUMINA	/hpf/largeprojects/ccm_dccforge/dccdipg/Common/NA12878/NA12878.bam_1.fq	/hpf/largeprojects/ccm_dccforge/dccdipg/Common/NA12878/NA12878.bam_2.fq
+NA12878	ILLUMINA	/hpf/largeprojects/ccmbio/GIAB_benchmark_datasets/hg19/WGS/NA12878/RMNISTHS_30xdownsample.bam
 ```
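+
+If a sample has more than one fastq pair (e.g. per-lane files), list them comma-delimited in the fq1 and fq2 columns; the paths below are hypothetical:
+```
+sample	platform	fq1	fq2	bam	cram
+NA12878	ILLUMINA	/path/L001_R1.fq.gz,/path/L002_R1.fq.gz	/path/L001_R2.fq.gz,/path/L002_R2.fq.gz
+```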
 
 config_hpf.yaml
 ```
@@ -107,8 +211,9 @@ run:
   hpo: "" # five-column TSV with HPO terms; leave this string empty if there are no hpo terms
   flank: 100000
   gatk: "gatk"
-  pipeline: "wes" #either wes (exomes) or wgs (genomes) or annot (to annotate and produce reports for an input vcf)
+  pipeline: "wgs" #either wes (exomes) or wgs (genomes) or annot (to annotate and produce reports for an input vcf) or mity (to generate mitochondrial reports)
   minio: ""
+  PT_credentials: ""
 ...
 ```
 
@@ -120,55 +225,69 @@ run:
 5.10.0
 ```
 
-4. Test that the pipeline will run by adding the flag "-n" to the command in dnaseq_cluster.pbs.
+4. Test that the pipeline will run by adding the flag "-n" to the command in dnaseq_slurm_hpf.sh and running it.
 ```
-(snakemake) [dennis.kao@qlogin5 crg2]$ snakemake --use-conda -s ~/crg2/Snakefile --cores 4 --conda-prefix /hpf/largeprojects/ccm_dccforge/dccdipg/Common/snakemake --profile ~/crg2/pbs_profile --configfile config_hpf.yaml -n
+(snakemake) [dennis.kao@qlogin5 crg2]$ sh dnaseq_slurm_hpf.sh
 Building DAG of jobs...
 Job counts:
 	count	jobs
+	1	EH
+	1	EH_report
+	1	EHdn
+	1	EHdn_report
 	1	all
-	86	call_variants
-	86	combine_calls
-	86	genotype_variants
+	1	allsnvreport
+	1	bam_to_cram
+	1	bcftools_stats
+	1	bgzip
+	25	call_variants
+	25	combine_calls
+	1	fastq_screen
+	1	fastqc
+	25	genotype_variants
 	2	hard_filter_calls
+	1	input_prep
+	1	manta
 	1	map_reads
+	1	mark_duplicates
+	1	md5
 	1	merge_calls
 	1	merge_variants
+	1	metasv
+	1	mito_vcfanno
+	1	mity_call
+	1	mity_normalise
+	1	mity_report
+	1	mosdepth
+	1	multiqc
+	1	pass
+	1	peddy
+	1	qualimap
 	1	recalibrate_base_qualities
+	1	remove_decoy
+	3	samtools_index
+	1	samtools_index_cram
+	1	samtools_stats
 	2	select_calls
-	1	snvreport
+	1	smoove
+	2	snpeff
+	1	subset
+	1	svreport
+	2	svscore
+	1	tabix
 	1	vcf2db
 	1	vcfanno
 	1	vep
+	1	verifybamid2
 	1	vt
-	272
-
-[Tue May 12 10:56:12 2020]
-rule map_reads:
-    input: /hpf/largeprojects/ccm_dccforge/dccdipg/Common/NA12878/NA12878.bam_1.fq, /hpf/largeprojects/ccm_dccforge/dccdipg/Common/NA12878/NA12878.bam_2.fq
-    output: mapped/NA12878-1.sorted.bam
-    log: logs/bwa_mem/NA12878-1.log
-    jobid: 271
-    wildcards: sample=NA12878, unit=1
-    threads: 4
-
-[Tue May 12 10:56:12 2020]
-rule recalibrate_base_qualities:
-    input: mapped/NA12878-1.sorted.bam, /hpf/largeprojects/ccm_dccforge/dccdipg/Common/genomes/GRCh37d5/GRCh37d5.fa, /hpf/largeprojects/ccmbio/naumenko/tools/bcbio/gemini_data/dbsnp.b147.20160601.tidy.vcf.gz
-    output: recal/NA12878-1.bam
-    log: logs/gatk/bqsr/NA12878-1.log
-    jobid: 270
-    wildcards: sample=NA12878, unit=1
-
-[Tue May 12 10:56:12 2020]
-rule call_variants:
-    input: recal/NA12878-1.bam, /hpf/largeprojects/ccm_dccforge/dccdipg/Common/genomes/GRCh37d5/GRCh37d5.fa, /hpf/largeprojects/ccmbio/naumenko/tools/bcbio/gemini_data/dbsnp.b147.20160601.tidy.vcf.gz
-    output: called/NA12878.GL000204.1.g.vcf.gz
-    log: logs/gatk/haplotypecaller/NA12878.GL000204.1.log
-    jobid: 239
-    wildcards: sample=NA12878, contig=GL000204.1
-
+	1	wham
+	1	write_version
+	129
+
+ [rule inputs and outputs removed for brevity]
+
+This was a dry-run (flag -n). The order of jobs does not reflect the order of execution.
 ...
 ```
 
@@ -190,17 +309,46 @@ rule call_variants:
 
 `dnaseq_slurm_api.sh` is called by [Stager](https://stager.genomics4rd.ca/) when exome analyses are requested. It automatically sets up (`exome_setup_stager.py`) and kicks off the crg2 WES pipeline via the slurm API using linked files that have been uploaded to MinIO.
-
+
+## Reports
+
+Column descriptions and more info on how variants are filtered can be found in the [CCM SharePoint](https://sickkidsca.sharepoint.com/:f:/r/sites/thecenterforcomputationalmedicineworkspace/Shared%20Documents/C4Rare-updates/Report_documentation/Documentation?csf=1&web=1&e=nSMt2p).
+
+The WES pipeline generates 5 reports:
+
+1. wes.regular - report on coding SNVs in exonic regions
+
+2. wes.synonymous - report on synonymous SNVs in exonic regions
+
+3. clinical.wes.regular - same as wes.regular but with more stringent filters
+
+4. clinical.wes.synonymous - same as wes.synonymous but with more stringent filters
+
+5. wes.mosaic - putative mosaic variants
+
+The WGS pipeline generates up to 11 reports:
+
 The SNV reports can be found in the directories:
   - report/coding/{PROJECT_ID}/{PROJECT_ID}.\*wes\*.csv
   - report/panel/{PROJECT_ID}/{PROJECT_ID}.\*wgs\*.csv
   - report/panel-flank/{PROJECT_ID}/{PROJECT_ID}.\*wgs\*.csv
-The SV reports can be found in the directory:
- - report/sv/{PROJECt_ID}.wgs.{VER}.{DATE}.tsv.
+  - report/denovo/{PROJECT_ID}/{PROJECT_ID}.\*wgs\*.csv
+
+The SV reports can be found in the directory (SV, unfiltered SV, and BND):
+ - report/sv/{PROJECT_ID}.wgs.{VER}.{DATE}.tsv
 
 The STR reports can be found in:
   - report/str/{PROJECT_ID}.EH.v1.1.{DATE}.xlsx
   - report/str/{PROJECT_ID}.EHDN.{DATE}.xlsx
+
+The mitochondrial report can be found in the directory:
+ - report/mitochondrial/{PROJECT_ID}.mitochondrial.report.csv
+
+The MELT mobile element report can be found in the directory:
+ - report/MELT/{PROJECT_ID}.wgs.{DATE}.csv
+
 ## Automatic pipeline submission
 ### Genomes
 `parser_genomes.py` script can be used to automate the above process for a batch of genomes.
@@ -218,114 +366,54 @@ optional arguments:
 The script performs the following operations for each familyID present in the sample info file
 - create run folder: \
 - copy the following files to run folder and update settings where applicable:
-  - config.yaml: update run name, input type, panel and ped (if avaialble in /hpf/largeprojects/ccmbio/dennis.kao/gene_data/{HPO,Pedigrees})
+  - config_hpf.yaml: update run name, input type, panel and ped (if available in /hpf/largeprojects/ccmbio/dennis.kao/gene_data/{HPO,Pedigrees})
   - units.tsv: add sample name, and input file paths
   - samples.tsv: add sample names
-  - dnaseq_cluser.pbs: rename job (#PBS -N \)
-  - pbs_config.yaml
+  - dnaseq_slurm_hpf.sh: rename job
+  - slurm-config.yaml
 - submit Snakemake job
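+
+A typical invocation, with a hypothetical sample info file, would be:
+```
+python parser_genomes.py -f sample_info.tsv -d /path/to/analyses -s fastq
+```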
 
+### Exomes for re-analysis
+`exome_reanalysis.py` script can be used to automate the above process for exome re-analysis.
+```
+usage: exome_reanalysis.py [-h] -f FILE -d path
 
-## Pipeline details
-
-### Pre-calling steps
-1. Map fastqs to the human decoy genome GrCh37d5
-
-2. Picard MarkDuplicates, but don't remove reads
-
-3. GATK4 base recalibration
-
-4. Remove reads mapped to decoy chromosomes
-
-### WGS: SNV
-1. Call SNV's and generate gVCFs
-
-2. Merge gVCF's and perform joint genotyping
-
-3. Filter against GATK best practices filters
-
-4. Decompose multiallelics, sort and uniq the filtered VCF using vt
-
-5. Annotate using vcfanno and VEP
-
-6. Generate a gemini db using vcf2db.py
-
-7. Generate a cre report using cre.sh
-
-### WES: SNV
-1. Call variants using GATK, Freebayes, Platypus, and SAMTools
-
-2. Apply caller specific filters and retain PASS variants
-
-3. Decompose multiallelics, sort and uniq filtered VCF using vt
-
-4. Retain variants called by GATK or 2 other callers; Annotate caller info in VCF with INFO/CALLER and INFO/NUMCALLS.
-
-5. Annotate using vcfanno and VEP
-
-6. Generate a gemini db using vcf2db.py
-
-7. Generate a cre report using cre.sh
-
-### SV
-1. Call SV's using Manta, Smoove and Wham
-
-2. Merge calls using MetaSV
-
-3. Annotate VCF using snpEff and SVScores
-
-4. Split multi-sample VCF into individual sample VCFs
-
-5. Generate an annotated report using crg
-
-### STR
-
-A. ExpansionHunter: known repeat location
-
- 1. Identify repeat expansions in sample BAM/CRAMs
- 2. Annotate repeats with disease threshold, gene name, repeat sizes from 1000Genome (mean& median)
- 3. Generate per-family report as Excel file
-
-B. ExpansionHunterDenovo: denovo repeats
-
- 1. Identify denovo repeat in sample BAM/CRAMs
- 2. Combine individual JSONs from current family and 1000Genomes to a multi-sample TSV
- 3. Run DBSCAN clustering to identify outlier repeats
- 4. Annotate with gnoMAD, OMIM, ANNOVAR
- 5. Generate per-family report as Excel file
-
-
-## Reports
-
-Column descriptions and more info on how variants are filtered can be found here:
-
-SNV: https://docs.google.com/document/d/1zL4QoINtkUd15a0AK4WzxXoTWp2MRcuQ9l_P9-xSlS4
-
-SV: https://docs.google.com/document/d/1o870tr0rcshoae_VkG1ZOoWNSAmorCZlhHDpZuZogYE
-
-The WGS pipeline generates 6 reports:
-
-1. wgs.snv - a report on coding SNVs across the entire genome
-
-2. wgs.panel.snv - a report on SNVs within the panel specified bed file
-
-3. wgs.panel.snv - a report on SNVs within the panel specified bed file with a 100kb flank on each side
+Reads sample info from Stager analysis csv file (-f) and creates directory (-d) necessary to run crg2.
 
-4. wgs.sv - a report on SVs across the entire genome
+optional arguments:
+  -h, --help            show this help message and exit
+  -f FILE, --file FILE  Analyses csv output from STAGER
+  -d path, --dir path   Absolute path where crg2 directory structure will be created under familyID/analysisID as base directory
+  ```
 
-5. EH - a report on repeat expansions in known locations
+The script parses an analysis request csv from Stager for exome re-analyses and sets up the necessary directories and files (under the -d directory) as follows:
+1. create family analysis directory under directory provided
+2. copy config_cheo_ri.yaml, slurm-config.yaml and dnaseq_slurm_cheo_ri.sh from crg2 repo and replace necessary strings
+3. search results directory /srv/shared/hpf/exomes/results for crams from previous analyses
+4. create units.tsv and samples.tsv for snakemake
+5. submit job if all the above goes well
 
-6. EHDN - a report on denovo repeats filtered from a case-control outlier analysis
+### Genomes for re-analysis
+`genome_reanalysis.py` script can be used to automate the above process for genome re-analysis.
+```
+usage: genome_reanalysis.py [-h] -f FILE -d path
 
-The WES pipeline generates 4 reports for SNV:
+Reads sample info from Stager analysis csv file (-f) and creates directory (-d) necessary to run crg2.
 
-1. clinical.wes.regular - report on coding SNVs in exonice regions using clinical filters as decribed [here](https://docs.google.com/document/d/1zL4QoINtkUd15a0AK4WzxXoTWp2MRcuQ9l_P9-xSlS4/edit#heading=h.e4whjtn15ybp)
+optional arguments:
+  -h, --help            show this help message and exit
+  -f FILE, --file FILE  Analyses csv output from STAGER
+  -d path, --dir path   Absolute path where crg2 directory structure will be created under familyID as base directory
+  ```
 
-2. clinical.wes.synonymous - report on synonymous SNVs in exonic regions using clinical filters as decribed in [here](https://docs.google.com/document/d/1zL4QoINtkUd15a0AK4WzxXoTWp2MRcuQ9l_P9-xSlS4/edit#heading=h.e4whjtn15ybp)
+The script parses an analysis request csv from Stager for genome re-analyses and sets up the necessary directories and files (under the -d directory) as follows:
+1. create family analysis directory under directory provided
+2. copy config_cheo_ri.yaml, slurm-config.yaml and dnaseq_slurm_cheo_ri.sh from crg2 repo and replace necessary strings
+3. search results directories for crams from previous analyses
+4. create units.tsv and samples.tsv for snakemake
+5. submit job if all the above goes well
 
-3. wes.regular - report on coding SNVs in exonic regions
-4. wes.synonymous - report on synonymous SNVs in exonic regions
 
 ## Extra targets
 The following output files are not included in the main Snakefile and can be requested on the `snakemake` command line, for example:
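+```
+# a dry-run requesting the benchmarking target described below; the
+# snakemake options mirror those in dnaseq_slurm_hpf.sh and are illustrative
+snakemake --use-conda -s ~/crg2/Snakefile --profile ~/crg2/slurm_profile \
+    --configfile config_hpf.yaml validation/HG001 -n
+```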
@@ -351,7 +439,9 @@ SNV calls from WES and WGS pipeline can be benchmarked using the GIAB dataset _H
 The inputs in `units.tsv` are downsampled for testing purposes. Edit the tsv to use the inputs from HPF: `ccmmarvin_shared/validation/benchmarking/benchmark-datasets` or CHEO-RI: `/srv/shared/data/benchmarking-datasets/HG001`.
 2. Copy `crg2/benchmark_hpf.tsv` or `crg2/benchmark_cheo_ri.tsv` to current directory. _Note_: benchmark_x.tsv uses HG001_NA12878 as family_sample name, so you should edit the "project" name in `config_hpf.yaml`
 3. Edit the `config_hpf.yaml` to set "wes" or "wgs" pipeline
-4. Edit `dnaseq_cluster.pbs` to include the target `validation/HG001`
+4. Edit `dnaseq_slurm_hpf.sh` to include the target `validation/HG001`
+
+## crg2 Developer Documentation
+A guideline to developing crg2 can be found in this [document](https://sickkidsca.sharepoint.com/:w:/r/sites/thecenterforcomputationalmedicineworkspace/Shared%20Documents/C4Rare-updates/crg2-development.docx?d=w5cad9def56a44b2bba951e6a0a7a4334&csf=1&web=1&e=FHxe4j).
diff --git a/Snakefile b/Snakefile
index e5cc4fb..9063d18 100644
--- a/Snakefile
+++ b/Snakefile
@@ -24,12 +24,12 @@ elif config["run"]["pipeline"] == "wgs":
         expand("report/{p}/{family}", p=["panel", "panel-flank", "denovo"], family=project) if (config["run"]["hpo"] or config["run"]["panel"]) and config["run"]["ped"] else [],
         "report/coding/{family}".format(family=project),
         "report/sv",
-        expand("report/str/{family}.{report_name}.xlsx", family=project, report_name=["EH-v1.1","EHDN"]),
+        #expand("report/str/{family}.{report_name}.xlsx", family=project, report_name=["EH-v1.1","EHDN"]),
         "qc/multiqc/multiqc.html",
         #"plots/depths.svg",
         #"plots/allele-freqs.svg"
         "programs-{}.txt".format(PIPELINE_VERSION),
-        "report/mitochondrial/{family}.mitochondrial.report.csv".format(family=project),
+        #"report/mitochondrial/{family}.mitochondrial.report.csv".format(family=project) if config["tools"]["mity"] else [],
         [expand("recal/{family}_{sample}.bam.md5".format(family=config["run"]["project"], sample=s)) for s in samples.index],
         "report_upload/demultiplexed_reports/{}".format(project) if config["run"]["PT_credentials"] else []
diff --git a/hg38/config_hpf.yaml b/hg38/config_hpf.yaml
new file mode 100644
index 0000000..45fff8b
--- /dev/null
+++ b/hg38/config_hpf.yaml
@@ -0,0 +1,188 @@
+run:
+  project: "NA12878"
+  samples: samples.tsv
+  units: units.tsv
+  ped: "" # leave this string empty if there is no ped
+  panel: "" # three-column BED file based on hpo file; leave this string empty if there is no panel
+  hpo: "" # five-column TSV with HPO terms; leave this string empty if there are no hpo terms
+  flank: 100000
+  gatk: "gatk" #or gatk3, or dragen (for dragen-gatk variant calling)
+  pipeline: "wes" #either wes (exomes) or wgs (genomes) or annot (to annotate and produce reports for an input vcf)
+  minio: ""
+  PT_credentials: ""
+
+genes:
+  ensembl: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/ensembl_genes_hg38_gtf.csv"
+  refseq: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/GRCh37_latest_genomic.gff_subset.csv"
+  hgnc: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/HGNC_20210617.txt"
+
+tools:
+  svscore_script: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/SVScore/svscore.pl"
+  annotsv: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/AnnotSV_2.1"
+  cre: "~/cre"
+  crg: "~/crg"
+  crg2: "~/crg2"
+  ehdn: "/hpf/largeprojects/ccmbio/arun/Tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0"
+  mity: 
"/hpf/largeprojects/ccmbio/ajain/mity/mity_package/bin" + +ref: + name: GRCh38.86 + no_decoy_name: GRCh38 + genome: /hpf/largeprojects/ccmbio/nhanafi/c4r/genomes/GIAB/GCA_000001405.15_GRCh38_no_alt_analysis_set_maskedGRC_exclusions_v2.fasta + known-variants: /hpf/largeprojects/ccmbio/nhanafi/c4r/genomes/Homo_sapiens_assembly38.dbsnp138.vcf + no_decoy: /hpf/largeprojects/ccmbio/nhanafi/c4r/genomes/GIAB/GCA_000001405.15_GRCh38_no_alt_analysis_set_maskedGRC_exclusions_v2_EBV_removed.fasta + decoy_bed: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/pipelines/cre/data/grch37d5.decoy.bed" + canon_bed: "/hpf/largeprojects/ccmbio/nhanafi/c4r/genomes/illumina/grch38_canon.bed" + split_genome: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/genomes/split_genome" + dragstr_table: "/hpf/largeprojects/ccmbio/nhanafi/c4r/genomes/illumina/hg38.str.zip" + gatk-known: "/hpf/largeprojects/ccmbio/nhanafi/c4r/genomes/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" + old_cram_ref: "UR:/mnt/hnas/reference/hg19/hg19.fa" + new_cram_ref: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/hg19/seq/hg19.fa" + bed_index: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/seq/GRCh37.fa.bedtoolsindex" + +filtering: + vqsr: false + hard: + # hard filtering as outlined in GATK docs + # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set) + snvs: + "QD < 2.0 || FS > 60.0 || MQ < 30.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" + indels: + "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" + dragen: + "QUAL < 10.4139" + soft: + platypus: + name: 'PlatQualDepth' + filter: '(FR[0] <= 0.5 && TC < 4 && %QUAL < 20) || (TC < 13 && %QUAL < 10) || (FR[0] > 0.5 && TC < 4 && %QUAL < 50)' + freebayes: + name: 'FBQualDepth' + filter: '(AF[0] <= 0.5 && (max(FORMAT/DP) < 4 || (max(FORMAT/DP) < 13 && %QUAL < 10))) || (AF[0] > 0.5 && (max(FORMAT/DP) < 4 && %QUAL < 50))' + samtools: + name: 'stQualDepth' + filter: '((AC[0] / AN) <= 0.5 && max(FORMAT/DP) < 4 && %QUAL < 20) || (max(FORMAT/DP) < 13 && %QUAL < 10) || ((AC[0] / AN) > 0.5 && max(format/DP) < 4 && %QUAL < 50)' + gatk: + snvs: + name: 'GATKCutoffSNP' + filter: 'TYPE="snp" && (MQRankSum < -12.5 || ReadPosRankSum < -8.0 || QD < 2.0 || FS > 60.0 || MQ < 30.0)' + indel: + name: 'GATKCutoffIndel' + filter: 'TYPE="indel" && (ReadPosRankSum < -20.0 || QD < 2.0 || FS > 200.0 || SOR > 10.0)' + dragen: + name: 'DRAGENHardQUAL' + filter: 'QUAL < 10.4139' + +processing: + mark-duplicates: true + # Uncomment and point to a bed file with, e.g., captured regions if necessary, + # see https://gatkforums.broadinstitute.org/gatk/discussion/4133/when-should-i-use-l-to-pass-in-a-list-of-intervals. + # restrict-regions: captured_regions.bed + # If regions are restricted, uncomment this to enlarge them by the given value in order to include + # flanking areas. 
+ # region-padding: 100 + +qc: + fastq_screen: + conf: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/qc/FastQ_Screen_Genomes/fastq_screen.conf" + +annotation: + cre.vcfanno: + conf: "~/crg2/vcfanno/cre.vcfanno_hg38.conf" + lua_script: "~/crg2/vcfanno/cre.vcfanno.lua" + base_path: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/" + vcfanno: + conf: "~/crg2/vcfanno/crg.vcfanno_hg38.conf" + lua_script: "~/crg2/vcfanno/crg.vcfanno.lua" + base_path: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/" + mt.vcfanno: + conf: "~/crg2/vcfanno/mt.vcfanno.conf" + base_path: "/hpf/largeprojects/ccmbio/ajain/mity/vcfanno" + vep: + dir: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/executables/ensembl-vep/" + dir_cache: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/vep" + snpeff: + dataDir: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/snpeff/data" + svscore: + exon_bed: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/svscore/refGene.txt.exons.bed.gz" + intron_bed: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/svscore/refGene.txt.introns.bed.gz" + cadd: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/CADD_v1.6_whole_genome_SNVs.tsv.gz" + svreport: + reference: GRCh38 + hgmd: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/HGMD_2018/hgmd_pro.db" + protein_coding_genes: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/protein_coding_genes.tsv" + exon_bed: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/hg38_exons_from_biomart_20220802.txt" + exac: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExAC/fordist_cleaned_nonpsych_z_pli_rec_null_data.txt" + omim: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/OMIM_2020-04-09/genemap2.txt" + gnomad: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/gnomad_v2_sv.sites_hg38_liftover_nochr_FINAL.bed" + biomart: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/BioMaRt.GrCh38.p13.ensembl.mim.hgnc.entrez.tsv.gz" + mssng_manta_counts: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/Canadian_MSSNG_parent_SVs.Manta.counts.txt" + mssng_lumpy_counts: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/Canadian_MSSNG_parent_SVs.LUMPY.counts.txt" + eh: + catalog: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunter/tandem_repeat_disease_loci_v1.1.hg19.masked.json" + trf: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunter/tandem_repeat_disease_loci_v1.1.tsv" + 1000g: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunter/1000G_EH_v1.0.tsv" + ehdn: + files: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/" + cre: + database_path: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/" + +validation: + benchmark: "benchmark_hpf.tsv" + +params: + bwa: + verbosity: "-v 1" + markSplitReads: "-M" + maxMem: "" + gatk: + java_opts: "-Xms500m -Xmx9555m -Dsamjdk.compression_level=5" + HaplotypeCaller: "" + BaseRecalibrator: "" + #BaseRecalibrator: "--interval-set-rule INTERSECTION -U LENIENT-VCF-PROCESSING --read-filter BadCigar --read-filter NotzPrimaryAlignment" + GenotypeGVCFs: "" + VariantRecalibrator: "" + gatk3: + java_opts: "-Xms500m -Xmx9555m" + HaplotypeCaller: "-drf DuplicateRead --interval_set_rule INTERSECTION --pair_hmm_implementation VECTOR_LOGLESS_CACHING -ploidy 2 -U LENIENT_VCF_PROCESSING --read_filter BadCigar --read_filter NotPrimaryAlignment" + annotation: "MappingQualityRankSumTest 
MappingQualityZero QualByDepth ReadPosRankSumTest RMSMappingQuality BaseQualityRankSumTest FisherStrand GCContent HaplotypeScore HomopolymerRun DepthPerAlleleBySample Coverage ClippingRankSumTest DepthPerSampleHC" + RealignerTargetCreator: " -l INFO --interval_set_rule INTERSECTION -U LENIENT_VCF_PROCESSING --read_filter BadCigar --read_filter NotPrimaryAlignment " + IndelRealigner: " -U LENIENT_VCF_PROCESSING --read_filter BadCigar --read_filter NotPrimaryAlignment " + BaseRecalibrator: " --interval_set_rule INTERSECTION -U LENIENT_VCF_PROCESSING --read_filter BadCigar --read_filter NotPrimaryAlignment " + PrintReads: " -jdk_deflater -jdk_inflater -U LENIENT_VCF_PROCESSING --read_filter BadCigar --read_filter NotPrimaryAlignment " + dragen: + java_opts: "-Xms500m -Xmx9555m" + HaplotypeCaller: "" + picard: + MarkDuplicates: "REMOVE_DUPLICATES=false" + ValidationStringency: "VALIDATION_STRINGENCY=SILENT" + AssumeSortOrder: "ASSUME_SORT_ORDER=coordinate" + java_opts: "-Xmx2g" + qualimap: + mem: "20G" + nw: 400 + c: "-c" + hm: 3 + gtf: "/hpf/largeprojects/ccmbio/nhanafi/c4r/downloads/databases/hg38_exons_from_biomart_20220802_placeholder_nochr.txt" + extra: "" + verifybamid2: + svd_prefix: "1000g.phase3 100k b38" + # --DisableSanityCheck ensures verifybamid2 does not fail for exomes with < 10,000 markers + extra: "--DisableSanityCheck" + multiqc: + config: "~/crg2/rules/multiqc_config.yaml" + snpeff: + java_opts: "-Xms750m -Xmx20g" + svscore: + operations: "max,sum,top5,top10,mean" + samtools: + mpileup: " -t DP -t AD -u -g " + bcftools: + mpileup: "-a DP -a AD " + call: " -m -v " + freebayes: " --genotype-qualities --strict-vcf --ploidy 2 --no-partial-observations --min-repeat-entropy 1 " + platypus: "--filterDuplicates=0" + rtg-tools: + java_opts: "-Xmx20g" + vcfeval: + sdf: "/hpf/largeprojects/ccmbio/nhanafi/c4r/genomes/GIAB/hg38_SD" + diff --git a/parser_exomes.py b/parser_exomes.py new file mode 100755 index 0000000..e8d9ee2 --- /dev/null +++ b/parser_exomes.py @@ -0,0 +1,348 @@ +import argparse +import ast +import glob +import os +import sys +import pandas as pd +import subprocess +from collections import namedtuple + + +""" +Usage: python parser.py -f -d -s [mapped|recal|fastq|decoy_rm] -b +Parses analyses request csv from Stager and sets up necessary directories (under 2nd argument), files as below: +1. create family and directory passed as "-s" +2. symlink BAM files if "-s" is not fastq +3. copy config.yaml, pbs_config.yaml and dnaseq_cluster.pbs from crg2 repo and replace necessary string +4. searches hpf for input files uploaded via MinIO, or from previously run analyses +5. create units.tsv and samples.tsv for snakemake +6. 
submit job if all the above goes well +""" + +crg2_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + + +def input_file(input_path): + """Given hpf path, find input files""" + fq1 = sorted(glob.glob(os.path.join(input_path, "*_R1*.fastq.gz"))) + sorted( + glob.glob(os.path.join(input_path, "*_1.fastq.gz")) + ) + fq2 = sorted(glob.glob(os.path.join(input_path, "*_R2*.fastq.gz"))) + sorted( + glob.glob(os.path.join(input_path, "*_2.fastq.gz")) + ) + bam = glob.glob(os.path.join(input_path, "*bam")) + cram = glob.glob(os.path.join(input_path, "*cram")) + # prioritize fastq as input, then bam, then cram + input_types = ("fastq", "bam", "cram") + input = dict.fromkeys(input_types, "") + if len(fq1) != 0: + if len(fq1) == len(fq2): + input["fastq"] = {"R1": fq1, "R2": fq2} + else: + print(f"Number of R1 and R2 files in {input_path} does not match") + input = None + elif len(bam) != 0: + if len(bam) == 1: + input["bam"] = bam[0] + else: + print(f"Multiple bams found under {input_path}") + input = None + elif len(cram) != 0: + if len(cram) == 1: + input["cram"] = cram[0] + else: + print(f"Multiple crams found under {input_path}") + input = None + else: + input = None + + return input + + +def check_minio_dccforge(family, participant, sequencing_id): + """Given family and participant codenames, find input files on the hpf""" + DCCFORGE_UPLOADS = "/hpf/largeprojects/ccm_dccforge/dccforge/uploads/*/" + MINIO_MIRROR = ( + "/hpf/largeprojects/ccm_dccforge/dccforge/uploads/MinIO_G4RD_Mirror/*/" + ) + DPLM = "/hpf/largeprojects/ccmbio_ephemeral/C4R/" + dccforge_input = input_file(f"{DCCFORGE_UPLOADS}/{family}_{participant}*/") + minio_input = input_file(f"{MINIO_MIRROR}/{family}_{participant}*/") + # some datasets are transferred to the ccmbio_ephemeral space by DPLM + dplm_input = None + if sequencing_id != None: + dplm_input = input_file(f"{DPLM}/*{sequencing_id}*/") + if not dccforge_input and not minio_input and not dplm_input: + input = None + else: + if minio_input: + input = minio_input + elif dplm_input: + input = dplm_input + else: + input = dccforge_input + return input + + +def find_input(family, participant, sequencing_id): + """Given family and participant codenames, check if individuals have already been analyzed""" + RESULTS_DIR = "/hpf/largeprojects/ccm_dccforge/dccforge/results/*/" + # check uploads folders first, as participant may have been resequenced + input = check_minio_dccforge(family, participant, sequencing_id) + if not input: + # if not in uploads, this is a re-analysis using bams from results directory + bam = glob.glob(f"{RESULTS_DIR}/{family}/{family}_{participant}.bam") + if len(bam) == 1: + input_types = ("fastq", "bam", "cram") + input = dict.fromkeys(input_types, "") + input["bam"] = bam + # if not in results, may be missing + else: + input = None + + return input + + +def valid_dir(dir): + """Check that directory provided exists""" + if os.path.isdir(dir): + return dir + else: + message = f"{dir} path does not exist. 
Please provide absolute path" + print(message) + raise argparse.ArgumentTypeError(message) + + +def valid_file(filename): + """Check that file exists and is not empty""" + if not os.path.isfile(filename): + message = f"{filename} file does not exist" + raise argparse.ArgumentTypeError(message) + else: + if not os.path.getsize(filename) > 0: + message = f"{filename} file is empty" + print(message) + raise argparse.ArgumentTypeError(message) + return filename + + +def assign_exomes(requested, bioinfos): + """Assign analyses to bioinfos""" + num_analyses = len(requested) + remainder = num_analyses % len(bioinfos) + num_analyses_per = (num_analyses - remainder) / len(bioinfos) + + assignees = [] + for b in bioinfos: + assignees.append([b] * (int(num_analyses_per))) + assignees.append([bioinfos[0]] * remainder) + # flatten assignees sublists in assignees list + assignees = [item for sublist in assignees for item in sublist] + requested["assignee"] = assignees + + return requested + + +def setup_directories(family, sample_list, filepath, step=None): + d = os.path.join(filepath, family) + if os.path.isdir(d): + print(f"{family} analysis directory already exists") + inputs_flag = False + else: + os.mkdir(d) + # copy config.yaml, pbs_config.yaml, and dnaseq_cluster.pbs + for i in ["config.yaml", "pbs_profile/pbs_config.yaml", "dnaseq_cluster.pbs"]: + cmd = ["cp", os.path.join(crg2_dir, i), d] + subprocess.check_call(cmd) + + # replace family ID in config.yaml & dnaseq_cluster.pbs + replace = f"s/NA12878/{family}/" + config = os.path.join(d, "config.yaml") + if os.path.isfile(config): + cmd = ["sed", "-i", replace, config] + subprocess.check_call(cmd) + replace = f"s/crg2_pbs/{family}/" + pbs = os.path.join(d, "dnaseq_cluster.pbs") + if os.path.isfile(pbs): + cmd = ["sed", "-i", replace, pbs] + subprocess.check_call(cmd) + + # check to see if each sample is associated with input file + inputs_flag = check_inputs(sample_list) + + # if an input file for any sample is missing, the inputs_flag is False and the job will not be submitted + if inputs_flag: + # set input type + if step == "fastq": + input_files = [input_types(sample) for sample in sample_list] + if set(input_files) == {"bam"}: + # if inputs are bams, set input type to bam in config.yaml (default is fastq) + replace = 's/input: "fastq"/input: "bam"/g' + config = os.path.join(d, "config.yaml") + if os.path.isfile(config): + cmd = ["sed", "-i", replace, config] + subprocess.check_call(cmd) + else: + inputs_flag = False + elif set(input_files) == {"cram"}: + # if inputs are crams, set input type to cram + replace = 's/input: "fastq"/input: "cram"/g' + config = os.path.join(d, "config.yaml") + if os.path.isfile(config): + cmd = ["sed", "-i", replace, config] + subprocess.check_call(cmd) + else: + inputs_flag = False + + return inputs_flag + + +def check_inputs(sample_list): + """Checks if each sample (a named tuple) has an input file (fastq, bam, or cram)""" + check_inputs = True + for i in sample_list: + if not i.fq1 and not i.fq2 and not i.bam and not i.cram: + check_inputs = False + return check_inputs + + +def input_types(sample): + """Check if input type is fastq or bam""" + if sample.fq1: + return "fastq" + elif sample.cram: + return "cram" + else: + return "bam" + + +def write_proj_files(sample_list, filepath): + units, samples = os.path.join(filepath, "units.tsv"), os.path.join( + filepath, "samples.tsv" + ) + with open(units, "w") as u, open(samples, "w") as s: + s.writelines("sample\n") + 
u.writelines("sample\tplatform\tfq1\tfq2\tbam\tcram\n") + for i in sample_list: + s.writelines(f"{i.sample}\n") + u.writelines( + f"{i.sample}\t{i.platform}\t{i.fq1}\t{i.fq2}\t{i.bam}\t{i.cram}\n" + ) + + +def submit_jobs(directory, family): + if not os.path.isdir(directory): + print(f"{directory} not found. Exiting!") + exit() + pbs = "dnaseq_cluster.pbs" + if os.path.isfile(os.path.join(directory, pbs)): + cwd = os.getcwd() + os.chdir(directory) + cmd = ["qsub", pbs] + jobid = subprocess.check_output(cmd).decode("UTF-8").rstrip() + print(f"Submitted snakemake job for {family}: {jobid}") + os.chdir(cwd) + else: + print(f"{pbs} not found, not submitting jobs for {family}.") + + +if __name__ == "__main__": + description = "Reads sample info from csv file (-f) and creates directory (-d) necessary to start crg2 from step (-s) requested." + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + "-f", + "--file", + type=valid_file, + required=True, + help="Analyses csv output from STAGER", + ) + + parser.add_argument( + "-d", + "--dir", + type=valid_dir, + required=True, + metavar="path", + help="Absolute path where crg2 directory structure will be created under familyID as base directory", + ) + + parser.add_argument( + "-s", + "--step", + type=str, + required=True, + choices=["fastq", "mapped", "recal", "decoy_rm"], + help="start running from this folder creation(step)", + ) + + parser.add_argument( + "-b", + "--bioinfos", + nargs="+", + type=str, + required=True, + help="Names of bioinformaticians who will be assigned cases separated by spaces, e.g. MC NH PX", + ) + + args = parser.parse_args() + requested = pd.read_csv(args.file) + inputs_list = [] + + Units = namedtuple("Units", "sample platform fq1 fq2 bam cram") + platform = "ILLUMINA" + projects = {} + for index, row in requested.iterrows(): + family = list(set(ast.literal_eval(row["family_codenames"]))) + # Will need to modify for multi-family analyses (i.e. 
cohort)
        family = family[0]
        if not family in projects:
            projects[family] = []
        family_inputs = []
        for participant, sequencing_id in zip(
            ast.literal_eval(row["participant_codenames"]),
            ast.literal_eval(row["sequencing_id"]),
        ):
            # look for input files in MinIO mirror on hpf, or in results directory
            input = find_input(family, participant, sequencing_id)
            if input:
                if input["fastq"]:
                    fq1, fq2 = (",").join(input["fastq"]["R1"]), (",").join(
                        input["fastq"]["R2"]
                    )
                else:
                    fq1, fq2 = "", ""
                bam, cram = input["bam"], input["cram"]
                projects[family].append(
                    Units(participant, platform, fq1, fq2, bam, cram)
                )
            else:
                fq1, fq2, bam, cram = "", "", "", ""
                projects[family].append(
                    Units(participant, platform, fq1, fq2, bam, cram)
                )
                print(f"Input files are missing for {family}_{participant}")

            family_inputs.append(input)
        inputs_list.append(family_inputs)

    for i in projects:

        sample_list = projects[i]
        filepath = os.path.join(args.dir, i)

        # create project directory
        # copy config.yaml, dnaseq_cluster.pbs & replace family id; copy pbs_config.yaml
        print(f"\nStarting to setup project directories for family: {i}")
        submit_flag = setup_directories(i, sample_list, args.dir, args.step)

        write_proj_files(sample_list, filepath)
        if submit_flag:
            print(f"\nSubmitting job for family: {i}")
            submit_jobs(filepath, i)
        else:
            print(f"Job for {i} not submitted")

    requested["input"] = inputs_list
    requested = assign_exomes(requested, args.bioinfos)
    requested_name = args.file.strip(".csv")
    requested.to_csv(f"{requested_name}_with_input_assigned.csv", index=False)
diff --git a/parser_genomes.py b/parser_genomes.py
old mode 100755
new mode 100644
index 01b0a94..8e8f1ec
--- a/parser_genomes.py
+++ b/parser_genomes.py
@@ -1,72 +1,57 @@
 import sys, os, subprocess, glob
 from collections import namedtuple
 import argparse
-import pandas as pd
-"""
+'''
 Usage: python parser.py -f -d -s [mapped|recal|fastq|decoy_rm]
 Parse five-column(family,sample,fq1,fq2,bam) TSV file (1st argument) and sets up necessary directories (under 2nd argument), files as below:
 1. create family and directory passed as "-s"
 2. symlink BAM files if "-s" is not fastq
-3. copy config_hpf.yaml, slurm-config.yaml and dnaseq_slurm_hpf.sh from crg2 repo and replace necessary string
+3. copy config.yaml, pbs_config.yaml and dnaseq_cluster.pbs from crg2 repo and replace necessary string
 4. create units.tsv and samples.tsv for snakemake
 5. 
submit job if all the above goes well -""" +''' crg2_dir = os.path.dirname(os.path.realpath(sys.argv[0])) -folder_file_suffix = { - "mapped": "sorted.bam", - "recal": "bam", - "decoy_rm": "no_decoy_reads.bam", -} - - +folder_file_suffix = {"mapped": "sorted.bam", "recal": "bam", "decoy_rm": "no_decoy_reads.bam"} def create_symlink(src, dest): if not os.path.isfile(dest): cmd = ["ln", "-s", src, dest] subprocess.check_call(cmd) - def setup_directories(family, sample_list, filepath, step): d = os.path.join(filepath, family) if not os.path.isdir(d): os.mkdir(d) - # copy config_hpf.yaml, slurm_config.yaml, and dnaseq_slurm_hpf.sh - for i in [ - "config_hpf.yaml", - "slurm_profile/slurm-config.yaml", - "dnaseq_slurm_hpf.sh", - ]: + #copy config.yaml, pbs_config.yaml, and dnaseq_cluster.pbs + for i in ["config.yaml", "pbs_profile/pbs_config.yaml", "dnaseq_cluster.pbs"]: cmd = ["cp", os.path.join(crg2_dir, i), d] subprocess.check_call(cmd) - # replace family ID and pipeline in config_hpf.yaml & dnaseq_slurm_hpf.sh - replace = "s/NA12878/{}/".format(family) - pipeline = "s/wes/wgs/" - PT_credentials = "s+PT_credentials: ""+PT_credentials: {}+".format( - "~/crg2/credentials.csv" - ) - config = os.path.join(d, "config_hpf.yaml") + #replace family ID and pipeline in config.yaml & dnaseq_cluster.pbs + replace = 's/NA12878/{}/'.format(family) + pipeline = 's/wes/wgs/' + config = os.path.join(d,"config.yaml") if os.path.isfile(config): cmd = ["sed", "-i", replace, config] subprocess.check_call(cmd) cmd = ["sed", "-i", pipeline, config] subprocess.check_call(cmd) - cmd = ["sed", "-i", PT_credentials, config] - subprocess.check_call(cmd) - replace = "s/job-name=crg2/job-name={}/".format(family) - config_path = "s+~/crg2/config_hpf.yaml+{}/config_hpf.yaml+".format(d) - slurm = os.path.join(d, "dnaseq_slurm_hpf.sh") - if os.path.isfile(slurm): - cmd = ["sed", "-i", replace, slurm] - subprocess.check_call(cmd) - cmd = ["sed", "-i", config_path, slurm] + replace = 's/crg2_pbs/{}/'.format(family) + pbs = os.path.join(d,"dnaseq_cluster.pbs") + if os.path.isfile(pbs): + cmd = ["sed", "-i", replace, pbs] subprocess.check_call(cmd) - # glob hpo - hpo_path = os.path.expanduser("~/gene_data/HPO") - hpo = glob.glob(("{}/{}_HPO.txt").format(hpo_path, family)) + #glob hpo + hpo_path = os.path.expanduser('~/gene_data/HPO') + hpo = glob.glob(('{}/{}_HPO.txt').format(hpo_path, family)) if len(hpo) > 1: print(f"Multiple HPO files found: {hpo}. Exiting!") exit() @@ -76,119 +61,75 @@ def setup_directories(family, sample_list, filepath, step): cmd = ["sed", "-i", replace, config] subprocess.check_call(cmd) - # glob ped - ped_path = os.path.expanduser("~/gene_data/pedigrees") - ped = glob.glob(("{}/{}*ped").format(ped_path, family)) + #glob ped + ped_path = os.path.expanduser('~/gene_data/pedigrees') + ped = glob.glob(('{}/{}*ped').format(ped_path, family)) if len(ped) > 1: print(f"Multiple ped files found: {ped}. 
Exiting!") exit() - if len(ped) == 0: - print(f"No ped files found for family: {family}") - return False if len(ped) == 1 and os.path.isfile(config): ped = ped[0] - pedi = pd.read_csv( - ped, - sep=" ", - header=None, - names=["fam_id", "individual_id", "pat_id", "mat_id", "sex", "phenotype"], - ) - for index, row in pedi.iterrows(): - individual_id = str(row.individual_id) - pat_id = str(row.pat_id) - mat_id = str(row.mat_id) - # individual_id, pat_id and mat_id cannot be both numeric and single number - if not ( - (individual_id.isnumeric() and len(individual_id) == 1) - | (pat_id.isnumeric() and len(pat_id) == 1) - | (mat_id.isnumeric() and len(mat_id) == 1) - ): - # write and check sample - write_sample(filepath, family) - samples = os.path.join(filepath, family, "samples.tsv") - samples = pd.read_csv(samples, dtype=str) - samples = list(samples.iloc[:, 0]) - samples = [family + s for s in samples] - samples.sort() - print(f"samples to be processed: {samples}") - ped_samples = [individual_id, pat_id, mat_id] - ped_samples.sort() - if all(s in samples for s in ped_samples): - replace = 's+ped: ""+ped: "{}"+'.format(ped) - cmd = ["sed", "-i", replace, config] - subprocess.check_call(cmd) - break - else: - print( - f"Individuals in ped file do not match with samples.tsv, double check: {ped}." - ) - return False - else: - print( - f"Ped file is either not a trio or not linked, double check: {ped}." - ) - return False + replace = 's+ped: ""+ped: "{}"+'.format(ped) + cmd = ["sed", "-i", replace, config] + subprocess.check_call(cmd) # bam: start after folder creation and symlink for step if step in ["mapped", "decoy_rm", "recal"]: if len(sample_list) == len([i for i in sample_list if i.bam]): + start_folder = os.path.join(d, step) if not os.path.isdir(start_folder): os.mkdir(start_folder) for i in sample_list: - dest = "{}_{}.{}".format(family, i.sample, folder_file_suffix[step]) + dest = "{}_{}.{}".format(family,i.sample,folder_file_suffix[step]) create_symlink(i.bam, os.path.join(start_folder, dest)) if os.path.isfile(i.bam + ".bai"): src = i.bam + ".bai" dest = dest + ".bai" create_symlink(src, os.path.join(start_folder, dest)) else: - print( - f"{start_folder} directory already exists! Won't rewrite/submit jobs. " - ) + print(f"{start_folder} directory already exists! Won't rewrite/submit jobs. 
") return False return True - - # fastq: no directory creations required; inputs can be fastq or bam - # set input: bam or fastq(default) in config_hpf.yaml + + #fastq: no directory creations required; inputs can be fastq or bam + #set input: bam or fastq(default) in config.yaml if step == "fastq": - if len(sample_list) == len([i for i in sample_list if i.bam]): + if len(sample_list) == len([i for i in sample_list if i.bam ]): + replace = 's/input: "fastq"/input: "bam"/g' + config = os.path.join(d,"config.yaml") + if os.path.isfile(config): + cmd = ["sed", "-i", replace, config] + subprocess.check_call(cmd) return True - if len(sample_list) == len([i for i in sample_list if i.fq1]): + if len(sample_list) == len([i for i in sample_list if i.fq1 ]): return True + - -def write_sample(filepath, family): - samples = os.path.join(filepath, family, "samples.tsv") - with open(samples, "w") as s: +def write_proj_files(sample_list, filepath): + units, samples = os.path.join(filepath, "units.tsv"), os.path.join(filepath, "samples.tsv") + with open(units, "w") as u, open(samples, "w") as s: s.writelines("sample\n") - for i in sample_list: - s.writelines(f"{i.sample}\n") - - -def write_units(sample_list, filepath): - units = os.path.join(filepath, "units.tsv") - with open(units, "w") as u: u.writelines("sample\tplatform\tfq1\tfq2\tbam\n") for i in sample_list: + s.writelines(f"{i.sample}\n") u.writelines(f"{i.sample}\t{i.platform}\t{i.fq1}\t{i.fq2}\t{i.bam}\n") - + def submit_jobs(directory, family): if not os.path.isdir(directory): print(f"{directory} not found. Exiting!") exit() - slurm = "dnaseq_slurm_hpf.sh" - if os.path.isfile(os.path.join(directory, slurm)): + pbs = "dnaseq_cluster.pbs" + if os.path.isfile(os.path.join(directory,pbs)): cwd = os.getcwd() os.chdir(directory) - cmd = ["sbatch", slurm] + cmd = ["qsub", pbs] jobid = subprocess.check_output(cmd).decode("UTF-8").rstrip() print(f"Submitted snakemake job for {family}: {jobid}") os.chdir(cwd) else: - print(f"{slurm} not found, not submitting jobs for {family}.") - + print(f"{pbs} not found, not submitting jobs for {family}.") def valid_dir(dir): @@ -199,7 +140,6 @@ def valid_dir(dir): print(message) raise argparse.ArgumentTypeError(message) - def valid_file(filename): if not os.path.isfile(filename): @@ -215,33 +155,33 @@ def valid_file(filename): if __name__ == "__main__": - description = """Reads sample info from TSV file (-f) and creates directory (-d) necessary to start + description = '''Reads sample info from TSV file (-f) and creates directory (-d) necessary to start crg2 from step (-s) requested. 
-    """
+    '''
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
-        "-f",
-        "--file",
-        type=valid_file,
-        required=True,
-        help="Five column TAB-seperated sample info file",
-    )
+            "-f",
+            "--file",
+            type=valid_file,
+            required=True,
+            help="Five column TAB-separated sample info file",
+        )
    parser.add_argument(
-        "-s",
-        "--step",
-        type=str,
-        required=True,
-        choices=["fastq", "mapped", "recal", "decoy_rm"],
-        help="start running from this folder creation(step)",
-    )
+            "-s",
+            "--step",
+            type=str,
+            required=True,
+            choices=["fastq", "mapped", "recal", "decoy_rm"],
+            help="start running from this folder creation(step)",
+        )
    parser.add_argument(
-        "-d",
-        "--dir",
-        type=valid_dir,
-        required=True,
-        metavar="path",
-        help="Absolute path where crg2 directory struture will be created under familyID as base directory",
-    )
+            "-d",
+            "--dir",
+            type=valid_dir,
+            required=True,
+            metavar="path",
+            help="Absolute path where crg2 directory structure will be created under familyID as base directory",
+        )
    projects = {}
    Units = namedtuple("Units", "sample platform fq1 fq2 bam")
    platform = "ILLUMINA"
@@ -255,19 +195,17 @@ def valid_file(filename):
            projects[family].append(Units(sample, platform, fq1, fq2, bam))
 
    for i in projects:
-
+        
        sample_list = projects[i]
-        filepath = os.path.join(args.dir, i)
+        filepath = os.path.join(args.dir,i)
 
-        # create project directory
-        # copy config_hpf.yaml, dnaseq_slurm_hpf.sh & replace family id; copy slurm_config.yaml
+        #create project directory
+        #copy config.yaml, dnaseq_cluster.pbs & replace family id; copy pbs_config.yaml
        print(f"\nStarting to setup project directories for family: {i}")
        submit_flag = setup_directories(i, sample_list, args.dir, args.step)
-        write_units(sample_list, filepath)
-
+        
        if submit_flag:
+            write_proj_files(sample_list, filepath)
            submit_jobs(filepath, i)
-        else:
-            write_sample(args.dir, i)
 
    print("DONE")
diff --git a/pbs_profile/pbs_config.yaml b/pbs_profile/pbs_config.yaml
index f3be1cc..acba2fb 100755
--- a/pbs_profile/pbs_config.yaml
+++ b/pbs_profile/pbs_config.yaml
@@ -52,6 +52,9 @@ manta:
 peddy:
   mem: "60g"
 
 minio:
-  mem: "60g"
\ No newline at end of file
+  mem: "60g"
diff --git a/rules/calling.smk b/rules/calling.smk
index 3380893..c9edfb2 100644
--- a/rules/calling.smk
+++ b/rules/calling.smk
@@ -16,24 +16,58 @@ def get_decoy_removed_sample_bams(wildcards):
                   sample=wildcards.sample, family=wildcards.family)
 
-rule call_variants:
-    input:
-        bam=get_sample_bams,
-        ref=config["ref"]["genome"],
-        known=config["ref"]["known-variants"],
-        regions="called/{contig}.regions.bed" if config["processing"].get("restrict-regions") else []
-    output:
-        gvcf=temp("called/{family}_{sample}.{contig}.g.vcf.gz")
-    log:
-        "logs/gatk/haplotypecaller/{family}_{sample}.{contig}.log"
-    params:
-        extra=get_call_variants_params,
-        java_opts=config["params"]["gatk"]["java_opts"],
-    group: "gatkcall"
-    resources:
-        mem=lambda wildcards, input: len(input.bam) * 15
-    wrapper:
-        get_wrapper_path("gatk", "haplotypecaller")
+#rule drag_str:
+#    input:
+#        bam=get_sample_bams,
+#        ref=config["ref"]["genome"],
+#        dragstr_table=config["ref"]["dragstr_table"]
+#    output:
+#        dragstr_parameters="called/{family}_{sample}.drag_str_parameters.txt"
+#    log:
+#        "logs/gatk/CalibrateDragstrModel/{family}_{sample}.log"
+#    wrapper:
+#        get_wrapper_path("gatk", "CalibrateDragstrModel")
+
+
+if config["run"]["gatk"] == "gatk":
+    rule gatk:
+        input:
+            bam=get_sample_bams,
+            ref=config["ref"]["genome"],
+            
known=config["ref"]["known-variants"], + regions="called/{contig}.regions.bed" if config["processing"].get("restrict-regions") else [] + output: + gvcf=temp("called/{family}_{sample}.{contig}.g.vcf.gz") + log: + "logs/gatk/haplotypecaller/{family}_{sample}.{contig}.log" + params: + extra=get_call_variants_params, + java_opts=config["params"]["gatk"]["java_opts"], + group: "gatkcall" + resources: + mem=lambda wildcards, input: len(input.bam) * 15 + wrapper: + get_wrapper_path("gatk", "haplotypecaller") +elif config["run"]["gatk"] == "dragen": + rule dragen: + input: + bam=get_sample_bams, + ref=config["ref"]["genome"], + known=config["ref"]["known-variants"], + regions="called/{contig}.regions.bed" if config["processing"].get("restrict-regions") else [], +# dragstr_parameters="called/{family}_{sample}.drag_str_parameters.txt" + output: + gvcf=temp("called/{family}_{sample}.{contig}.g.vcf.gz") + log: + "logs/gatk/dragenhaplotypecaller/{family}_{sample}.{contig}.log" + params: + extra=get_call_variants_params, + java_opts=config["params"]["dragen"]["java_opts"], + group: "gatkcall" + resources: + mem=lambda wildcards, input: len(input.bam) * 15 + wrapper: + get_wrapper_path("gatk", "dragenhaplotypecaller") rule combine_calls: diff --git a/rules/common.smk b/rules/common.smk index 95e066a..188552c 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -268,4 +268,4 @@ def get_gatk_vcf_tbi(wildcards): vcf_tbi = expand("genotyped/{family}-gatk_haplotype.vcf.gz.tbi", family=wildcards.family) elif config["run"]["pipeline"] == "wgs": vcf_tbi = expand("genotyped/{family}.vcf.gz.tbi", family=wildcards.family) - return vcf_tbi \ No newline at end of file + return vcf_tbi diff --git a/rules/cre/calling.smk b/rules/cre/calling.smk index 4301bf1..6d45aa8 100644 --- a/rules/cre/calling.smk +++ b/rules/cre/calling.smk @@ -54,7 +54,6 @@ rule call_variants: wrapper: get_wrapper_path("gatk", "haplotypecaller") - rule combine_calls: input: ref=config["ref"]["genome"], diff --git a/rules/filtering.smk b/rules/filtering.smk index 899cf18..e4a50b6 100644 --- a/rules/filtering.smk +++ b/rules/filtering.smk @@ -18,9 +18,14 @@ rule select_calls: def get_filter(wildcards): - return { - "snv-hard-filter": - config["filtering"]["hard"][wildcards.vartype]} + if config["run"]["gatk"] == "gatk": + return { + "snv-hard-filter": + config["filtering"]["hard"][wildcards.vartype]} + elif config["run"]["gatk"] == "dragen": + return { + "snv-hard-filter": + config["filtering"]["hard"]["dragen"]} rule hard_filter_calls: @@ -30,7 +35,7 @@ rule hard_filter_calls: output: vcf=temp("filtered/{family}.{vartype}.hardfiltered.vcf.gz") params: - filters=get_filter + filters=get_filter log: "logs/gatk/variantfiltration/{family}.{vartype}.log" wrapper: @@ -41,7 +46,7 @@ rule recalibrate_calls: input: vcf="filtered/{family}.{vartype}.vcf.gz" output: - vcf=temp("filtered{family}.{vartype}.recalibrated.vcf.gz") + vcf=temp("filtered/{family}.{vartype}.recalibrated.vcf.gz") params: extra=config["params"]["gatk"]["VariantRecalibrator"] log: diff --git a/rules/mapping.smk b/rules/mapping.smk index 8189556..b43c424 100644 --- a/rules/mapping.smk +++ b/rules/mapping.smk @@ -33,7 +33,7 @@ rule map_reads: markSplitReads = config["params"]["bwa"]["markSplitReads"], sort = "samtools", sort_order = "coordinate" - threads: 16 + threads: 32 resources: mem=lambda wildcards, threads: threads * 2 wrapper: @@ -193,7 +193,7 @@ rule remove_decoy: "../envs/samtools.yaml" shell: """ - samtools view --threads {threads} -h -L {input.canon} {input.bam} | egrep -v 
"hs37d5|NC_007605" | samtools view --threads {threads} - -bh > {output.out_f} + samtools view --threads {threads} -h -L {input.canon} {input.bam} | egrep -v "_decoy|chrEBV|hs37d5|NC_007605" | samtools view --threads {threads} - -bh > {output.out_f} """ rule samtools_index: diff --git a/rules/svreport.smk b/rules/svreport.smk index 629995d..1585377 100644 --- a/rules/svreport.smk +++ b/rules/svreport.smk @@ -8,6 +8,7 @@ rule svreport: params: PIPELINE_VERSION=PIPELINE_VERSION, project=config["run"]["project"], + reference=config["annotation"]["svreport"]["reference"], hgmd=config["annotation"]["svreport"]["hgmd"], hpo=config["run"]["hpo"], protein_coding_genes=config["annotation"]["svreport"]["protein_coding_genes"], diff --git a/scripts/SVRecords/SVAnnotator.py b/scripts/SVRecords/SVAnnotator.py index 2554533..b6b596f 100644 --- a/scripts/SVRecords/SVAnnotator.py +++ b/scripts/SVRecords/SVAnnotator.py @@ -9,6 +9,7 @@ # original code written by Dennis Kao: https://github.com/ccmbioinfo/crg/blob/master/SVRecords/SVAnnotator.py + class SVTYPE(Enum): def _generate_next_value_(name, start, count, last_values): return name @@ -22,7 +23,7 @@ def _generate_next_value_(name, start, count, last_values): class SVAnnotator: - def __init__(self, exon_bed, hgmd_db, hpo, exac, omim, biomart): + def __init__(self, reference, exon_bed, hgmd_db, hpo, exac, omim, biomart): self.make_gene_ref_df(biomart) @@ -160,7 +161,7 @@ def find_min_distance(position, boundaries): position, boundaries ) boundary_distances[breakpoint]["nearest_boundary"].append( - gene + "|" + str(min_boundary) + str(gene) + "|" + str(min_boundary) ) boundary_distances[breakpoint]["nearest_distance"].append( min_distance @@ -579,32 +580,38 @@ def annotate_counts( "COUNT": "sum", } ) - ann_df["COUNT_SV"] = ann_df[["COUNT_CHROM", "COUNT_START", "COUNT_END"]].apply( - lambda x: "{}:{}-{}".format(x[0], x[1], x[2]), axis=1 - ) - ann_df = ann_df.drop( - columns=["COUNT_CHROM", "COUNT_START", "COUNT_END", "COUNT_SVTYPE"] - ) - - ann_df.columns = ann_df.columns.str.replace("COUNT", prefix) + if len(ann_df) != 0: + ann_df["COUNT_SV"] = ann_df[ + ["COUNT_CHROM", "COUNT_START", "COUNT_END"] + ].apply(lambda x: "{}:{}-{}".format(x[0], x[1], x[2]), axis=1) + ann_df = ann_df.drop( + columns=["COUNT_CHROM", "COUNT_START", "COUNT_END", "COUNT_SVTYPE"] + ) - df = sv_record.df.join(ann_df) - df[[prefix]] = df[[prefix]].fillna(0) + ann_df.columns = ann_df.columns.str.replace("COUNT", prefix) + df = sv_record.df.join(ann_df) + df[[prefix]] = df[[prefix]].fillna(0) + else: + # sample SVs don't overlap any SVs in count table (eg no BNDs in Hg38 Manta), so ann_df is empty + df = sv_record.df + df[f"{prefix}"] = ["-"] * len(df) + df[f"{prefix}_SV"] = ["-"] * len(df) return df - def annotsv(self, sample_df): + def annotsv(self, reference, sample_df): """ Handles DGV, DDD annotations """ all_sv_bed_name = "all_sv.bed" annotated = "./{}.annotated.tsv".format(all_sv_bed_name) + print(sample_df.columns) sample_df.reset_index()[["CHROM", "POS", "END", "SVTYPE"]].to_csv( all_sv_bed_name, index=False, sep="\t" ) subprocess.call( - "$ANNOTSV/bin/AnnotSV -SVinputFile {} -SVinputInfo 1 -outputFile {}".format( - all_sv_bed_name, annotated + "$ANNOTSV/bin/AnnotSV -genomeBuild {} -SVinputFile {} -SVinputInfo 1 -outputFile {}".format( + reference, all_sv_bed_name, annotated ), shell=True, ) @@ -677,6 +684,7 @@ def annotsv(self, sample_df): def make_gene_ref_df(self, biomart): df = pd.read_csv(biomart, sep="\t") + df["Associated Gene Name"] = df["Associated Gene 
Name"].astype(str) # df = df[['Ensembl Gene ID', 'Ensembl Transcript ID', 'Associated Gene Name', 'HGNC ID(s)', 'MIM Gene Accession']].drop_duplicates() df = df[ [ diff --git a/scripts/SVRecords/SVGrouper.py b/scripts/SVRecords/SVGrouper.py index ecfa242..655ec8e 100644 --- a/scripts/SVRecords/SVGrouper.py +++ b/scripts/SVRecords/SVGrouper.py @@ -8,6 +8,7 @@ # original code written by Dennis Kao: https://github.com/ccmbioinfo/crg/blob/master/SVRecords/SVGrouper.py + class SVGrouper: def __init__(self, vcfs, report_type, ann_fields=[]): def list2string(col): @@ -138,7 +139,7 @@ def split_Ensembl_ids(id_list): # if 'chr' in CHROM field, remove vcf_dict["variants/CHROM"] = [ - chrom.strip("chr") for chrom in vcf_dict["variants/CHROM"] + chrom.replace("chr", "") for chrom in vcf_dict["variants/CHROM"] ] # grab alt allele @@ -378,7 +379,7 @@ def _make_bnd_dict(self, vcfs): pr = record.samples[sample]["PR"][1] name = record.samples[sample].name bnd = ( - record.chrom + record.chrom.replace("chr", "") + ":" + str(record.pos) + "-" diff --git a/scripts/clean_gtf.py b/scripts/clean_gtf.py index 7309542..2a68627 100755 --- a/scripts/clean_gtf.py +++ b/scripts/clean_gtf.py @@ -47,7 +47,7 @@ def get_parser(): ensembl_df.columns = map(str.lower, ensembl_df.columns) ensembl_df = ensembl_df.rename(columns={"gene_id": "name"}) - ensembl_df.to_csv("Homo_sapiens.GRCh37.87.gtf_subset.csv", index=False) + ensembl_df.to_csv("Homo_sapiens.GRCh38.110.gtf_subset.csv", index=False) # clean refseq gff (for some reason gtf throws errors with pyranges) refseq = pyranges.read_gff3(args.refseq_gff3) @@ -91,6 +91,6 @@ def get_parser(): refseq_df = refseq_df.drop(columns=["chromosome_refseq"]) refseq_df = refseq_df[["chromosome", "start", "end", "source", "name"]] - refseq_df.to_csv("GRCh37_latest_genomic.gff_subset.csv", index=False) + refseq_df.to_csv("GRCh38_latest_genomic.gff_subset.csv", index=False) print("Done") diff --git a/scripts/crg.intersect_sv_vcfs.py b/scripts/crg.intersect_sv_vcfs.py index b466cc1..5a5ae1f 100755 --- a/scripts/crg.intersect_sv_vcfs.py +++ b/scripts/crg.intersect_sv_vcfs.py @@ -20,6 +20,7 @@ def filter_report(svreport): def main( + reference, protein_coding_genes, exon_bed, hgmd_db, @@ -64,14 +65,14 @@ def main( ) print("Annotating structural variants ...") - ann_records = SVAnnotator(exon_bed, hgmd_db, hpo, exac, omim, biomart) + ann_records = SVAnnotator(reference, exon_bed, hgmd_db, hpo, exac, omim, biomart) sv_records.df = ann_records.annotate_genes(sv_records.df, Protein_coding_genes_col) for sv_count in sv_counts: prefix = Path(sv_count).stem sv_records.df = ann_records.annotate_counts(sv_count, sv_records, prefix=prefix) - sv_records.df = ann_records.annotsv(sv_records.df) + sv_records.df = ann_records.annotsv(reference,sv_records.df) sv_records.df = ann_records.calc_exons_spanned(sv_records.df, exon_bed) sv_records.df = ann_records.annotate_gnomad(gnomad, sv_records) sv_records.df = ann_records.annotate_hgmd(hgmd_db, sv_records.df) @@ -237,6 +238,7 @@ def main( ) main( + snakemake.params.reference, snakemake.params.protein_coding_genes, snakemake.params.exon_bed, snakemake.params.hgmd, @@ -275,6 +277,7 @@ def main( ) main( + snakemake.params.reference, snakemake.params.protein_coding_genes, snakemake.params.exon_bed, snakemake.params.hgmd, diff --git a/scripts/hpo_to_panel.py b/scripts/hpo_to_panel.py index a99a27e..4e3c95d 100644 --- a/scripts/hpo_to_panel.py +++ b/scripts/hpo_to_panel.py @@ -44,7 +44,9 @@ def main(family, hpo, ensembl, refseq, hgnc): # join hpo terms and 
ensembl+refseq genes on gene id (most gene ids in the hpo file are ENSG ids, but not all) log_message("Mapping HPO genes to gene coordinates") hpo_coord = genes.join(hpo, how="inner").reset_index() - missing = [gene for gene in hpo.index if gene not in hpo_coord["index"].values] + # add 'chr' for hg38 pipeline + hpo_coord['chromosome'] = 'chr' + hpo_coord['chromosome'].astype(str) + missing = [gene for gene in hpo.index if gene not in hpo_coord["name"].values] # check missing genes against HGNC previous symbols and aliases log_message("Mapping missing genes to HGNC aliases") diff --git a/units.tsv b/units.tsv index 5486e89..fece489 100644 --- a/units.tsv +++ b/units.tsv @@ -1,2 +1,2 @@ sample platform fq1 fq2 bam cram -NA12878 ILLUMINA /hpf/largeprojects/ccm_dccforge/dccdipg/Common/NA12878/NA12878.bam_1.fq.gz /hpf/largeprojects/ccm_dccforge/dccdipg/Common/NA12878/NA12878.bam_2.fq.gz +NA12878 ILLUMINA /hpf/largeprojects/ccmbio/GIAB_benchmark_datasets/hg19/WGS/NA12878/RMNISTHS_30xdownsample.bam
diff --git a/vcfanno/cre.vcfanno.conf b/vcfanno/cre.vcfanno_GRCh37.conf similarity index 78% rename from vcfanno/cre.vcfanno.conf rename to vcfanno/cre.vcfanno_GRCh37.conf index 8dd7187..2628928 100644 --- a/vcfanno/cre.vcfanno.conf +++ b/vcfanno/cre.vcfanno_GRCh37.conf @@ -1,11 +1,11 @@ [[annotation]] -file="gnomad_exome.2.1.1.vcf.gz" +file="variation/gnomad_exome.2.1.1.vcf.gz" fields=["AC","nhomalt","AF_popmax","AN","AC_male"] names=["gnomad_ac_es","gnomad_hom_es","gnomad_af_es","gnomad_an_es","gnomad_male_ac_es"] ops=["first","first","first","first","first"] [[annotation]] -file="gnomad_genome.2.1.1.vcf.gz" +file="variation/gnomad_genome.2.1.1.vcf.gz" fields=["AC", "nhomalt","AF_popmax","AN","AC_male"] names=["gnomad_ac_gs", "gnomad_hom_gs","gnomad_af_gs","gnomad_an_gs","gnomad_male_ac_gs"] ops=["self","self","self","self","self"] @@ -47,13 +47,13 @@ name="gnomad_male_ac" type="Integer" [[annotation]] -file="dbsnp-151.vcf.gz" +file="variation/dbsnp-151.vcf.gz" fields=["ID"] names=["rs_ids"] ops=["concat"] [[annotation]] -file="clinvar.vcf.gz" +file="variation/clinvar.vcf.gz" fields=["CLNSIG","CLNREVSTAT"] names=["clinvar_pathogenic", "clinvar_status"] ops=["concat", "concat"] @@ -67,49 +67,49 @@ type="String" #CADD 1.6 - SNVs [[annotation]] -file = "CADD-1.6_whole_genome_SNVs.tsv.gz" +file = "variation/CADD-1.6_whole_genome_SNVs.tsv.gz" names = ["CADD_phred"] columns = [6] ops = ["self"] #CADD 1.6 - indels [[annotation]] -file = "CADD-1.6_InDels.tsv.gz" +file = "variation/CADD-1.6_InDels.tsv.gz" names = ["CADD_phred"] columns = [6] ops = ["self"] #dbNSFP v3.4 [[annotation]] -file = "dbNSFP.txt.gz" +file = "variation/dbNSFP.txt.gz" names = ["phyloP20way_mammalian","phastCons20way_mammalian","Vest3_score","Revel_score","Gerp_score"] columns = [111,115,58,70,107] ops = ["first","first","first","first","first"] # spliceAI [[annotation]] -file = "spliceai_scores.masked.indel.hg19.vcf.gz" +file = "variation/spliceai_scores.masked.indel.hg19.vcf.gz" fields = ["SpliceAI"] names = ["spliceai_score"] ops = ["self"] # spliceAI [[annotation]] -file = "spliceai_scores.masked.snv.hg19.vcf.gz" +file = "variation/spliceai_scores.masked.snv.hg19.vcf.gz" fields = ["SpliceAI"] names = ["spliceai_score"] ops = ["self"] # UCE [[annotation]] -file = "13661_UCEs_over_100bp.hg19.bed.gz" -#file="13661_UCEs_over_100bp.hg19.test.bed.gz" +file = "variation/13661_UCEs_over_100bp.hg19.bed.gz" +#file="variation/13661_UCEs_over_100bp.hg19.test.bed.gz" columns = [3] names = ["UCE_100bp"] ops = ["flag"] [[annotation]] -file="2175_UCEs_over_200bp.GRCH37.bed.gz" +file="variation/2175_UCEs_over_200bp.GRCH37.bed.gz" columns = [3] names = ["UCE_200bp"] ops = ["flag"] diff --git a/vcfanno/cre.vcfanno_hg38.conf b/vcfanno/cre.vcfanno_hg38.conf new file mode 100644 index 0000000..7091b58 --- /dev/null +++ b/vcfanno/cre.vcfanno_hg38.conf @@ -0,0 +1,130 @@ +#gnomad exomes v.2.1.1 (lifted over) +[[annotation]] +file="gnomad_exome.2.1.1_LIFTOVER.vcf.gz" +fields=["AC","nhomalt","AF_popmax","AN","AC_male"] +names=["gnomad_ac_es","gnomad_hom_es","gnomad_af_es","gnomad_an_es","gnomad_male_ac_es"] +ops=["first","first","first","first","first"] + +#gnomad genomes v.3.1.2 (native hg38) +[[annotation]] +file="gnomad_genomes_v.3.1.2.vcf.gz" +fields=["AC", "nhomalt","AF_popmax","AN","AC_XY"] +names=["gnomad_ac_gs", 
"gnomad_hom_gs","gnomad_af_gs","gnomad_an_gs","gnomad_male_ac_gs"] +ops=["self","self","self","self","self"] + +[[postannotation]] +fields=["gnomad_ac_es","gnomad_ac_gs"] +op="sum" +name="gnomad_ac" +type="Integer" + +[[postannotation]] +fields=["gnomad_hom_es","gnomad_hom_gs"] +op="sum" +name="gnomad_hom" +type="Integer" + +[[postannotation]] +fields=["gnomad_af_es","gnomad_af_gs"] +op="max" +name="gnomad_af_popmax" +type="Float" + +[[postannotation]] +fields=["gnomad_an_es","gnomad_an_gs"] +op="sum" +name="gnomad_an" +type="Integer" + +[[postannotation]] +fields=["gnomad_ac","gnomad_an"] +op="div2" +name="gnomad_af" +type="Float" + +[[postannotation]] +fields=["gnomad_male_ac_es","gnomad_male_ac_gs"] +op="sum" +name="gnomad_male_ac" +type="Integer" + +[[annotation]] +file="dbsnp-151.vcf.gz" +fields=["ID"] +names=["rs_ids"] +ops=["concat"] + +[[annotation]] +file="clinvar.vcf.gz" +fields=["CLNSIG","CLNREVSTAT"] +names=["clinvar_pathogenic", "clinvar_status"] +ops=["concat", "concat"] + +# convert 5 to 'pathogenic', 255 to 'unknown', etc. +[[postannotation]] +fields=["clinvar_pathogenic"] +op="lua:clinvar_sig(clinvar_pathogenic)" +name="clinvar_sig" +type="String" + +#CADD 1.6 - SNVs +[[annotation]] +file = "CADD_v1.6_whole_genome_SNVs.tsv.gz" +names = ["CADD_phred"] +columns = [6] +ops = ["self"] + +#CADD 1.6 - indels +[[annotation]] +file = "CADD_v.1.6_gnomad.genomes.r3.0.indel.tsv.gz" +names = ["CADD_phred"] +columns = [6] +ops = ["self"] + +#dbNSFP v3.4 +[[annotation]] +file = "dbNSFPv4.2a_combined_hg38.txt.gz" +names = ["phyloP30way_mammalian","phastCons30way_mammalian","Vest4_score","Revel_score","Gerp_score"] +columns = [159,165,67,82,155] +ops = ["first","first","first","first","first"] + +# spliceAI +[[annotation]] +file = "spliceai_scores.masked.indel.hg38.vcf.gz" +fields = ["SpliceAI"] +names = ["spliceai_score"] +ops = ["self"] + +# spliceAI +[[annotation]] +file = "spliceai_scores.masked.snv.hg38.vcf.gz" +fields = ["SpliceAI"] +names = ["spliceai_score"] +ops = ["self"] + +# UCE +[[annotation]] +file = "13661_UCEs_over_100bp.hg38_sorted_LIFTOVER.bed.gz" +columns = [3] +names = ["UCE_100bp"] +ops = ["flag"] + +[[annotation]] +file="2175_UCEs_over_200bp.hg38_sorted_LIFTOVER.bed.gz" +columns = [3] +names = ["UCE_200bp"] +ops = ["flag"] + +#C4R WES counts and samples +[[annotation]] +file="C4R_wes_counts_LIFTED_rmspace_sorted.tsv.gz" +columns = [5,6] +names = ["c4r_wes_samples", "c4r_wes_counts"] +ops=["first", "first"] + +#AlphaMissense +[[annotation]] +file="AlphaMissense_hg38.tsv.gz" +columns = [9] +names = ["AlphaMissense"] +ops=["first"] diff --git a/vcfanno/crg.vcfanno.conf b/vcfanno/crg.vcfanno_GRCh37.conf similarity index 100% rename from vcfanno/crg.vcfanno.conf rename to vcfanno/crg.vcfanno_GRCh37.conf diff --git a/vcfanno/crg.vcfanno_hg38.conf b/vcfanno/crg.vcfanno_hg38.conf new file mode 100644 index 0000000..cbd95a4 --- /dev/null +++ b/vcfanno/crg.vcfanno_hg38.conf @@ -0,0 +1,192 @@ +#gnomad v.2.1.1 exomes (lifted over) +[[annotation]] +file="gnomad_exome.2.1.1_LIFTOVER.vcf.gz" +fields=["AC","nhomalt","AF_popmax","AN","AC_male"] +names=["gnomad_ac_es","gnomad_hom_es","gnomad_af_es","gnomad_an_es","gnomad_male_ac_es"] +ops=["self","self","self","self","self"] + +#gnomad v.3.1.2 genomes (native hg38) +[[annotation]] +file="gnomad_genomes_v.3.1.2.vcf.gz" +fields=["AC", "nhomalt","AF_popmax","AN","AC_XY"] +names=["gnomad_ac_gs", "gnomad_hom_gs","gnomad_af_gs","gnomad_an_gs","gnomad_male_ac_gs"] +ops=["self","self","self","self","self"] + +[[postannotation]] 
+fields=["gnomad_ac_es","gnomad_ac_gs"] +op="sum" +name="gnomad_ac" +type="Integer" + +[[postannotation]] +fields=["gnomad_hom_es","gnomad_hom_gs"] +op="sum" +name="gnomad_hom" +type="Integer" + +[[postannotation]] +fields=["gnomad_af_es","gnomad_af_gs"] +op="max" +name="gnomad_af_popmax" +type="Float" + +[[postannotation]] +fields=["gnomad_an_es","gnomad_an_gs"] +op="sum" +name="gnomad_an" +type="Integer" + +[[postannotation]] +fields=["gnomad_ac","gnomad_an"] +op="div2" +name="gnomad_af" +type="Float" + +[[postannotation]] +fields=["gnomad_male_ac_es","gnomad_male_ac_gs"] +op="sum" +name="gnomad_male_ac" +type="Integer" + +[[annotation]] +file="dbsnp-151.vcf.gz" +fields=["ID"] +names=["rs_ids"] +ops=["concat"] + +[[annotation]] +file="clinvar.vcf.gz" +fields=["CLNSIG","CLNREVSTAT"] +names=["clinvar_pathogenic", "clinvar_status"] +ops=["concat", "concat"] + +# convert 5 to 'pathogenic', 255 to 'unknown', etc. +[[postannotation]] +fields=["clinvar_pathogenic"] +op="lua:clinvar_sig(clinvar_pathogenic)" +name="clinvar_sig" +type="String" + +#CADD 1.7 - SNVs +[[annotation]] +file = "CADD1.7/whole_genome_SNVs.tsv.gz" +names = ["CADD_phred"] +columns = [6] +ops = ["self"] + +#CADD 1.7 - indels +[[annotation]] +file = "CADD1.7/gnomad.genomes.r4.0.indel.tsv.gz" +names = ["CADD_phred"] +columns = [6] +ops = ["self"] + +#dbNSFP v3.4 +[[annotation]] +file = "dbNSFPv4.2a_combined_hg38.txt.gz" +names = ["phyloP30way_mammalian","phastCons30way_mammalian","Vest4_score","REVEL_score","Gerp_score"] +columns = [159,165,67,82,155] +ops = ["self","self","self","self","self"] + +# spliceAI +[[annotation]] +file = "spliceai_scores.masked.indel.hg38.vcf.gz" +fields = ["SpliceAI"] +names = ["spliceai_score"] +ops = ["self"] + +# spliceAI +[[annotation]] +file = "spliceai_scores.masked.snv.hg38.vcf.gz" +fields = ["SpliceAI"] +names = ["spliceai_score"] +ops = ["self"] + +# UCE +[[annotation]] +file = "13661_UCEs_over_100bp.hg38_sorted_LIFTOVER.bed.gz" +columns = [3] +names = ["UCE_100bp"] +ops = ["flag"] + +[[annotation]] +file="2175_UCEs_over_200bp.hg38_sorted_LIFTOVER.bed.gz" +columns = [3] +names = ["UCE_200bp"] +ops = ["flag"] + +[[annotation]] +file="wgEncodeRegDnaseClusteredV3_hg38_sorted_LIFTOVER.bed.gz" +columns=[3] +names=["DNaseI_hypersensitive_site"] +ops=["flag"] + +[[annotation]] +file="wgEncodeUwTfbsGm12864CtcfStdRawRep3_sorted_LIFTOVER.bed.gz" +columns=[3] +names=["CTCF_binding_site"] +ops=["flag"] + + +#Enhancer +[[annotation]] +file="Enhancer.hg38.all.celltypes.formatted_sorted_LIFTOVER.bed.gz" +columns = [6] +names = ["ENH_cellline_tissue"] +ops = ["self"] + +#TFBS +[[annotation]] +file="wgEncodeRegTfbsClusteredV3_hg38_sorted_LIFTOVER.bed.gz" +columns = [4] +names = ["tf_binding_sites"] +ops=["concat"] + +#ncER score +[[annotation]] +file="GRCh38_ncER_perc.bed.gz" +columns = [4] +names = ["ncER"] +ops=["max"] + +#LinSight Score +[[annotation]] +file = "GRCh38_LinSight.bed.gz" +names = ["LinSight_Score"] +columns = [4] +ops = ["max"] + +#ReMM scores +[[annotation]] +file = "ReMM.v0.4.hg38.tsv.gz" +names = ["ReMM"] +columns = [3] +ops = ["max"] + +#C4R WES counts and samples +[[annotation]] +file="C4R_wes_counts_LIFTED_rmspace_sorted.tsv.gz" +columns = [5,6] +names = ["c4r_wes_samples", "c4r_wes_counts"] +ops=["first", "first"] + +#C4R WGS counts and samples +[[annotation]] +file="C4R_genome_counts_sorted_LIFTED.tsv.gz" +columns = [5,6] +names = ["c4r_wgs_samples", "c4r_wgs_counts"] +ops=["first", "first"] + +#AlphaMissense +[[annotation]] +file="AlphaMissense_hg38.tsv.gz" +columns = [9] +names = 
["AlphaMissense"] +ops=["first"] + +#GreenDB +[[annotation]] +file="GRCh38_GREEN-DB.bed.gz" +columns = [5,9,13] +names = ["GreenDB_variant_type", "GreenDB_closest_gene", "GreenDB_controlled_gene"] +ops=["self", "self", "self"] diff --git a/wrappers/gatk/CalibrateDragstrModel/environment.yaml b/wrappers/gatk/CalibrateDragstrModel/environment.yaml new file mode 100644 index 0000000..efc32f1 --- /dev/null +++ b/wrappers/gatk/CalibrateDragstrModel/environment.yaml @@ -0,0 +1,6 @@ +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/CalibrateDragstrModel/wrapper.py b/wrappers/gatk/CalibrateDragstrModel/wrapper.py new file mode 100644 index 0000000..9536b51 --- /dev/null +++ b/wrappers/gatk/CalibrateDragstrModel/wrapper.py @@ -0,0 +1,12 @@ +from snakemake.shell import shell +import os + +log = snakemake.log_fmt_shell(stdout=True, stderr=True) + +shell( + "gatk CalibrateDragstrModel " + "-R {snakemake.input.ref} " + "-I {snakemake.input.bam} " + "--str-table-path {snakemake.input.dragstr_table} " + "-O {snakemake.output[0]} {log}" +) diff --git a/wrappers/gatk/baserecalibrator/environment.yaml b/wrappers/gatk/baserecalibrator/environment.yaml index 3510cd8..ebd93b0 100644 --- a/wrappers/gatk/baserecalibrator/environment.yaml +++ b/wrappers/gatk/baserecalibrator/environment.yaml @@ -3,5 +3,5 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 - - openjdk =8 \ No newline at end of file + - gatk4 ==4.2.6.1 + - openjdk =8 diff --git a/wrappers/gatk/combinegvcfs/environment.yaml b/wrappers/gatk/combinegvcfs/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/combinegvcfs/environment.yaml +++ b/wrappers/gatk/combinegvcfs/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/dragenhaplotypecaller/environment.yaml b/wrappers/gatk/dragenhaplotypecaller/environment.yaml new file mode 100644 index 0000000..efc32f1 --- /dev/null +++ b/wrappers/gatk/dragenhaplotypecaller/environment.yaml @@ -0,0 +1,6 @@ +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/dragenhaplotypecaller/meta.yaml b/wrappers/gatk/dragenhaplotypecaller/meta.yaml new file mode 100644 index 0000000..8278afb --- /dev/null +++ b/wrappers/gatk/dragenhaplotypecaller/meta.yaml @@ -0,0 +1,15 @@ +name: gatk HaplotypeCaller +description: | + Run gatk HaplotypeCaller. +authors: + - Johannes Köster + - Jake VanCampen +input: + - bam file +output: + - GVCF file +notes: | + * The `java_opts` param allows for additional arguments to be passed to the java compiler, e.g. "-Xmx4G" for one, and "-Xmx4G -XX:ParallelGCThreads=10" for two options. + * The `extra` param alllows for additional program arguments. 
+ * For more information, see https://software.broadinstitute.org/gatk/documentation/article?id=11050 + diff --git a/wrappers/gatk/dragenhaplotypecaller/test/Snakefile b/wrappers/gatk/dragenhaplotypecaller/test/Snakefile new file mode 100644 index 0000000..f65b9eb --- /dev/null +++ b/wrappers/gatk/dragenhaplotypecaller/test/Snakefile @@ -0,0 +1,15 @@ +rule haplotype_caller: + input: + # single or list of bam files + bam="mapped/{sample}.bam", + ref="genome.fasta" + # known="dbsnp.vcf" # optional + output: + gvcf="calls/{sample}.g.vcf", + log: + "logs/gatk/haplotypecaller/{sample}.log" + params: + extra="", # optional + java_opts="", # optional + wrapper: + "master/bio/gatk/haplotypecaller" diff --git a/wrappers/gatk/dragenhaplotypecaller/test/genome.dict b/wrappers/gatk/dragenhaplotypecaller/test/genome.dict new file mode 100644 index 0000000..128d1d5 --- /dev/null +++ b/wrappers/gatk/dragenhaplotypecaller/test/genome.dict @@ -0,0 +1,3 @@ +@HD VN:1.5 +@SQ SN:ref LN:45 M5:7a66cae8ab14aef8d635bc80649e730b UR:file:/home/johannes/scms/snakemake-wrappers/bio/picard/createsequencedictionary/test/genome.fasta +@SQ SN:ref2 LN:40 M5:1636753510ec27476fdd109a6684680e UR:file:/home/johannes/scms/snakemake-wrappers/bio/picard/createsequencedictionary/test/genome.fasta diff --git a/wrappers/gatk/dragenhaplotypecaller/test/genome.fasta b/wrappers/gatk/dragenhaplotypecaller/test/genome.fasta new file mode 100644 index 0000000..afe990a --- /dev/null +++ b/wrappers/gatk/dragenhaplotypecaller/test/genome.fasta @@ -0,0 +1,4 @@ +>ref +AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT +>ref2 +aggttttataaaacaattaagtctacagagcaactacgcg diff --git a/wrappers/gatk/dragenhaplotypecaller/test/genome.fasta.fai b/wrappers/gatk/dragenhaplotypecaller/test/genome.fasta.fai new file mode 100644 index 0000000..3daa621 --- /dev/null +++ b/wrappers/gatk/dragenhaplotypecaller/test/genome.fasta.fai @@ -0,0 +1,2 @@ +ref 45 5 45 46 +ref2 40 57 40 41 diff --git a/wrappers/gatk/dragenhaplotypecaller/test/mapped/a.bam b/wrappers/gatk/dragenhaplotypecaller/test/mapped/a.bam new file mode 100644 index 0000000..a407ae2 Binary files /dev/null and b/wrappers/gatk/dragenhaplotypecaller/test/mapped/a.bam differ diff --git a/wrappers/gatk/dragenhaplotypecaller/test/mapped/a.bam.bai b/wrappers/gatk/dragenhaplotypecaller/test/mapped/a.bam.bai new file mode 100644 index 0000000..a9b9979 Binary files /dev/null and b/wrappers/gatk/dragenhaplotypecaller/test/mapped/a.bam.bai differ diff --git a/wrappers/gatk/dragenhaplotypecaller/wrapper.py b/wrappers/gatk/dragenhaplotypecaller/wrapper.py new file mode 100644 index 0000000..f724b5a --- /dev/null +++ b/wrappers/gatk/dragenhaplotypecaller/wrapper.py @@ -0,0 +1,34 @@ +__author__ = "Johannes Köster" +__copyright__ = "Copyright 2018, Johannes Köster" +__email__ = "johannes.koester@protonmail.com" +__license__ = "MIT" + + +import os + +from snakemake.shell import shell + +known = snakemake.input.get("known", "") +if known: + known = "--dbsnp " + known + +extra = snakemake.params.get("extra", "") +java_opts = snakemake.params.get("java_opts", "") +bams = snakemake.input.bam +regions = snakemake.input.get("regions", "") +if regions: + extra = extra + " -L {}".format(regions) + +if isinstance(bams, str): + bams = [bams] +bams = list(map("-I {}".format, bams)) + +log = snakemake.log_fmt_shell(stdout=True, stderr=True) +shell( + "gatk --java-options '{java_opts}' HaplotypeCaller {extra} " + "-R {snakemake.input.ref} {bams} " + "-ERC GVCF " + "--dragen-mode " +# "--dragstr-params-path 
{snakemake.input.dragstr_parameters} " + "-O {snakemake.output.gvcf} {known} {log}" +) diff --git a/wrappers/gatk/genotypegvcfs/environment.yaml b/wrappers/gatk/genotypegvcfs/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/genotypegvcfs/environment.yaml +++ b/wrappers/gatk/genotypegvcfs/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/haplotypecaller/environment.yaml b/wrappers/gatk/haplotypecaller/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/haplotypecaller/environment.yaml +++ b/wrappers/gatk/haplotypecaller/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/mutect/environment.yaml b/wrappers/gatk/mutect/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/mutect/environment.yaml +++ b/wrappers/gatk/mutect/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/selectvariants/environment.yaml b/wrappers/gatk/selectvariants/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/selectvariants/environment.yaml +++ b/wrappers/gatk/selectvariants/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/splitncigarreads/environment.yaml b/wrappers/gatk/splitncigarreads/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/splitncigarreads/environment.yaml +++ b/wrappers/gatk/splitncigarreads/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/variantfiltration/environment.yaml b/wrappers/gatk/variantfiltration/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/variantfiltration/environment.yaml +++ b/wrappers/gatk/variantfiltration/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/gatk/variantrecalibrator/environment.yaml b/wrappers/gatk/variantrecalibrator/environment.yaml index 17b64b9..efc32f1 100644 --- a/wrappers/gatk/variantrecalibrator/environment.yaml +++ b/wrappers/gatk/variantrecalibrator/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - gatk4 ==4.1.4.1 + - gatk4 ==4.2.6.1 diff --git a/wrappers/platypus/environment.yaml b/wrappers/platypus/environment.yaml index e5c8521..aec09a4 100644 --- a/wrappers/platypus/environment.yaml +++ b/wrappers/platypus/environment.yaml @@ -5,3 +5,4 @@ channels: dependencies: - platypus-variant ==0.8.1.2 - vcflib ==1.0.0_rc2 + - picard ==2.9.2 diff --git a/wrappers/platypus/wrapper.py b/wrappers/platypus/wrapper.py index ad7f2ce..cce537c 100644 --- a/wrappers/platypus/wrapper.py +++ b/wrappers/platypus/wrapper.py @@ -24,6 +24,10 @@ params = snakemake.params if snakemake.params else "" +# need sequence dictionary for picard sort +# sometimes Platypus VCFs are not properly sorted, even after vcfstreamsort +seq_dict = snakemake.input.ref.replace(".fa", 
".dict") + log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( @@ -32,5 +36,6 @@ "--output={snakemake.output} && " "vcfallelicprimitives -t DECOMPOSED --keep-geno {snakemake.output} " "| vcffixup - | vcfstreamsort > temp && " - "mv temp {snakemake.output} {log} " + "picard -Xmx2g SortVcf I=temp O=temp.sort.vcf SEQUENCE_DICTIONARY={seq_dict} && " + "rm temp && mv temp.sort.vcf {snakemake.output} {log} " ) diff --git a/wrappers/samtools/fastq/wrapper.py b/wrappers/samtools/fastq/wrapper.py index 5aceb1d..c8db8a9 100644 --- a/wrappers/samtools/fastq/wrapper.py +++ b/wrappers/samtools/fastq/wrapper.py @@ -22,29 +22,41 @@ if input_type == ["bam"]: print("File type is BAM") bam_file = units.loc[sample, "bam"] + dest=os.path.join(f"fastq/{family}_{sample}.bam") + + copy_cmd = " cp {bam_file} {dest} {log}; " + if 'resarchivezone' in bam_file: # BAM is stored in iRods archive on hpf + copy_cmd = " module load irods_client; iget {bam_file} {dest} {log}; " + + shell("(" + copy_cmd + ")") + sort_cmd = "" - bamtofastq_cmd = " samtools fastq {bam_file} -1 fastq/{family}_{sample}_R1.fastq.gz -2 fastq/{family}_{sample}_R2.fastq.gz -s /dev/null; " + bamtofastq_cmd = " samtools fastq {dest} -1 fastq/{family}_{sample}_R1.fastq.gz -2 fastq/{family}_{sample}_R2.fastq.gz -s /dev/null; " if snakemake.params.sort_check: print("Checking sort_order") - bam = pysam.AlignmentFile(bam_file, "rb") + bam = pysam.AlignmentFile(dest, "rb") header = bam.header sort_order = header["HD"]["SO"] bam.close() if sort_order != "queryname": print("Sorting by queryname") - sort_cmd = "samtools sort -n -@ {snakemake.threads} -T {snakemake.params.outdir}/{snakemake.wildcards.sample} {bam_file} |" + sort_cmd = "samtools sort -n -@ {snakemake.threads} -T {snakemake.params.outdir}/{snakemake.wildcards.sample} {dest} |" bamtofastq_cmd = "samtools fastq -1 fastq/{family}_{sample}_R1.fastq.gz -2 fastq/{family}_{sample}_R2.fastq.gz -s /dev/null; " + + rm_cmd="rm fastq/{family}_{sample}.bam;" - shell("(" + sort_cmd + bamtofastq_cmd + ") {log}") + shell("(" + sort_cmd + bamtofastq_cmd + rm_cmd + ") {log}") elif input_type == ["cram"]: print("File type is CRAM") cram_file = units.loc[sample, "cram"] dest = os.path.join(f"fastq/{family}_{sample}.cram") - copy_cmd = " cp {cram_file} {dest}; " + copy_cmd = " cp {cram_file} {dest} {log}; " + if 'resarchivezone' in cram_file: # CRAM is stored in iRods archive on hpf + copy_cmd = " module load irods_client; iget {cram_file} {dest} {log}; " get_header_cmd = " samtools view -H {dest} > fastq/{family}_{sample}_header.sam; " diff --git a/wrappers/verifybamid2/environment.yaml b/wrappers/verifybamid2/environment.yaml index e88eb44..200f592 100644 --- a/wrappers/verifybamid2/environment.yaml +++ b/wrappers/verifybamid2/environment.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - verifybamid2==1.0.6 + - verifybamid2==2.0.1