From c507e93fe493c1efe1c23003e740a22aaf0451a2 Mon Sep 17 00:00:00 2001 From: Owen Marshall Date: Wed, 16 Jun 2021 18:17:31 +1000 Subject: [PATCH 1/2] Initial docs commit --- docs/.nojekyll | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/.nojekyll diff --git a/docs/.nojekyll b/docs/.nojekyll new file mode 100644 index 0000000..e69de29 From 5b923a9a073415ba92c52c93bb087ab92fee192c Mon Sep 17 00:00:00 2001 From: Owen Marshall Date: Mon, 25 Nov 2024 19:36:06 +1100 Subject: [PATCH 2/2] Updated to release v1.6 --- README.md | 120 +++++++++++--- changelog => changelog.md | 68 +++++--- damidseq_pipeline | 324 ++++++++++++++++++++++++++++---------- 3 files changed, 376 insertions(+), 136 deletions(-) rename changelog => changelog.md (57%) diff --git a/README.md b/README.md index dafe81d..63872f7 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,14 @@ [damidseq_pipeline](https://github.com/owenjm/damidseq_pipeline/releases) is a single script that automatically handles sequence alignment, read extension, binned counts, normalisation, pseudocount addition and final ratio file generation. The script uses FASTQ or BAM files as input, and outputs the final log2 ratio files in bedGraph format. ## Features -* Fully automated processing of NGS DamID-seq datasets, from FASTQ input to bedGraph output -* Handles both single- and paired-end datasets +* Fully automated processing of all NGS DamID-seq datasets, from FASTQ input to bedGraph output +* Automatic grouping and processing of multiple experimental and replicate batches +* Automatically detects and processs both single- and paired-end datasets * Can be used with either FASTQ or pre-aligned BAM input files * Multiple methods of normalisation provided -* As of v1.5.3 and greater, can also handle and process ChIP-seq NGS data +* Optional generation of outputs for CATaDa processing +* Can also handle and process ChIP-seq or CUT&RUN NGS data +* Free and open-source software maintained by the [Marshall lab](https://marshall-lab.org) ## Citation @@ -16,13 +19,15 @@ If you find this software useful, please cite: Marshall OJ and Brand AH. (2015) damidseq_pipeline: an automated pipeline for processing DamID sequencing datasets. *Bioinformatics.* 31(20): 3371--3. ([pubmed](http://www.ncbi.nlm.nih.gov/pubmed/26112292); [full text, open access](https://academic.oup.com/bioinformatics/article/31/20/3371/196153)) +Please note that damidseq_pipeline has now evolved well beyond the functionality described in that article. + # Download and installation [Download the latest version](https://github.com/owenjm/damidseq_pipeline/releases) of the pipeline script and associated files. Prebuilt GATC fragment files used by the script are available for the following genomes: -* [*Drosophila melanogaster* r5.57](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/Dmel_r5.57.GATC.gff.gz) * [*D. melanogaster* r6](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/Dmel_BDGP6.GATC.gff.gz) +* [*Drosophila melanogaster* r5.57](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/Dmel_r5.57.GATC.gff.gz) * [*Mus musculus* GRCm38](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/MmGRCm38.GATC.gff.gz) or * [Human GRCh38](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/HsGRCh38.GATC.gff.gz). @@ -36,7 +41,13 @@ Prebuilt GATC fragment files used by the script are available for the following ## Installation -1. Extract the pipeline script archive, make the damid_pipeline file executable and place it in your path +1. Extract the pipeline script archive, make the `damid_pipeline` file executable and place it in your path + ```bash + # Very simple way to do this in a *nix environment, + # Change to the directory with the extracted files and: + chmod a+x damidseq_pipeline + sudo cp damidseq_pipeline /usr/local/bin/ + ``` 1. Install [Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) 1. Obtain Bowtie 2 indices provided by [Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) or [Illumina's iGenome](http://support.illumina.com/sequencing/sequencing_software/igenome.html) @@ -44,20 +55,20 @@ Prebuilt GATC fragment files used by the script are available for the following 1. Download the latest FASTA genome primary_assembly (or toplevel) file from [Ensembl](http://ftp.ensembl.org/pub/current_fasta/) e.g. [the current release for *Mus musculus*](http://ftp.ensembl.org/pub/current_fasta/mus_musculus/dna/) - (alternatively, for *Drosophila*, download from [the Flybase FTP site](ftp://ftp.flybase.net/releases/current/) + (alternatively, for *Drosophila*, download from [the Flybase FTP site](ftp://ftp.flybase.net/releases/current/)) 1. Extract the .gz file - 1. Run bowtie2-build in the directory containing the extracted .fasta file. For the examples above: + 1. Run bowtie2-build in the directory containing the extracted .fasta file. For example: bowtie2-build Mus_musculus.GRCm38.dna.primary_assembly.fa GRCm38 bowtie2-build dmel-all-chromosome-r5.57.fasta dmel_r5.57 1. Install [SAMtools](http://samtools.sourceforge.net) 1. Download a pre-built GATC fragment file for - * [*D. melanogaster* r5.57](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/Dmel_r5.57.GATC.gff.gz) * [*D. melanogaster* r6](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/Dmel_BDGP6.GATC.gff.gz) + * [*D. melanogaster* r5.57](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/Dmel_r5.57.GATC.gff.gz) _(... surely nobody's still using release 5, though, right?)_ * [*Mus musculus* GRCm38](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/MmGRCm38.GATC.gff.gz) or * [Human GRCh38](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/HsGRCh38.GATC.gff.gz). - Alternatively build your own: + Alternatively, build your own: 1. Download the FASTA genome sequence, as in step 3 above (no need to extract the gzipped files) 1. Run the provided [gatc.track.maker.pl](http://github.com/owenjm/damid_pipeline/blob/master/gatc.track.maker.pl?raw=true) script on the fasta sequence, e.g.: @@ -87,24 +98,70 @@ Once run once with these options and correct values, the paths will be saved for # Using damidseq_pipeline -Run damidseq_pipeline in a directory containing sequencing files in FASTQ or BAM format. The default behaviour is to process all files in FASTQ format, and if none are found, all files in BAM format. +Run damidseq_pipeline in a directory containing sequencing files in FASTQ or BAM format. The default behaviour is to process all files in FASTQ format, and if none are found, all files in BAM format. + +By default, the pipeline will process all files found in the current working directory. To process files in a different directory, specify the path with the `--datadir` command-line switch (from v1.6). -Alternatively, individual files may be specified on the command line if the user does not wish to process all available files present in the directory (for example, if the sequencing lane contained multiple replicates). +Alternatively, individual files may be specified on the command line if the user does not wish to process all available files present in the directory (but there is little reason to do this, as from v1.6 the pipeline can correctly group and process multiple replicates and experiments from the one batch of files). ## Sample names -Sample names are assigned from filenames. If a single filename being processed begins with "Dam", this will be assigned as the Dam-only control. +Sample names are assigned from filenames. Ideally, start each sample with the name of the protein being profiled: if you do this, everything else will be easy. + +If a single filename being processed begins with "Dam", this will be assigned as the Dam-only control. + +If no sample filename, or multiple filenames, begin with "Dam", use `--dam=[filename]` to specify the Dam-only control sample manually. If a Dam-only control cannot be automatically determined, damidseq_pipeline will exit and prompt you to specify one. (But, please, save yourself the trouble and just start your filenames with the protein name. You'll thank me later.) + +## Processing multiple experiments and replicates + +As of v1.6, damidseq_pipeline will group files into different experiments and replicates, and handle this automatically. (There's no more need for complex snakemake workarounds or shell-scripted loops if you just want to align and process a set of samples.) -If no sample filename, or multiple filenames, begin with "Dam", use `--dam=[filename]` to specify the Dam-only control sample manually. If a Dam-only control cannot be automatically determined, damidseq_pipeline will exit and prompt you to specify one. +Experimental and replicate group detection relies on at least two parameters: +* `--exp_prefix`: the common characters immediately preceding the experiment name (default `_`) +* `--rep_prefix`: the common characters that prefix the replicate number (default `_n`) + +However, if you have additional information between the experiment name and the replicate designation, you can also set: +* `--exp_suffix`: the common characters immediately following the experiment name (takes the value of `--rep_prefix` when unset) + +In the Marshall lab, we typically use the following naming format: + + `[protein]_[celltype]-[experiment number]_n[replicate]` + +Thus, the following filenames will all work with the default values above: + +``` +Dpn_NSCs-OM1_n1 +Dpn_NSCs-OM1_n2 +Dam_NSCs-OM1_n1 +Dam_NSCs-OM1_n2 +HP1a_Neurons-OM2_n1 +Dam_Neurons-OM2_n1 +``` + +But you can designate your own character strings to fit your own naming conventions if you need to, using the command-line switches above. + +We think this makes processing much easier (especially these days when you can multiplex 50+ samples in a lane of sequencing). But, if you prefer to keep things simple and want the old v1.5.3 and earlier functionality back, you can always run with the command-line option `--nogroups` and everything will be like it was before. +{: .notice--info} ## Paired-end sequencing files -To process paired-end FASTQ files, use the `--paired` option and the pipeline will search for, and match, paired reads. +As of v1.6, damidseq_pipeline will automatically detect paired-end or single-end reads and process these appropriately. You can happily mix read types within a single processing run if you want or need to. -BAM files generated from paired-end data are automatically detected and processed, without requiring this option. +What form of sequencing do we suggest? Paired-end sequencing is always better if you can afford it (and really there should be very little price difference these days). Paired-ends provides more accurate alignments and allows significantly better alignment within repetitive regions. +{: .notice--info} + +If you're using an earlier version of the pipeline and don't want to update, use the `--paired` command-line option and the pipeline will search for, and match, paired reads. But we recommend updating to the latest version (or if there's a bug preventing you from updating, let us know!). + +## Generating coverage bedGraph files for CATaDa + +As of v1.6, the `--catada` command-line option will output binned coverage tracks (in RPM, reads per million mapped reads, by default) for individual BAM or FASTQ files and exit (i.e. no ratio files will be generated). (Earlier versions provided the same functionality with the `--just_coverage` flag, which is still present and works identically to `--catada`.) -## Processing ChIP-seq data -As of v1.5.3, damidseq_pipeline can also handle ChIP-seq data via the `--chipseq` flag. This option will remove PCR duplicate reads, only process uniquely mapping reads, and output binned coverage tracks in RPM (reads per million mapped reads). +To generate ratios _and_ output coverage files, use the `--coverage` flag instead, and get two wishes in one. -Warning -- do not use this option with DamID-seq data. DamID-seq is all about the PCR duplicates! +Coverage bedGraphs from Dam-only files generated in this way can be used for [Chromatin Accessibilty TaDa (CATaDa)](https://elifesciences.org/articles/32341) processing. + +## Processing ChIP-seq or CUT&RUN data +As of v1.5.3 and later, damidseq_pipeline can also handle ChIP-seq or CUT&RUN data via the `--chipseq` flag. This option will remove PCR duplicate reads, only process uniquely mapping reads, and output binned coverage tracks in RPM (reads per million mapped reads). + +Warning -- do not use this option, or attempt to remove PCR duplicates, with DamID-seq data. **DamID-seq is _all_ about the PCR duplicates!!** We can't emphasise this enough. {: .notice--warning} ## Other options @@ -144,18 +201,35 @@ The final output will be a single ratio file: Sample-vs-Dam.gatc.bedgraph. The . The [bedGraph format](http://genome.ucsc.edu/goldenpath/help/bedgraph.html) is used by default. The pipeline script can output the final ratio files in [GFF format](http://www.ensembl.org/info/website/upload/gff.html) instead if the `--output_format=gff` command-line switch is used. -### Visualising the DNA binding profiles +## Visualising DNA binding profiles The bedgraph output files can be can viewed directly in genome browsers such as [IGV](http://www.broadinstitute.org/software/igv/). For publication-quality figures, we recommend [pyGenomeTracks](https://pygenometracks.readthedocs.io/). -### Calling significant peaks from the data +## Calling significant peaks from the data The [find_peaks](http://github.com/owenjm/find_peaks) software will process the output .gatc.bedgraph ratio file and call significant peaks present in the dataset. Please see the find_peaks page for more details. -### Calling transcribed genes from RNA pol II datasets +## Calling transcribed genes from RNA pol II datasets The [polii.gene.call](http://github.com/owenjm/polii.gene.call) Rscript will call transcribed genes (i.e. gene bodies with significantly enriched pol II occupancy) from the output .gatc.bedgraph file. Please see the polii.gene.call page for more details. -### Other useful scripts and utilities +## Comparative protein binding, gene expression and chromatin accessibility analysis + +We should have a comprehensive downstream damidBind R package for downstream processing of protein binding, Pol II occupancy and CATaDa chromatin accessibility released shortly. Watch this space. + +# DamID, TaDa, FlyORF-TaDa, NanoDam and CATaDa + +No matter what flavour of DamID you're using, if there's a DNA Adenine Methylase enzyme involved, `damidseq_pipeline` is the tool to use. + +For every technique other than CATaDa, process your samples as per normal. + +For CATaDa, use `--catada` to generate chromatin accessibility coverage bedGraphs. + +# Reporting issues, feature requests, and bugs + +Please log these via the [damidseq_pipeline GitHub site](https://github.com/owenjm/damidseq_pipeline/). + +# DamID protocols and reagents + +For our latest lab protocol for Targeted DamID, advice and base plasmid files and sequences, please see our [dedicated page on TaDa](https://marshall-lab.org/tada) for more details. -A collection of useful R and Perl scripts for comparing and analysing DamID-seq data [is maintained here](http://github.com/owenjm/damid_misc). diff --git a/changelog b/changelog.md similarity index 57% rename from changelog rename to changelog.md index fa55b0e..7f5cf3a 100644 --- a/changelog +++ b/changelog.md @@ -1,16 +1,32 @@ - -[v1.5.3] -* New feature: --coverage flag outputs coverage bedGraph track of each sample (useful with Dam-only tracks for CATaDa analysis). --just_coverage exits after coverage tracks are written. -* New feature: Kernel density normalisation routines sped up ~10x via Inline::C. This is enabled by default, but will only work if the Inline::C module is installed; damidseq_pipeline will fall back on the Perl coded routine if this is not present. If you want to control explicitly, do so via the --kde_inline flag. -* New feature: ChIP-seq-specific processing routines added: --remdups removes PCR duplicates, --unique only maps unique reads. Enable all of these with --chipseq (also turns of GATC-level binning, GATC-based read extension and will only output coverage tracks not ratios) +## v1.6 +* New feature: automatic grouping and processing of experiments and replicates within the set of input files (if you don't want to use this, run with `--nogroups` to restore old v1.5.3 functionality). This removes the need for any higher-level scripting of sample processing, and means that a large set of disparate samples can be processed with just a single command. Experimental and replicate group detection relies on three parameters: + * `--exp_prefix`: the common characters immediately preceding the experiment name (default `_`) + * `--exp_suffix`: the common characters immediately following the experiment name (if unset, takes the value of `--rep_prefix`) + * `--rep_prefix`: the common characters that prefix the replicate number (default `_n`) +* New feature: dry-run with `--n` to check experiment and replicate groupings prior to execution +* New feature: automatic detection of PE or SE sequencing inputs. No need to explicitly use CLI flags anymore to specify; `damidseq_pipeline` will detect these automatically. This should work out of the box for most sequencing facility outputs, but if you need to adjust the detection parameters use `--paired_match` to change the paired-end prefix search string. Including mixed PE and SE inputs in the same processing run is now completely fine. +* New feature: use `--datadir` to specify the directory of FASTQs or BAMs to process. +* New feature: optionally, do not process and overwrite previously generated BAM files when reprocessing a large number of samples. Use `--ncbam` to enable this (ensure you manually delete any partly-aligned BAM files first if using this; the pipeline applies no sanity checks to detect part-aligned files). +* New feature: `--catada` flag is an alias for `--just_coverage` (as apparently the built-in CATaDa file generation functionality wasn't very transparent in the old release) +* Modified: Inline::C normalisation code compilation now saves the compiled C code to the user's .config directory for reuse +* Fixed: explicitly prevent any form of read extension when using PE sequencing data +* Fixed: GATC fragment fence-post error corrected (fragments in output bedgraphs no longer have a 1bp overlap) +* Some minor code refactoring to allow experimental and replicate groupings + + +## v1.5.3 +* New feature: `--coverage` flag outputs coverage bedGraph track of each sample (useful with Dam-only tracks for CATaDa analysis). `--just_coverage` exits after coverage tracks are written. +* New feature: Kernel density normalisation routines sped up ~10x via Inline::C. This is enabled by default, but will only work if the Inline::C module is installed; damidseq\_pipeline will fall back on the Perl coded routine if this is not present. If you want to control explicitly, do so via the `--kde_inline` flag. +* New feature: ChIP-seq-specific processing routines added: `--remdups` removes PCR duplicates, `--unique` only maps unique reads. Enable all of these with `--chipseq` (also turns of GATC-level binning, GATC-based read extension and will only output coverage tracks not ratios) * New feature: New normalisation method, "rawbins". This writes the normalisation data to TSV format and runs a user-provided script for custom normalisation. Your script should read the data file and output the normalisation factor. Advanced users only, use with caution. -* New feature: consider all occupancy as RPM (reads per million mapped reads), set as default. Control with --scores_as_rpm flag. +* New feature: consider all occupancy as RPM (reads per million mapped reads), set as default. Control with `--scores_as_rpm` flag. * Changed: bowtie2 command handling refactored more elegantly -* Added: ability to add custom flags to bowtie2 command with --bowtie2_add_flags +* Added: ability to add custom flags to bowtie2 command with `--bowtie2_add_flags` * Fixed: small changes to read mapping code for more accurate mapping -* Changed: filename filters are now turned off by default (use --no_file_filters=0 to turn back on) +* Changed: filename filters are now turned off by default (use `--no_file_filters=0` to turn back on) + -[v1.4.6] +## v1.4.6 * New feature: paired-end reads are now handled natively at the fastq level. Use the --paired flag to enable, and your paired reads should be automagically detected and aligned * Changed: BAM file format is now used natively without SAM intermediates at all levels (also fixes the recent samtools version handling bug) * Changed: paired-end reads are selected based on the 0x2 bit on the SAM file format bitwise FLAG (reads aligned in proper pair) @@ -19,58 +35,58 @@ * Added: --keep_original_bams flag to retain the non-extended reads BAM file when using single-end sequencing * Changed: warning on missing chromosomal identities in GATC alignment made more friendly and less confusing -[v1.4.5] +## v1.4.5 * Added --ps_debug option to explicitly display pseudocounts calculation -[v1.4.4] +## v1.4.4 * Fixed issue with some externally-generated BAM files -[v1.4.3] +## v1.4.3 * Better handling of chromosome names -[v1.4.2] +## v1.4.2 * Small improvements to SAM file handling -[v1.4.1] +## v1.4.1 * Fixed issues with samtools v1.3 (this version of samtools introduced backwards incompatibilities when using the 'sort' function. damidseq_pipeline now checks for the version number and should support all versions of samtools.) -[v1.4] +## v1.4 * New read-extension method: by default, reads are now only extended as far as the next GATC fragment. Use --extension_method=full to disable this feature and extend every read by the value of --len. * Output format is now bedgraph by default. Use --output_format=gff to restore the previous default. Changing the default to bedgraph allows users to create TDF tracks directly within the graphical IGV tools, making it easier for end users. * Minor code cleanups -[v1.3] +## v1.3 * Major bugfix: reads from the minus strand were not being extended correctly during processing. The overall impact is minor (correlation between old and new read extension methods is >0.95) but this new method is technically more accurate. * added --keep_sam (do not delete the temporary SAM file) option * added --only_sam (do not generate BAM files) option (both options are intended for debug purposes only) -[v1.2.7] +## v1.2.7 * New opition --no_file_filters to prevent any filename trimming/filtering (by default input filenames are trimmed to the first underscore) * Small filename issue fixes -[v1.2.6] +## v1.2.6 * Now uses File::Basename to handle filenames * Fixed/cleaned up a number of rare problems with filename handling -[v1.2.5] +## v1.2.5 * Added --dam and --out_name options. Use these to set the Dam-only control sample and/or a custom output name * Added more sanity checks * Minor bugfixes -[v1.2.4] +## v1.2.4 * Added explicit checks for bowtie2 and samtools executables, and for bowtie2 output -[v1.2.3] +## v1.2.3 * Fixed a serious error in RPM normalisation calculations (values were inverted) -- please re-run on your samples if you have used this method on them * Minor code cleanups -[v1.2.1] +## v1.2.1 * Added ability to process BAM files generated from paired-end sequencing * Cleaner reporting of missing assembly fragments in GATC files * Some small bugs fixed * General code clean-ups -[v1.2] +## v1.2 * Completely re-written normalisation routine based on kernel density estimation * Genomic coverage is now calculated internally rather than using bedtools (uses much less memory, is slightly faster, and drops the requirement for an external binned windows file) * Binned window files are no longer required (bins are calculated automatically using the sequence information provided in the BAM headers, and the bin size specified by the --bins command-line option) @@ -78,14 +94,14 @@ * Memory optimisation for large files (greatly reduces usage for processing mouse/human data) * Added ability to process BAM files directly * Much better file-handling all round (now takes sample names directly from filenames by default; the option to use an index.txt file remains but is essentially deprecated) -* New option: --norm_method=[kde/rpm] "kde" is the default method using kernel density estimation; "rpm" normalises solely on readcounts/million reads only (the "rpm" method is not recommended except for the very rare cases in which a Dam-fusion protein fails to methylate accessible genomic regions, making kde normalisation is inappropriate) +* New option: --norm_method=## kde/rpm "kde" is the default method using kernel density estimation; "rpm" normalises solely on readcounts/million reads only (the "rpm" method is not recommended except for the very rare cases in which a Dam-fusion protein fails to methylate accessible genomic regions, making kde normalisation is inappropriate) * Re-written --help output rountines (better formatted and more informative) * Ability to read gzipped GATC files -* Ability to save sets of defaults to enable quick switching between different genomes (use --save_defaults=[name]; use --load_defaults=[name] to load; use load_defaults=list to list current available options) +* Ability to save sets of defaults to enable quick switching between different genomes (use --save_defaults=## name; use --load_defaults=## name to load; use load_defaults=list to list current available options) * New location for config files (in ~/.config/damid_pipeline/). Existing config file will be migrated automatically * Various small bugfixes and code clean-ups ** NB: a number of default parameters have changed with this release. It is strongly advised to reset all parameters to the default value with --reset_defaults. -[v1.0] +## v1.0 * Initial release diff --git a/damidseq_pipeline b/damidseq_pipeline index 1a0f34c..c385747 100755 --- a/damidseq_pipeline +++ b/damidseq_pipeline @@ -25,8 +25,8 @@ use 5.010; $|++; -my $version = "1.5.3"; -print STDERR "\ndamidseq_pipeline v$version\nCopyright 2013-22, Owen Marshall\n\n"; +my $version = "1.6"; +print STDOUT "\ndamidseq_pipeline v$version\nCopyright 2013-24, Owen Marshall\n\n"; # Global options my %vars = ( @@ -54,6 +54,7 @@ my %vars = ( 'output_format'=>'bedgraph', 'method_subtract' => 0, 'paired' => 0, + 'paired_match' => 'R|_', 'scores_as_rpm' => 1, 'pseudocounts' => 0, 'ps_factor' => 10, @@ -63,6 +64,7 @@ my %vars = ( 'norm_steps' => 300, 'just_align' => 0, 'just_coverage' => 0, + 'catada' => 0, 'coverage' => 0, 'norm_method' => 'kde', 'save_defaults' => 0, @@ -80,6 +82,13 @@ my %vars = ( 'unique' => 0, 'kde_inline' => 1, 'chipseq' => 0, + 'ncbam' => 0, + 'n' => 0, + 'exp_prefix' => '_', + 'exp_suffix' => '', + 'rep_prefix' => '_n', + 'nogroups' => 0, + 'datadir' => '', ); my %vars_details = ( @@ -107,7 +116,8 @@ my %vars_details = ( 'output_format' => 'Output tracks in this format [gff/bedgraph]', 'method_subtract' => 'Output values are (Dam_fusion - Dam) instead of log2(Dam_fusion/Dam) (not recommended)', 'pseudocounts' => 'Add this value of psuedocounts instead (default: optimal number of pseudocounts determined algorithmically)', - 'paired' => 'Process paired-end reads', + 'paired' => 'Process paired-end reads (deprecated: PE/SE detection should happen automagically)', + 'paired_match' => 'characters to search for that, directly in front of [1|2], that marks paired-reads (regex format)', 'scores_as_rpm' => 'Calculate coverage scores (before normalisation) as reads per million (RPM)', 'ps_factor' => 'Value of c in c*(reads/bins) formula for calculating pseudocounts (default = 10)', 'ps_debug' => 'Print extra debugging info on pseudocount calculations in log file', @@ -116,6 +126,7 @@ my %vars_details = ( 'norm_steps' => 'Number of points in normalisation routine (default = 300)', 'just_align' => 'Just align the FASTQ files, generate BAM files, and exit', 'just_coverage' => 'Align, generate BAMs and calculate coverage; do generate ratio files', + 'catada' => 'Generate coverage files for CATaDa. Alias for --just_coverage', 'coverage' => 'Save individual coverage bedgraphs', 'norm_method' => "Normalisation method to use; options are:\n\rkde: use kernel density estimation of log2 GATC fragment ratio (default)\n\rrpm: use readcounts per million reads (not recommended for most use cases)\n\rrawbins: process raw counts with external normalisation command (set with --rawbins_cmd)", 'save_defaults' => "Save runtime parameters as default\n\r(provide a name to differentiate different genomes -- these can be loaded with 'load_defaults')", @@ -133,7 +144,13 @@ my %vars_details = ( 'unique' => "Only map uniquely aligning reads", 'kde_inline' => 'Use Inline::C for KDE normalisation calculations', 'chipseq' => "Process ChIP-seq data\n\r(Equivalent to --just_coverage --as_gatc=0 --remdups --unique --extension_method=full --full_data_files)", - + 'ncbam' => "Don't clobber BAM files, skip if existing", + 'n' => "Dry-run only; do not process files", + 'exp_prefix' => "Common text string immediately preceding experiment name\n\r(e.g. if filename is Dam_Exp1_n1.fq.gz, use _ as the prefix)", + 'exp_suffix' => "Common text string immediately proceding experiment name\n\r(Takes --rep_prefix if not set; or, if filename is Dam_Exp1_some-other-text_n1.fq.gz, use _ as the suffix)", + 'rep_prefix' => "Common text string denoting replicates\n\r(e.g. if filename is Dam_Exp1_n1.fq.gz, use _n as the rep_prefix)", + 'nogroups' => "Do not try to group experiments and replicates, and just process all samples as per v1.5.3 and earlier", + 'datadir' => 'Process all files in this directory', ); my @vars_unsaved = ( @@ -142,8 +159,10 @@ my @vars_unsaved = ( 'load_defaults', 'just_align', 'just_coverage', + 'catada', 'dam', - 'out_name' + 'out_name', + 'nogroups' ); # Global variables @@ -169,6 +188,8 @@ my %seg; my %full_tracks; my %counts; my %bins; +my %exps; +my %reps; my $HOME = (getpwuid($<))[7]; my $pi = 3.1415926536; @@ -194,6 +215,9 @@ read_defaults(); parameter_check(); process_cli(1); +# Files check +check_input_files(); + # CLI processing check_paths(); save_defaults(); @@ -202,54 +226,35 @@ save_defaults(); init_log_file(); # Final sanity checks -final_sanity_check(); - -# Read input files -process_input_files(); +sanity_check(); # Check executables are present in path executable_check(); - -############################# -### Pipeline workflow -### - -align_sequences(); +# Load GATC frags if required load_gatc_frags() unless (!$vars{'as_gatc'} || (($vars{'extension_method'} eq 'full' || $vars{'paired'}) && $vars{'just_align'})); -if ($vars{'extension_method'} eq 'full') { - extend_reads_full(); -} else { - extend_reads_gatc(); -} - -exit 0 if $vars{'just_align'}; +find_groups(); -calc_bins(); +# Exit if dry-run +exit 0 if ($vars{'n'}); -if ($vars{'write_raw'}) { - generate_raw_files(); - exit 0; -} - -if ($vars{'just_coverage'} || $vars{'coverage'}) { - generate_cov_files(); - exit 0 if $vars{'just_coverage'}; -} - -if ($vars{'norm_method'} eq 'kde') { - find_quants(); - find_norm_factor(); -} - -if ($vars{'norm_method'} eq 'rawbins') { - find_norm_factor_raw(); +# Now loop through samples +if ($vars{'nogroups'}) { + # Either all samples if old functionality is requested + main_pipeline(@in_files); +} else { + # Or using new automatic groupings + map { + my $e=$_; + map { + my @group = @{ $reps{$e}{$_} }; + printout ("\n*** Now working on $e | Replicate $_\n\n"); + main_pipeline(@group); + } (sort keys %{$reps{$e}}); + } (sort keys %reps); } -normalize(); -generate_ratio(); - printout("\nAll done.\n\n"); exit 0; @@ -259,6 +264,84 @@ exit 0; ### Subroutines start here ### +sub main_pipeline { + # Pipeline workflow + + my @in_files_rep = @_; + %files=(); + $damname=""; + + process_input_files(@in_files_rep); + + align_sequences(); + + if (!$vars{'paired'}) { + # Do not extend reads if using PE sequencing (as we know the exact fragment length) + # I cannot see a reason to not hardcode in this sanity check, but if there are edge + # cases please let me know + if ($vars{'extension_method'} eq 'full') { + extend_reads_full(); + } else { + extend_reads_gatc(); + } + } + + return if $vars{'just_align'}; + + calc_bins(); + + if ($vars{'write_raw'}) { + generate_raw_files(); + return; + } + + if ($vars{'just_coverage'} || $vars{'coverage'}) { + generate_cov_files(); + return if $vars{'just_coverage'}; + } + + if ($vars{'norm_method'} eq 'kde') { + find_quants(); + find_norm_factor(); + } + + if ($vars{'norm_method'} eq 'rawbins') { + find_norm_factor_raw(); + } + + normalize(); + generate_ratio(); +} + +sub find_groups { + return if $vars{'nogroups'}; + + map { + my $f = basename($_); + my $e = $f =~(/$vars{'exp_prefix'}(.*?)$vars{'exp_suffix'}/) ? $1 : 'unmatched'; + push(@{$exps{$e}},$_); + } @in_files; + + foreach my $e (keys %exps) { + map { + my $f = basename($_); + my $r = $f =~ (m/.*?$vars{'rep_prefix'}.*?(\d)/) ? $1 : 'unrep'; + push( @{$reps{$e}{$r}}, $_ ); + } @{ $exps{$e} }; + } + + printout("Found the following groups:\n"); + map { + my $e = $_ || "default"; + printout("\nExperiment $e:"); + + map { + printout("\n Rep $_:\n ", join("\n ", @{ $reps{$e}{$_} }), "\n"); + } (sort keys %{$reps{$e}}); + + } (keys %reps); +} + sub check_paths { # Check paths if ($vars{'gatc_frag_file'} && !(-e "$vars{'gatc_frag_file'}")) { @@ -325,7 +408,7 @@ sub executable_check { } } -sub final_sanity_check { +sub sanity_check { if ($vars{'norm_method'} eq 'rawbins') { unless ($vars{'rawbins_cmd'}) { print STDERR "Error: rawbins normalisation method selected, but no custom command supplied.\n\nPlease use the --rawbins_cmd to specify the command to provide normalisation factors from raw data.\n\n"; @@ -340,6 +423,10 @@ sub final_sanity_check { $vars{'just_coverage'} = 1; } + if ($vars{'catada'}) { + $vars{'just_coverage'} = 1; + } + unless ($vars{'as_gatc'}) { $vars{'extension_method'}="full"; $vars{'full_data_files'}=1; @@ -347,6 +434,12 @@ sub final_sanity_check { } $vars{'markdup'} = 1 if $vars{'remdups'}; + + # if not explicitly set, exp_suffix is set to rep_prefix + $vars{'exp_suffix'} ||= $vars{'rep_prefix'}; + + # if not explicitly set, datadir is . + $vars{'datadir'} ||= '.' } @@ -382,33 +475,36 @@ sub process_cli { } } +sub check_input_files { + # CLI files + # if no files specified, process all .gz files in directory. + unless (@in_files) { + @in_files = glob("$vars{'datadir'}*.fastq.gz"); + @in_files = glob("$vars{'datadir'}*.gz") unless scalar(@in_files); + @in_files = glob("$vars{'datadir'}*.fastq") unless scalar(@in_files); + @in_files = glob("$vars{'datadir'}*.bam") unless scalar(@in_files); + if ($vars{'bamfiles'}) { + @in_files = glob("$vars{'datadir'}*.bam"); + } + } +} + sub process_input_files { + my @in_files_rep = @_; # Check for user-specified dam control if ($vars{'dam'}) { - push @in_files, $vars{'dam'} + push @in_files_rep, $vars{'dam'} } # Get unique list of infiles - @in_files = uniq(@in_files); + @in_files_rep = uniq(@in_files_rep); # Index file my $index_file = "index.txt"; - # CLI files - # if no files specified, process all .gz files in directory. - unless (@in_files) { - @in_files = glob("*.fastq.gz"); - @in_files = glob("*.gz") unless scalar(@in_files); - @in_files = glob("*.fastq") unless scalar(@in_files); - @in_files = glob("*.bam") unless scalar(@in_files); - if ($vars{'bamfiles'}) { - @in_files = glob("*.bam"); - } - } - - unless (@in_files) { + unless (@in_files_rep) { printout("ERROR: no FASTQ or BAM files found (and no files specified on the command-line).\n(Use --help to see command-line options ...)\n\n"); exit 1; } @@ -426,7 +522,7 @@ EOT if ($vars{'bamfiles'}) { # Bamfiles - foreach my $l (@in_files) { + foreach my $l (@in_files_rep) { my ($name,$dir,$ext) = fileparse($l, qr/\.[^.]*/); next unless $ext =~ m/bam/; @@ -435,8 +531,9 @@ EOT $files{$name}[0]=$l; check_dam_name($name, $l); } - } elsif (@in_files && !(-e $index_file)) { - foreach my $l (@in_files) { + } elsif (@in_files_rep && !(-e $index_file)) { + my %skipped_files; + foreach my $l (@in_files_rep) { my ($name_long,$dir,$ext) = fileparse($l, qr/\..*/); my $name; if ($vars{'no_file_filters'}) { @@ -448,13 +545,35 @@ EOT $vars{'bamfiles'} = 1 if $ext =~ /bam/; - if ($vars{'paired'} && !$vars{'bamfiles'}) { - $vars{'extend_reads'} = 0; + if (!$vars{'bamfiles'}) { + # $vars{'extend_reads'} = 0; # find paired end fastq files my ($pmatch) = $name_long =~ m/(.*[R|_])[1|2](?=[_\d]*$)/; unless ($pmatch) { - print "Error: $name_long not paired? File ignored ...\n"; + printout(" $name_long not paired. Assuming SE sequencing ...\n"); + if ($files{$name}) { + #same name already in use + my $i = 1; + while ($files{"$name.$i"}) { + $i++; + } + $name = "$name.$i"; + } + $files{$name}[0]=$l; + next; + } else { + #printout(" Paired-end reads detected ..."); + $vars{'paired'} = 1; + $vars{'extend_reads'} = 0; + } + + my $name_nopmatch = $pmatch; + chop($name_nopmatch); + if ($vars{'ncbam'} && (-e "$name_nopmatch.bam")) { + next if $skipped_files{"$name_nopmatch"}; + $skipped_files{"$name_nopmatch"}++; + print " $name_nopmatch.bam exists: skipping ...\n"; next; } @@ -468,7 +587,7 @@ EOT } elsif ($name_long =~ m/$pmatch2/) { $files{$name}[1]=$l; } else { - die("Error: paired-end reads set but cannot find paired-end flag: $name_long\nExiting.\n\n"); + die("Error: paired-end reads detected but cannot find paired-end flag: $name_long\nExiting.\n\n"); } } else { if ($files{$name}) { @@ -479,11 +598,17 @@ EOT } $name = "$name.$i"; } + if ($vars{'ncbam'} && (-e "$name.bam")) { + next if $skipped_files{"$name"}; + $skipped_files{"$name"}++; + print " $name.bam exists: skipping ...\n"; + next; + } $files{$name}[0]=$l; } } - foreach my $name (sort(keys %files)) { + foreach my $name (sort keys %files) { unless ($vars{'just_align'} || $vars{'just_coverage'}) { check_dam_name($name, @{$files{$name}}[0]); } @@ -494,6 +619,8 @@ EOT } } elsif (-e $index_file) { + print STDERR "*** Warning: the use of an explict index file is deprecated. You can still use this option, but some new replicate matching functionality may be missing.\n\n"; + open (NORM, "<$index_file") || die "Unable to open index file for reading: $!\n"; my @norm = ; close NORM; @@ -515,7 +642,7 @@ EOT } printout("\n*** Matching adaptors ***\n"); - foreach my $l (@in_files) { + foreach my $l (@in_files_rep) { print "$l\n"; ## Change this next line's regexp to match your sequencing format (currently matches, e.g. "Index6" or "A006") @@ -587,7 +714,7 @@ sub read_defaults { if ($vars{'load_defaults'} eq 'list') { print_defaults_files(); } else { - unless (-e "$HOME/.config/damid_pipeline/defaults.$vars{'load_defaults'}") { + unless (-e "$HOME/.config/damid_pipelinef/defaults.$vars{'load_defaults'}") { print STDERR "Error: cannot find saved defaults file for '$vars{'load_defaults'}'\n\n"; print_defaults_files(); } @@ -680,7 +807,7 @@ sub align_sequences { if ($vars{'bowtie'}) { printout("\n*** Aligning files with bowtie2 ***\n"); - foreach my $fn (keys %files) { + foreach my $fn (sort keys %files) { my $samtools_pipe = "$vars{'samtools_path'}samtools view -Shb - > $fn.bam"; if ($vars{'markdup'}) { @@ -719,7 +846,7 @@ sub extend_reads_full { # Extend reads if ($vars{'extend_reads'}) { printout("*** Extending reads to $vars{'len'} bases (full extension) ***\n"); - foreach my $fn (keys %files) { + foreach my $fn (sort keys %files) { printout("Reading input file: $fn ...\n"); open (IN, "$vars{'samtools_path'}samtools view -h $fn.bam |") || die "Unable to read $fn: $!\n"; @@ -787,7 +914,7 @@ sub extend_reads_gatc { # Extend reads if ($vars{'extend_reads'}) { printout("*** Extending reads up to $vars{'len'} bases ***\n"); - foreach my $fn (keys %files) { + foreach my $fn (sort keys %files) { printout("Reading input file: $fn ...\n"); open (IN, "$vars{'samtools_path'}samtools view -h $fn.bam |") || die "Unable to read $fn: $!\n"; @@ -912,7 +1039,7 @@ sub extend_reads_gatc { sub calc_bins { printout("*** Calculating bins ***"); - foreach my $n (keys %files) { + foreach my $n (sort keys %files) { my $fn = $vars{'bamfiles'} ? $files{$n}[0] : $vars{'extend_reads'} ? "$n-ext$vars{'len'}" : $n; $fn =~ s/\.bam$//; @@ -1044,7 +1171,7 @@ sub calc_bins { exit 1; } } else { - printout(" Error: chromsome definitions between bam and GATC file do not match!\n"); + printout(" Error: chromosome definitions between bam and GATC file do not match!\n"); } } } @@ -1089,7 +1216,7 @@ sub calc_bins { $index++; # Get two adjacent GATC sites my ($mida) = @{$gatc{$chr}}[$i]; - my ($midb) = @{$gatc{$chr}}[$i+1]; + my ($midb) = @{$gatc{$chr}}[$i+1] - 1; # avoid overlaps if ($index%100 == 0) { my $pc = sprintf("%0.0f",($index*100)/$#gatc_simple); @@ -1146,7 +1273,7 @@ sub calc_bins { sub normalize { printout("\n*** Normalising ***\n"); - foreach my $n (keys %files) { + foreach my $n (sort keys %files) { next if $n eq $damname; # we don't normalise the Dam control printout("Processing sample: $n ...\n"); @@ -1190,15 +1317,15 @@ sub normalize { } sub printout { - my $s = shift; - print STDERR $s; + my $s = join("",@_); + print STDOUT $s; print LOG $s; } sub generate_ratio { printout("\n*** Generating ratios ***\n"); - foreach my $n (keys %files) { + foreach my $n (sort keys %files) { next if $n eq $damname; printout("Now working on $n ...\n"); @@ -1210,7 +1337,7 @@ sub generate_ratio { sub generate_raw_files { printout("\n*** Writing raw files ***\n"); - foreach my $n (keys %files) { + foreach my $n (sort keys %files) { next if $n eq $damname; printout("Now working on $n ...\n"); @@ -1222,7 +1349,7 @@ sub generate_raw_files { sub generate_cov_files { printout("\n*** Writing coverage files ***\n"); - foreach my $n (keys %files) { + foreach my $n (sort keys %files) { printout("Now working on $n ...\n"); my $write_gatc = $vars{'as_gatc'} ? 1 : 0; write_cov($n, $write_gatc); @@ -1354,7 +1481,7 @@ sub write_file { my $fout=$fname.( $vars{'output_format'} =~ m/^bed/ ? 'bedgraph' : 'gff' ); if ($vars{'scores_as_rpm'}) { - printout(sprintf(" ... adding %0.2fe-3 pseudocounts to each sample\n",$pseudocounts*1e3)); + printout(sprintf(" ... adding %0.2f pseudocounts to each sample\n",$pseudocounts)); } else { printout(sprintf(" ... adding %0.2f pseudocounts to each sample\n",$pseudocounts)); } @@ -1413,7 +1540,7 @@ sub find_norm_factor_raw { generate_raw_files(); - foreach my $n (keys %files) { + foreach my $n (sort keys %files) { next if $n eq $damname; my $protname = $vars{'out_name'} ? $vars{'out_name'} : $n; @@ -1430,7 +1557,7 @@ sub find_norm_factor_raw { sub find_norm_factor { printout("*** Calculating normalisation factor ***\n"); - foreach my $n (keys %files) { + foreach my $n (sort keys %files) { next if $n eq $damname; printout("Now working on $n ...\n"); @@ -1511,7 +1638,7 @@ sub qsc { sub find_quants { printout("\n*** Calculating quantiles ***\n"); - for my $prot (keys %files) { + for my $prot (sort keys %files) { my @frags; printout("Now working on $prot ...\n"); @@ -1753,6 +1880,9 @@ sub help { write(); } print STDOUT "\n"; + + print STDOUT "More details and latest version available at https://owenjm.github.io/damidseq_pipeline\n"; + exit 1; } @@ -1768,7 +1898,7 @@ sub load_gatc_frags { foreach () { my ($chr, $source, $type, $start, $end, $score, $b, $c, $name) = split('\t'); - my $mid = ($start+$end)/2; + my $mid = $start+2; # clean up an old fencepost snafu for good push (@gatc_simple, [$chr, $mid]); push @{$gatc{$chr}}, $mid; } @@ -1832,7 +1962,27 @@ sub uniq { BEGIN { eval { require Inline::C; - Inline->import('C' => q { + + my $HOME = (getpwuid($<))[7]; + unless (-d "$HOME/.config") { + mkdir("$HOME/.config"); + } + + unless (-d "$HOME/.config/damid_pipeline") { + mkdir("$HOME/.config/damid_pipeline"); + } + + unless (-d "$HOME/.config/damid_pipeline") { + mkdir("$HOME/.config/damid_pipeline/cache"); + } + + if (-d "$HOME/.config/damid_pipeline/cache") { + # set Inline::C compile directory + Inline->import(Config => DIRECTORY => "$HOME/.config/damid_pipeline/cache"); + } + + Inline->import( + 'C' => q { #include double kden_calc (double y, double h, double sq2pi, SV* terms) { int n = 0;