From 20c6ffa9ec1993e9e3cbf2267dcec29ea5faca10 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 22 Aug 2023 13:24:31 +0200 Subject: [PATCH 1/5] 2.2.1: fixes --- .gitignore | 1 + .gitpod.Dockerfile | 38 +++ .gitpod.yml | 65 +++- .gitpod.yml.bak | 4 + config/config_metadata.yaml | 15 +- github-actions-runner/docker_procedure.md | 40 +++ workflow/Snakefile | 8 +- workflow/envs/rtools.yaml | 5 + workflow/report/workflow.rst | 33 +- workflow/rules/common.smk | 21 +- workflow/scripts/GC/GC_correction.R | 307 +++++++----------- workflow/scripts/GC/counts_scaling.R | 32 -- workflow/scripts/GC/gc.R | 38 --- .../scripts/GC/library_size_normalisation.R | 115 +++++++ .../GC/variance_stabilizing_transformation.R | 302 ++++++++--------- workflow/scripts/utils/make_log_useful.py | 67 ++-- 16 files changed, 631 insertions(+), 460 deletions(-) create mode 100644 .gitpod.Dockerfile create mode 100644 .gitpod.yml.bak create mode 100644 github-actions-runner/docker_procedure.md delete mode 100644 workflow/scripts/GC/counts_scaling.R delete mode 100644 workflow/scripts/GC/gc.R create mode 100644 workflow/scripts/GC/library_size_normalisation.R diff --git a/.gitignore b/.gitignore index dc307c6a..5e501cbe 100644 --- a/.gitignore +++ b/.gitignore @@ -214,3 +214,4 @@ workflow/data/arbigent/scTRIP_segmentation.bed .tests/data_CHR17/RPE-BM510/bam/*.bam.raw .tests/data_CHR17/RPE-BM510/bam/*.bam.sort .tests/external_data/chr17.fa.log +LOGS_DEV/ diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile new file mode 100644 index 00000000..04dfb23b --- /dev/null +++ b/.gitpod.Dockerfile @@ -0,0 +1,38 @@ +FROM gitpod/workspace-full:2023-03-24-02-48-18 + +ENV RETRIGGER=4 + +ENV BUILDKIT_VERSION=0.11.6 +ENV BUILDKIT_FILENAME=buildkit-v${BUILDKIT_VERSION}.linux-amd64.tar.gz +ENV DAZZLE_VERSION=0.1.17 + +USER root + +# Install dazzle, buildkit and pre-commit +RUN curl -sSL https://github.com/moby/buildkit/releases/download/v${BUILDKIT_VERSION}/${BUILDKIT_FILENAME} | tar -xvz -C /usr +RUN curl -sSL https://github.com/gitpod-io/dazzle/releases/download/v${DAZZLE_VERSION}/dazzle_${DAZZLE_VERSION}_Linux_x86_64.tar.gz | tar -xvz -C /usr/local/bin +RUN curl -sSL https://github.com/mvdan/sh/releases/download/v3.5.1/shfmt_v3.5.1_linux_amd64 -o /usr/bin/shfmt \ + && chmod +x /usr/bin/shfmt +RUN install-packages shellcheck \ + && pip3 install pre-commit +RUN curl -sSL https://github.com/mikefarah/yq/releases/download/v4.22.1/yq_linux_amd64 -o /usr/bin/yq && chmod +x /usr/bin/yq + +# Apptainer installation +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:apptainer/ppa && \ + apt-get update && \ + apt-get install -y apptainer + +# Mambaforge installation +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \ + sh Mambaforge-Linux-x86_64.sh -b -p /home/gitpod/mambaforge && \ + /home/gitpod/mambaforge/bin/mamba create -n snakemake -c bioconda -c conda-forge snakemake && \ + /home/gitpod/mambaforge/bin/mamba init && \ + echo "source /home/gitpod/mambaforge/bin/activate" >> /home/gitpod/.bashrc && \ + echo "conda activate snakemake" >> /home/gitpod/.bashrc + +# Clean up to reduce image size +RUN apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + rm Mambaforge-Linux-x86_64.sh \ No newline at end of file diff --git a/.gitpod.yml b/.gitpod.yml index 18970d9a..34ef39b6 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,4 +1,65 @@ -image: condaforge/mambaforge:latest +image: + file: .gitpod.Dockerfile + +ports: + - port: 5000 + 
onOpen: ignore
+
 tasks:
+  - name: install pre-commit git hooks
+    init: pre-commit install
+  - name: start up buildkitd
+    command: |
+      sudo /usr/bin/buildkitd --debug --config ./buildkitd.toml --group gitpod
+  - name: start a local docker registry
+    command: |
+      mkdir -p /workspace/registry
+      docker run -p 5000:5000 --name registry --rm -v /workspace/registry:/var/lib/registry registry:2
+    openMode: split-right
+  - name: dazzle build and test
+    command: |
+      gp ports await 5000
+      REPO=localhost:5000/dazzle
+      echo "To build specific chunks and combine them 'time ./build-chunk.sh -c chunk1 -c chunk2:variant1.2.3 -n combo'"
+      echo "To build all the chunks and combinations 'time ./build-all.sh'"
+      echo "To build a specific combination 'time ./build-combo.sh combo'"
+      echo "To list image chunks 'dazzle project image-name $REPO'"
+      echo "To list hashes for image chunks 'dazzle project hash $REPO'"
+      echo "To print the combined image manifest 'dazzle project manifest $REPO'"
+
+    openMode: tab-after
+
+  # MOSAICATCHER
   - name: Create snakemake env
-    command: mamba create -n snakemake -c bioconda -c conda-forge snakemake=7.18.2 -y
+    command: mamba create -n snakemake -c bioconda -c conda-forge snakemake -y
+
+  # - name: Run mosaicatcher
+  #   command: |
+  #     snakemake --configfile .tests/config/simple_config.yaml \
+  #       --config ashleys_pipeline=True MultiQC=True \
+  #       --profile workflow/snakemake_profiles/local/conda/ \
+  #       --conda-frontend mamba \
+  #       --cores 8 --forceall
+
+  # - name: Generate Dockerfile
+  #   command: |
+  #     snakemake --configfile .tests/config/simple_config.yaml \
+  #       --config ashleys_pipeline=True MultiQC=True \
+  #       --profile workflow/snakemake_profiles/local/conda/ \
+  #       --conda-frontend mamba \
+  #       --cores 8 --containerize
+
+vscode:
+  extensions:
+    - ms-azuretools.vscode-docker
+    - timonwong.shellcheck
+
+github:
+  prebuilds:
+    master: true
+    branches: true
+    pullRequests: true
+    pullRequestsFromForks: true
+    addCheck: true
+    addComment: true
diff --git a/.gitpod.yml.bak b/.gitpod.yml.bak
new file mode 100644
index 00000000..18970d9a
--- /dev/null
+++ b/.gitpod.yml.bak
@@ -0,0 +1,4 @@
+image: condaforge/mambaforge:latest
+tasks:
+  - name: Create snakemake env
+    command: mamba create -n snakemake -c bioconda -c conda-forge snakemake=7.18.2 -y
diff --git a/config/config_metadata.yaml b/config/config_metadata.yaml
index 16669821..054f84a5 100644
--- a/config/config_metadata.yaml
+++ b/config/config_metadata.yaml
@@ -23,7 +23,7 @@ MultiQC:
   type: bool
   required: False
   default: False
-multistep_normalisation_analysis:
+multistep_normalisation:
   desc: "Enable / Disable multistep normalisation"
   type: bool
   required: False
@@ -38,11 +38,6 @@ ashleys_threshold:
   type: bool
   required: False
   default: 0.5
-plate_orientation:
-  desc: "Plate orientation for GC analysis (landscape/portrait)"
-  type: string
-  required: False
-  default: landscape
 window:
   desc: "Mosaic bin window size"
   type: int
@@ -73,7 +68,7 @@ samples_to_process:
  type: list
  required: True
  default: "[]"
-normalized_counts:
+hgsvc_based_normalized_counts:
  desc: Normalize or not mosaic counts
  type: bool
  required: False
@@ -116,3 +111,9 @@ scNOVA:
  required: False
  default: False
  lint_check: False
+genecore_prefix:
+  desc: "Genecore shared input folder prefix"
+  type: str
+  required: False
+  default: "/g/korbel/shared/genecore"
+  lint_check: False
diff --git a/github-actions-runner/docker_procedure.md b/github-actions-runner/docker_procedure.md
new file mode 100644
index 00000000..c2661b4b
--- /dev/null
+++ b/github-actions-runner/docker_procedure.md
@@ -0,0 +1,40 @@
+- Push 
latest changes to dev +- Open gitpod.io for repo +- Run snakemake for test dataset with all options + +``` +snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda_singularity/ -c 8 --forceall +``` + +- Run same command with --containerize +- Copy and paste Dockerfile into github-actions-runner folder of repo +- Open gitpod.io with docker/apptainer/mamba + +# To solve by preinstalling apptainer and mamba into base image + +Apptainer install: + +``` +sudo apt update +sudo apt install -y software-properties-common +sudo add-apt-repository -y ppa:apptainer/ppa +sudo apt update +sudo apt install -y apptainer +``` + +Mamba install: + +wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh +sh Mambaforge-Linux-x86_64.sh +/home/gitpod/mambaforge/bin/mamba create -n snakemake -c bioconda -c conda-forge snakemake +/home/gitpod/mambaforge/bin/mamba init +source ~/.bashrc +conda activate snakemake + +snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda/ -c 8 --forceall -n +snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda_singularity/ -c 8 --forceall + + +# Solved in + +https://github.com/weber8thomas/workspace-images/tree/docker-apptainer-mamba \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 4faa35cd..e8e15c2b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -2,8 +2,8 @@ from snakemake.utils import min_version min_version("7.5.0") - -configfile: "config/config.yaml" +configfile_location = "config/config.yaml" +configfile: configfile_location report: "report/workflow.rst" @@ -25,9 +25,9 @@ if config["ashleys_pipeline"] is True: github( "friendsofstrandseq/ashleys-qc-pipeline", path="workflow/Snakefile", - tag=str(config["ashleys_pipeline_version"]) + # tag=str(config["ashleys_pipeline_version"]) # branch="main", - # branch="dev", + branch="dev", ) config: config diff --git a/workflow/envs/rtools.yaml b/workflow/envs/rtools.yaml index e1580794..59c95e3d 100644 --- a/workflow/envs/rtools.yaml +++ b/workflow/envs/rtools.yaml @@ -48,3 +48,8 @@ dependencies: - r-tidyr - r-ggbeeswarm - r-pheatmap + # GC_correction + - r-ggpubr + - bioconductor-edger + # SOLVE R lib issue + - r-stringi=1.7.12 diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst index 5404e21d..ce78a056 100644 --- a/workflow/report/workflow.rst +++ b/workflow/report/workflow.rst @@ -3,25 +3,47 @@ MosaiCatcher v2 is a `Snakemake `__ pipeline that aims to detect Structural variants from single-cell Strand-seq data. 
-**Versions used:**
+**Versions used and general parameters:**
 
 * MosaiCatcher version used: {{ snakemake.config["version"] }}
+* Ashleys version used (if enabled): {{ snakemake.config["ashleys_pipeline_version"] }}
+* Samples processed in the folder: {{ snakemake.config["samples_to_process"] }}
 
 **Input/Output options:**
 
 * Folder processed: {{ snakemake.config["data_location"] }}
+* Publishdir defined: {{ snakemake.config["publishdir"] }}
+* Input BAM legacy (bam & selected; mutually exclusive with ashleys_pipeline): {{ snakemake.config["input_bam_legacy"] }}
 
-**Main options:**
+**Ashleys-QC parameters (if enabled):**
 
 * Ashleys-QC preprocessing pipeline enabled: {{ snakemake.config["ashleys_pipeline"] }}
 * Ashleys-QC preprocessing pipeline version used: {{ snakemake.config["ashleys_pipeline_version"] }}
-* BAM folder legacy format (all/selected) enabled: {{ snakemake.config["input_bam_legacy"] }}
+* Ashleys-QC preprocessing pipeline only (to stop after QC): {{ snakemake.config["ashleys_pipeline_only"] }}
+* Ashleys-QC threshold: {{ snakemake.config["ashleys_threshold"] }}
+* MultiQC enabled (triggers FastQC, samtools idxstats & flagstats): {{ snakemake.config["MultiQC"] }}
+
 
 **Counts option:**
 
 * Multistep normalisation enabled: {{ snakemake.config["multistep_normalisation"] }}
+* Multistep normalisation min reads / bin: {{ snakemake.config["multistep_normalisation_options"]["min_reads_bin"] }}
+* Multistep normalisation number of subsamples: {{ snakemake.config["multistep_normalisation_options"]["n_subsample"] }}
+* Multistep normalisation min reads / cell: {{ snakemake.config["multistep_normalisation_options"]["min_reads_cell"] }}
 * Read Counts normalization enabled: {{ snakemake.config["hgsvc_based_normalized_counts"] }}
 * Binning window size: {{ snakemake.config["window"] }}
+* Blacklist regions: {{ snakemake.config["blacklist_regions"] }}
+
+**SV calling:**
+
+* Multistep normalisation for SV calling enabled: {{ snakemake.config["multistep_normalisation_for_SV_calling"] }}
+
+**Downstream analysis modules:**
+
+* Arbitrary Genotyping (ArbiGent): {{ snakemake.config["arbigent"] }}
+* ArbiGent BED file: {{ snakemake.config["arbigent_bed_file"] }}
+* scNOVA: {{ snakemake.config["scNOVA"] }}
+
 
 **Reference genome & Chromosomes options:**
 
@@ -29,6 +51,11 @@ MosaiCatcher v2 is a `Snakemake `__ pipeline that a
 * Reference genome selected: {{ snakemake.config["reference"] }}
 * Reference FASTA file: {{ snakemake.config["references_data"][snakemake.config["reference"]]["reference_file_location"] }}
 
+**Other options:**
+
+* Split QC plot into individual files: {{ snakemake.config["split_qc_plot"] }}
+
+
 MosaiCatcher git repository: https://github.com/friendsofstrandseq/mosaicatcher-pipeline
 
 *Please cite:*
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index cb0dcdd3..fbbb1e4d 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,5 +1,6 @@
 import collections
 import pandas as pd
+import yaml
 
 # from scripts.utils import handle_input, make_log_useful, pipeline_aesthetic_start
 from scripts.utils import make_log_useful, pipeline_aesthetic_start
@@ -283,7 +284,8 @@
         complete_df_list = list()
 
         # List of folders/files to not consider (restrict to samples only)
-        l_to_process = [e for e in os.listdir(thisdir) if e not in exclude]
+        l_to_process = [e for e in os.listdir(thisdir) if e not in exclude and e.endswith(".zip") is False]
+        print(l_to_process)
         if config["samples_to_process"]:
             l_to_process = [e for e in l_to_process if e in 
config["samples_to_process"]] @@ -325,6 +327,7 @@ class HandleInput: complete_df = complete_df.sort_values(by=["Cell", "File"]).reset_index( drop=True ) + print(complete_df) return complete_df @@ -511,20 +514,22 @@ def get_mem_mb_heavy(wildcards, attempt): return mem_avail[attempt - 1] * 1000 -def onsuccess_fct(log): - make_log_useful.make_log_useful(log, "SUCCESS", config) +def onsuccess_fct(log): + config_metadata = config_definitions = yaml.safe_load(open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r")) + log_path_new = make_log_useful.make_log_useful(log, "SUCCESS", config, config_metadata) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {{log}}'.format( - config["version"], config["data_location"], config["email"] + 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {}'.format( + config["version"], config["data_location"], config["email"], log_path_new ) ) def onerror_fct(log): - make_log_useful.make_log_useful(log, "ERROR", config) + config_metadata = config_definitions = yaml.safe_load(open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r")) + log_path_new = make_log_useful.make_log_useful(log, "ERROR", config, config_metadata) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {{log}}'.format( - config["version"], config["data_location"], config["email"] + 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {}'.format( + config["version"], config["data_location"], config["email"], log_path_new ) ) diff --git a/workflow/scripts/GC/GC_correction.R b/workflow/scripts/GC/GC_correction.R index 030c7a48..50e7a107 100644 --- a/workflow/scripts/GC/GC_correction.R +++ b/workflow/scripts/GC/GC_correction.R @@ -1,60 +1,54 @@ -# fetch arguments -args = commandArgs(trailingOnly = T) - -# # checking arguments -# if (length(args) < 3) { -# message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz optional(plots.pdf)") -# stop() -# } -# for (a in seq(2)) { -# if (!file.exists(args[a])) { -# message(paste(args[a], "does not exists or cannot be found.")) -# stop() -# } -# } - -#args[1] <- '/data/r-workspace/strandseq_utils/H3JNHAFX3_MNIphotoconverted_200000_fixed.norm.txt.gz' -#args[2] <- '/data/r-workspace/strandseq_utils/GC_matrix_200000.txt' -print(snakemake@params[["gc_matrix"]]) +# SET ARGUMENTS +input_path <- snakemake@input[["counts_scaled"]] +gc_path <- snakemake@params[["gc_matrix"]] +save_path <- snakemake@output[["counts_scaled_gc"]] +plot <- TRUE +min_reads <- snakemake@params[["gc_min_reads"]] # <- 5 +n_subsample <- snakemake@params[["gc_n_subsample"]] # <- 1000 + +print(gc_path) + # open files -counts <- data.table::fread(snakemake@input[["counts_vst"]], header = T) -# counts <- data.table::fread(args[1], header = T) -GC_matrix <- data.table::fread(snakemake@params[["gc_matrix"]], header = T) -# GC_matrix <- data.table::fread(args[2], header = T) -save_path <- snakemake@output[["counts_vst_gc"]] -# save_path <- args[3] +counts <- data.table::fread(input_path, header = T) +GC_matrix <- data.table::fread(gc_path, header = T) + +# reformat GC_matrix +# find column containing GC counts and rename to 'GC%' + +idx <- which(grepl("GC", colnames(GC_matrix), fixed = TRUE)) +colnames(GC_matrix)[[idx]] <- "GC%" # check GC plots -# plot <- ifelse(is.na(args[4]), FALSE, TRUE) -plot <- FALSE -# if (plot) { -# # import libraries -# library(ggplot2) -# 
} +if (plot) { + # import libraries + library(ggplot2) + library(ggpubr) +} + # check files -if (!all(c('cell', 'chrom', 'start', 'end', 'w', 'c') %in% colnames(counts))) { +if (!all(c("cell", "chrom", "start", "end", "w", "c") %in% colnames(counts))) { message("count file does not contain required columns: 'cell', 'chrom', 'start', 'end', 'w', 'c'") message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz") stop() } -if (!all(c('chrom', 'start', 'end', 'GC%') %in% colnames(GC_matrix))) { +if (!all(c("chrom", "start", "end", "GC%") %in% colnames(GC_matrix))) { message("GC_matrix file does not contain required columns: 'chrom', 'start', 'end', 'GC%'") message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz") stop() } -if (! (all(unique(counts$chrom) %in% unique(GC_matrix$chrom)) & - all(unique(counts$start) %in% unique(GC_matrix$start)) & - all(unique(counts$end) %in% unique(GC_matrix$end)))) { +if (!(all(unique(counts$chrom) %in% unique(GC_matrix$chrom)) & + all(unique(counts$start) %in% unique(GC_matrix$start)) & + all(unique(counts$end) %in% unique(GC_matrix$end)))) { message("bin features ('crhom', 'start', 'end') do not match between count file and GC matrix") message("make sure to choose files with identical bin sizes") } # green light message -message(paste("\ncount file:", args[1])) -message(paste("GC matrix file:", args[2])) -message(paste("savepath:", args[3])) +# message(paste("\ncount file:", args[1])) +# message(paste("GC matrix file:", args[2])) +# message(paste("savepath:", args[3])) message("preprocessing...\n") @@ -67,181 +61,98 @@ counts$cell <- as.factor(counts$cell) # convert strandseq count file to count matrix counts$tot_count <- counts$c + counts$w -count_matrix <- reshape2::dcast(counts, chrom+start+end ~ cell, value.var = "tot_count") - -# add GC fractions to count_matrix not to lose matching order -count_matrix <- merge(count_matrix, GC_matrix[,c('chrom', 'start', 'end', 'GC%')], by=c('chrom', 'start', 'end'), all.x = T) -count_matrix$`GC%` <- as.numeric(count_matrix$`GC%`) - -# filter unavailable GC fractions -fil <- apply(count_matrix, MARGIN = 1, FUN = (function(x) any(is.na(x)))) -count_matrix.filt <- count_matrix[!fil,] -message(paste((dim(count_matrix)[1]-dim(count_matrix.filt)[1]), 'bins out of', dim(count_matrix.filt)[1], 'with NA values')) - ###################### # GC bias correction # ###################### -# set up the matrix -count_matrix.corrected <- data.frame(count_matrix.filt) - -chrom_ids <- gsub('chr', '', unique(count_matrix.corrected$chrom)) -autosomes <- !sapply(as.numeric(chrom_ids), is.na) -chr_order <- c(order(as.numeric(chrom_ids[autosomes])), sum(autosomes) + order(chrom_ids[!autosomes])) - -count_matrix.corrected$chrom <- factor(count_matrix.corrected$chrom, levels=paste('chr', chrom_ids[chr_order], sep='')) - -# collect plots -plots_meta <- data.frame() -plots <- list() -n <- 1 - -# correction chromosome by chromosome -for (chr in unique(count_matrix.corrected$chrom)) { - - # subset one chromosome - message(paste('correction', chr)) - count_matrix.sub <- count_matrix.corrected[count_matrix.corrected$chrom == chr,] - - # correction cell by cell - for (cell_n in seq(4,(ncol(count_matrix.sub)-1))) { - - # get counts and GC percentage for the current cell - cell_id <- colnames(count_matrix.sub)[cell_n] - cell_counts_raw <- as.numeric(count_matrix.sub[,(cell_id)]) - GC.frac_raw <- as.numeric(count_matrix.sub[,"GC."]) - - # filter out count 0 for better fitting - df <- 
data.frame(cbind(cell_counts_raw, GC.frac_raw)) - df <- df[df$cell_counts_raw > 0 & df$GC.frac_raw >.3, ] - cell_counts <- df$cell_counts_raw - GC.frac <- df$GC.frac_raw - - # logarithm of the counts centered on the mean - log.norm.counts_raw <- log2( (cell_counts_raw+1) / mean(cell_counts_raw+1) ) - log.norm.counts <- log2( (cell_counts+1) / mean(cell_counts+1) ) - - # check bins - if (length(GC.frac) < 10 | length(log.norm.counts) < 10) { - message(paste('skipping', chr, 'number of bins < 10')) - next - } - - # fit curve before correction - fit_before <- smooth.spline(GC.frac, log.norm.counts, df = 6) - if(plot) { - #plot_title <- 'before correction' - #plot(GC.frac, log.norm.counts, main=plot_title, xlab="GC%", ylab="log2(counts/mean)") - #lines(fit_before, col="red", lwd=2) - plotdf <- data.frame(log.norm.counts, GC.frac) - plotline <- data.frame(fit_before$x, fit_before$y) - - p <- ggplot(plotdf, aes(GC.frac, log.norm.counts)) + - geom_point(size=1, alpha=.2) + - ggtitle(paste(chr, "before")) + - geom_line(data = plotline, aes(fit_before.x, fit_before.y), color="red") - plots_meta[n, 1:4] <- list("index" = n, "chrom" = chr, "cell" = cell_id, "type" = "before") - plots[[n]] <- p - n <- n+1 - } - - # correct values for raw (pre-filtering) data - pred <- predict(fit_before, GC.frac_raw) - log.norm.corrected <- log.norm.counts_raw - pred$y - - # convert log of normalized counts to counts - counts.corrected <- (2 ^ log.norm.corrected) * mean(count_matrix.sub[,cell_id]+1) -1 - - # fit curve after correction - if(plot) { - # curve fit after correction is needed only for plotting - fit_after <- smooth.spline(GC.frac_raw, log.norm.corrected, df = 6) - #plot_title <- 'after correction' - #plot(GC.frac_raw, log.norm.corrected, main=plot_title, xlab="GC%", ylab="log2(counts/mean)") - #lines(fit_after, col="red", lwd=2) - - plotdf <- data.frame(log.norm.corrected, GC.frac_raw) - plotline <- data.frame(fit_after$x, fit_after$y) - p <- ggplot(plotdf, aes(GC.frac_raw, log.norm.corrected)) + - geom_point(size=1, alpha=.2) + - ggtitle(paste(chr, "after")) + - geom_line(data = plotline, aes(fit_after.x, fit_after.y), color="red") - plots_meta[n, 1:4] <- list("index" = n, "chrom" = chr, "cell" = cell_id, "type" = "after") - plots[[n]] <- p - n <- n+1 - } - - # write data to the table - count_matrix.corrected[count_matrix.corrected$chrom == chr, cell_id] <- counts.corrected - } -} +counts <- merge(counts, GC_matrix[, c("chrom", "start", "GC%")], by = c("chrom", "start"), all.x = T) -################### -# POST PROCESSING # -################### +# filter data for subsampling +c <- counts[counts$tot_count >= min_reads] +if (dim(c)[[1]] == 0) { + stop(paste("there are no bins with more than", min_reads, "reads")) +} +c$`GC%` <- as.numeric(c$`GC%`) +c$log_count_norm <- log(c$tot_count) - log(median(c$tot_count)) +not.na <- !is.na(c$`GC%`) +s <- c[not.na] + +# subsample from quantiles +s$GC_bin <- cut(s$`GC%`, breaks = c(quantile(s$`GC%`, probs = seq(0, 1, by = 1 / 10))), labels = seq(1, 10, by = 1), include.lowest = TRUE) + +subsample <- data.frame() +for (i in seq(10)) { + sbin <- s[s$GC_bin == i] + m <- min(dim(sbin)[1], n_subsample) + sa <- sbin[sample(nrow(sbin), size = m), ] + subsample <- rbind(subsample, sa) +} -# table of corrected data -counts.corrected <- reshape2::melt(count_matrix.corrected, id.vars = c("chrom", "start", "end"), - measure.vars = colnames(count_matrix.corrected)[seq(4,(ncol(count_matrix.corrected)-1))], - value.name = "GC_tot_count", variable.name='cell') 
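The per-chromosome `smooth.spline` correction deleted above is replaced, in the hunk that follows, by a single lowess fit of log-normalized counts against GC fraction on a quantile-binned subsample. As a quick illustration of that approach, here is a minimal self-contained sketch on synthetic data (all variable names, the bias shape, and the `+1` pseudocount are illustrative only; the script itself works on filtered per-bin Strand-seq counts):

```r
# Toy lowess-based GC correction, mirroring the logic introduced below:
# fit log(count / median) against GC fraction, then divide the fitted bias out.
set.seed(1)
gc <- runif(5000, 0.30, 0.60)            # per-bin GC fraction
bias <- exp(-8 * (gc - 0.45)^2)          # synthetic GC bias curve
cnt <- rpois(5000, lambda = 50 * bias)   # GC-biased read counts

log_norm <- log(cnt + 1) - log(median(cnt + 1))
fit <- lowess(gc, log_norm)              # smooth bias estimate on a sorted grid

# look up the fitted bias at the closest grid point for every bin
idx <- sapply(gc, function(g) which.min(abs(fit$x - g)))
corrected <- exp(log_norm - fit$y[idx]) * median(cnt + 1)
```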
+############################# +# lowess fit and correction # +############################# +# lowess fit +z <- lowess(subsample$`GC%`, subsample$log_count_norm) + +# ################ +# # SAVING PLOTS # +# ################ + +# adjust tot count to closest predicted GC value +idxs <- sapply(as.numeric(counts$`GC%`), FUN = function(a) { + which.min(abs(z$x - a)) +}) +idxs[lapply(idxs, length) == 0] <- NA +counts$pred <- z$y[unlist(idxs)] +counts$tot_count_gc <- log(counts$tot_count / median(counts$tot_count)) - counts$pred +counts$tot_count_gc <- exp(counts$tot_count_gc) * median(counts$tot_count) -# force cell column to factor -counts$cell <- as.factor(counts$cell) -counts.corrected$cell <- as.factor(counts.corrected$cell) +if (plot) { + sidxs <- sapply(as.numeric(subsample$`GC%`), FUN = function(a) { + which.min(abs(z$x - a)) + }) + sidxs[lapply(sidxs, length) == 0] <- NA -# merge corrected data to original data -counts.full <- merge(counts, counts.corrected, by=c("chrom", "start", "end", "cell"), all.x=T) -# ratio of the corrected and raw counts -counts.full$ratio <- counts.full$GC_tot_count/counts.full$tot_count -counts.full[is.na(counts.full$ratio)]$ratio <- 1 -counts.full[is.infinite(counts.full$ratio)]$ratio <- 0 + subsample$pred <- z$y[unlist(sidxs)] + subsample$tot_count_gc <- log(subsample$tot_count / median(subsample$tot_count)) - subsample$pred + subsample$tot_count_gc <- exp(subsample$tot_count_gc) * median(subsample$tot_count) -# adjust watson, crick and tot_counts -counts.full$w <- counts.full$w * counts.full$ratio -counts.full$c <- counts.full$c * counts.full$ratio -counts.full$tot_count <- counts.full$w + counts.full$c + z$y2 <- exp(z$y) * median(subsample$tot_count) + ymin <- min(cbind(subsample$tot_count, subsample$tot_count_gc)) + ymax <- max(cbind(subsample$tot_count, subsample$tot_count_gc)) -# saving -message("saving...\n") -data.table::fwrite(counts.full[,c('chrom', 'start', 'end', 'sample', 'cell', 'w', 'c', 'class', 'tot_count')], save_path) + p1 <- ggplot(subsample, aes(`GC%`, tot_count)) + + geom_point(size = 1, alpha = .2) + + ggtitle("raw") + + ylim(ymin, ymax) + + xlab("GC_content") + + ylab("read count") + + geom_line(data = as.data.frame(z), aes(x, y2), color = "red") + p2 <- ggplot(subsample, aes(`GC%`, tot_count_gc)) + + geom_point(size = 1, alpha = .2) + + ggtitle("gc corrected") + + ylim(ymin, ymax) + + xlab("GC content") + + ylab("read count") + + corr_plot <- ggarrange(p1, p2) -################ -# SAVING PLOTS # -################ -if (plot) { - - # define layout matrix - layout_matrix <- matrix(seq(1,48), 6,8) - # last extra squares for the dummy - layout_matrix[5:6,8] <- 47 - - # collect plots in right order - n <-1 - index_order <- c() - for (cell_id in unique(plots_meta$cell)) { - # add dummy plot with cell name - plots[[length(plots) + n]] <- ggplot() + - annotate("text", size=4, x=0, y=0, label = cell_id, angle=-90) + - theme_void() - - # retrieve indexes - subdf <- plots_meta[plots_meta$cell == cell_id,] - index_order <- c(index_order, as.numeric(rownames(subdf)), length(plots)) - n <- n+1 - } - - message("saving plots, please wait...") - # generate the grid - grid <- gridExtra::marrangeGrob(grobs = plots[index_order], - ncol = 8, nrow = 6, - top="GC correction plots", - layout_matrix = layout_matrix) - # save plots - ggsave(args[4], grid, width=14, height=10) + ggsave(snakemake@output[["plot"]], corr_plot, width = 12, height = 6) } + +# adjust w, c and fill NAs +counts$w <- (counts$w * counts$tot_count / counts$tot_count_gc) +counts$c <- 
(counts$c * counts$tot_count / counts$tot_count_gc) +counts$w[is.na(counts$w)] <- 0 +counts$c[is.na(counts$c)] <- 0 +counts$tot_count[is.na(counts$tot_count)] <- 0 + +output <- counts[, c("chrom", "start", "end", "sample", "cell", "w", "c", "tot_count", "class")] + +data.table::fwrite(output, save_path) \ No newline at end of file diff --git a/workflow/scripts/GC/counts_scaling.R b/workflow/scripts/GC/counts_scaling.R deleted file mode 100644 index 88361319..00000000 --- a/workflow/scripts/GC/counts_scaling.R +++ /dev/null @@ -1,32 +0,0 @@ -# fetch arguments -args <- commandArgs(trailingOnly = T) - -# args[1] <- '/data/r-workspace/strandseq_utils/test_data/H3JNHAFX3_MNIphotoconverted.200000.VST.GC.txt.gz' - -# open counts -# counts <- data.table::fread(args[1]) -counts <- data.table::fread(snakemake@input[["counts_vst_gc"]]) - - -# calculate scale factor essentially checks the order of magnitude (log10) -# and rescales to the 100s -scale_factor <- (floor(log10(median(counts$w + counts$c))) - 2) * -1 - -message(paste("scale factor:", scale_factor)) - -# scaling -counts$w <- as.integer(counts$w * 10^scale_factor) -counts$c <- as.integer(counts$c * 10^scale_factor) - -if ("tot_count" %in% colnames(counts)) { - counts$tot_count <- as.integer(counts$w + counts$c) -} - -# filename -# tokens <- strsplit(args[1], "/")[[1]] -# basename <- tokens[length(tokens)] -# basename <- strsplit(args[1], ".txt.gz")[[1]] - -# save -# data.table::fwrite(counts, args[2]) -data.table::fwrite(counts, snakemake@output[["counts_vst_gc_scaled"]]) diff --git a/workflow/scripts/GC/gc.R b/workflow/scripts/GC/gc.R deleted file mode 100644 index 4c632ef9..00000000 --- a/workflow/scripts/GC/gc.R +++ /dev/null @@ -1,38 +0,0 @@ -library(ggplot2) -library(reshape2) - -# zcat out.tsv.gz | grep "^GC" > scripts/gc.table - -args = commandArgs(trailingOnly=TRUE) -# x = read.table(args[1], header=T) -x = read.table(snakemake@input[["table"]], header = T) -df = dcast(x, GCcontent ~ Sample) -refcol = (1:ncol(df))[colnames(df) == "Reference"] -if (refcol == 2){gc_mean = sum(df[,1]*df[,3])} -if (refcol == 3){gc_mean = sum(df[,1]*df[,2])} - -if (refcol == 3) { datacol = 2; } else { datacol = 3; } -df$observed = df[,datacol] / df[,refcol] -df$expected = 1.0 -df = df[df$GCcontent > 0.2 & df$GCcontent < 0.7,] -sse = sum((df$observed - df$expected)^2) -df = melt(df, id.vars=colnames(df[,1:3])) - -# png(args[2]) -png(snakemake@output[["gcdist_plot"]]) -p = ggplot(data=x, aes(x=GCcontent, y=fractionOfReads)) -p = p + geom_freqpoly(aes(color=Sample), stat="identity") -p = p + xlab("GC content") -p = p + ylab("Normalized count") -p = p + ggtitle(paste0("GC content mean = ", gc_mean)) -p -dev.off() - -# png(args[3]) -png(snakemake@output[["gcdevi_plot"]]) -p = ggplot(data=df, aes(x=GCcontent, y=value)) + geom_line(aes(color=variable)) -p = p + xlab("GC content") -p = p + ylab("Sample to reference") -p = p + ggtitle(paste0("SSE = ", sse)) -p -dev.off() \ No newline at end of file diff --git a/workflow/scripts/GC/library_size_normalisation.R b/workflow/scripts/GC/library_size_normalisation.R new file mode 100644 index 00000000..dba917e0 --- /dev/null +++ b/workflow/scripts/GC/library_size_normalisation.R @@ -0,0 +1,115 @@ +# Median of ratios normalization + +# fetch arguments +# args <- commandArgs(trailingOnly = T) + +# checking arguments +# if (length(args) != 2) { +# message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz") +# stop() +# } +# if (!file.exists(args[1])) { +# message(paste(args[1], "does not exists or 
cannot be found.")) +# stop() +# } + +# args[1] <- '/data/projects/strandseq_segmentation/dataset/counts/H3JNHAFX3_MNIphotoconverted_200000_fixed.txt.gz' + +# open files +# counts <- data.table::fread(args[1]) +counts <- data.table::fread(snakemake@input[["counts"]], header = T) +save_path <- snakemake@output[["counts_scaled"]] +# save_path <- args[3] +# min_reads <- snakemake@params[["gc_min_reads"]] + +info_raw <- data.table::fread(snakemake@input[["info_raw"]], skip = 13, header = T, sep = "\t") +# info_raw <- data.table::fread(args[2]) + +min_reads <- min(info_raw[info_raw$pass1 == 1, ]$good) - 1 + +################# +# Preprocessing # +################# + +# force cell column to factor +counts$cell <- as.factor(counts$cell) + +# add tot counts +counts$tot_count <- counts$c + counts$w + + +# filter cells with too low counts +counts_bycell <- as.data.frame(aggregate(counts$tot_count, by = list(Category = counts$cell), FUN = sum)) +sel_cells <- counts_bycell[counts_bycell$x >= as.integer(min_reads), "Category"] +if (length(sel_cells) == 0) { + stop(paste("there are no cells with more than", min_reads, "total reads")) +} +counts <- counts[counts$cell %in% sel_cells, ] + +# convert to bin matrix +count_matrix_raw <- reshape2::dcast(counts, chrom + start + end ~ cell, value.var = "tot_count") +count_matrix <- as.data.frame(count_matrix_raw) + +################# +# Normalization # +################# + +message(paste("library size normalization for", snakemake@input[["counts"]])) +# take the log of counts +count_matrix[4:ncol(count_matrix)] <- log(count_matrix[4:ncol(count_matrix)]) + +# calculate mean of the log counts per bin +count_matrix$mean_log_count <- apply(count_matrix[4:ncol(count_matrix)], MARGIN = 1, FUN = mean, na.rm = FALSE) + +# filter out infinity +count_matrix <- count_matrix[!is.infinite(count_matrix$mean_log_count), ] +if (dim(count_matrix)[[1]] == 0) { + stop("there are no common non-zero bins available across all cells.") +} + +# log of counts over mean per bin +count_matrix[4:ncol(count_matrix)] <- count_matrix[4:ncol(count_matrix)] - count_matrix$mean_log_count + +# median per cell of the per log of counts/mean per bin is the scaling factor +scaling_factors <- apply(count_matrix[4:ncol(count_matrix)], MARGIN = 2, FUN = median) + +# raise e to scaling factor +scaling_factors <- exp(scaling_factors) + +# rescale libraries +scaled_matrix <- data.table::data.table(count_matrix_raw) + +for (i in colnames(scaled_matrix)[4:ncol(scaled_matrix)]) { + scaled_matrix[[i]] <- scaled_matrix[[i]] / scaling_factors[i] +} + +################## +# Postprocessing # +################## + +# wide to long +norm_tot_counts <- reshape2::melt(scaled_matrix, + id.vars = c("chrom", "start", "end"), + measure.vars = colnames(scaled_matrix)[4:ncol(scaled_matrix)], + value.name = "norm_tot_count", variable.name = "cell" +) + +# merge with counts +cols <- colnames(counts) +counts <- merge(counts, norm_tot_counts, by = c("chrom", "start", "end", "cell"), all.x = TRUE) + +# adjust W and C counts to normalized counts +counts$ratio <- counts$norm_tot_count / counts$tot_count +counts$ratio[is.na(counts$ratio)] <- 0 +counts$w <- counts$w * counts$ratio +counts$c <- counts$c * counts$ratio +counts$tot_count <- counts$norm_tot_count +# fill na +counts$w[is.na(counts$w)] <- 0 +counts$c[is.na(counts$c)] <- 0 +counts$tot_count[is.na(counts$tot_count)] <- 0 + + +# saving +message("saving...\n") +data.table::fwrite(counts[, ..cols], save_path) diff --git a/workflow/scripts/GC/variance_stabilizing_transformation.R 
b/workflow/scripts/GC/variance_stabilizing_transformation.R index eb0996a5..2b71a1d3 100644 --- a/workflow/scripts/GC/variance_stabilizing_transformation.R +++ b/workflow/scripts/GC/variance_stabilizing_transformation.R @@ -1,105 +1,36 @@ -library(edgeR) -library(ggplot2) - -# fetch arguments -args = commandArgs(trailingOnly = T) - -filter <- ifelse('filter' %in% args, TRUE, FALSE) -chosen_transform <- ifelse('anscombe' %in% args, 'anscombe', - ifelse('laubschner' %in% args, 'laubschner', 'anscombe')) - +# set arguments +input_path <- snakemake@input[["counts_scaled_gc"]] +save_path <- snakemake@output[["counts_scaled_gc_vst"]] +chosen_transform <- "anscombe" +plot <- TRUE +rescale <- TRUE + +# PRE-PROCESSING DATA # open count file -#args[1] <- '/data/strandseq_datasets/lib_test/HCLLVAFX3_iPSCs.200000.LIB.txt.gz' -# counts_raw <- data.table::fread(args[1]) -counts_raw <- data.table::fread(snakemake@input[["counts"]]) +counts_raw <- data.table::fread(input_path) # force cell column to factor counts_raw$cell <- as.factor(counts_raw$cell) # fuse bin coordinates -counts_raw$bin <- paste(counts_raw$chrom, counts_raw$start, counts_raw$end, sep='_') +counts_raw$bin <- paste(counts_raw$chrom, counts_raw$start, counts_raw$end, sep = "_") # add tot counts counts_raw$tot_count <- counts_raw$c + counts_raw$w -# sum of counts per cell -sum_counts <- aggregate(counts_raw$tot_count, by= list(counts_raw$cell), FUN = sum) -colnames(sum_counts) <- c('cell', 'sum') - - -# centromere read spikes exclusion - -exclude_centromeres <- function(counts_raw, exclusion_list) { - exclusion_list <- GenomicRanges::GRanges(seqnames = exclusion_list$chrom, ranges = IRanges::IRanges(start = exclusion_list$start, end=exclusion_list$end)) - counts_iranges <- GenomicRanges::GRanges(seqnames = counts_raw$chrom, ranges = IRanges::IRanges(start = counts_raw$start, end=counts_raw$end)) - - overlaps <- data.table::as.data.table(GenomicRanges::findOverlaps(exclusion_list, counts_iranges)) - counts <- counts_raw - counts <- counts[-overlaps$subjectHits,] - return(counts) -} - -#args[3] <- '/data/r-workspace/strandseq_utils/large_centromere_blacklist.csv' - -if (!is.na(args[3])) { - message('blacklisting...') - counts <- exclude_centromeres(counts_raw, data.table::fread(args[3])) -} else { - counts <- counts_raw -} +counts <- counts_raw # convert to bin count matrix to_matrix <- function(counts) { + # fuse bin coordinates + counts$bin <- paste(counts$chrom, counts$start, counts$end, sep = "_") mat_tot <- reshape2::dcast(counts, bin ~ cell, value.var = "tot_count") rownames(mat_tot) <- mat_tot$bin - mat_tot <- mat_tot[,2:ncol(mat_tot)] - - return(mat_tot) -} - -mat_tot <- to_matrix(counts) - -generate_dgelist <- function(mat_tot, filter) { - - # create the DGEList object - # all samples belong to group 1 - y.raw <- DGEList(as.matrix(mat_tot), group = rep(1, ncol(mat_tot))) - - # filter out bins with too low counts to be informative - if (filter) { - keep.exprs <- edgeR::filterByExpr(y.raw, group=y.raw$samples$group) - y <- y.raw[keep.exprs, keep.lib.sizes=FALSE] - message(paste('filtering out', (dim(y.raw)[1]-dim(y)[1]), 'bins out of', dim(y.raw)[1], "due to low count")) - } else { - y <- y.raw - message('no filtering') - } - - # calculate library size and composition normalization - y <- calcNormFactors(y) - - return(y) -} + mat_tot <- mat_tot[, 2:ncol(mat_tot)] -estimate_dispersion <- function(y) { - - # estimate common dispersion - # default settings for DGEList objects - message('Estimating common dispersion with edgeR...') - disp 
<- estimateDisp(y, design=NULL, prior.df=NULL, trend.method="locfit", tagwise=TRUE, - span=NULL, min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE, - winsor.tail.p=c(0.05,0.1), tol=1e-06) - phi <- disp$common.dispersion - message(paste("phi =",phi)) - write(paste(args[1], phi, sep="\t"), './log.txt', append=TRUE) - - return(phi) + return(mat_tot) } -y <- generate_dgelist(mat_tot, filter) -phi <- estimate_dispersion(y) - - # VARIANCE STABILIZING TRANSFORMATION # TRANSFORMS @@ -107,77 +38,152 @@ phi <- estimate_dispersion(y) # Anscombe, 1948 # Laubschner, 1961 anscombe_transform <- function(x, phi) { - a <- x + (3/8) - b <- (1/phi) - (3/4) - c <- sqrt(a/b) - y <- sinh(c) + a <- x + (3 / 8) + b <- (1 / phi) - (3 / 4) + c <- sqrt(a / b) + y <- asinh(c) return(y) } laubscher_transform <- function(x, phi) { a <- sqrt(phi) - b <- sinh( sqrt( x/phi ) ) - c <- sqrt( phi-1 ) + b <- asinh(sqrt(x / phi)) + c <- sqrt(phi - 1) d <- anscombe_transform(x, phi) - y <- a*b + c*d + y <- a * b + c * d return(y) } -transform_list <- list('anscombe'= anscombe_transform, 'laubscher'= laubscher_transform) +transform_data <- function(counts, transform, phi) { + cols <- colnames(counts) + counts$tot_count_corr <- transform(counts$tot_count, phi) + counts$f <- counts$tot_count_corr / counts$tot_count + counts$w <- counts$w * counts$f + counts$c <- counts$c * counts$f + counts$tot_count <- counts$tot_count_corr + counts <- counts[, ..cols] + + return(counts) +} +transform_list <- list("anscombe" = anscombe_transform, "laubscher" = laubscher_transform) transform <- transform_list[[chosen_transform]] + +disp_score <- function(counts, transform, phi, design = NULL) { + counts$tot_count <- transform(counts$tot_count, phi) + mat <- to_matrix(counts) + # if multiple samples are present design matrix can be used + if (is.null(design)) { + design <- matrix(1, ncol = 1, nrow = ncol(mat)) + } + res <- as.matrix(mat) %*% MASS::Null(design) + rsd <- sqrt(rowMeans(res * res)) + score <- sd(rsd) / mean(rsd) + return(score) +} + message(paste("Transforming data with", chosen_transform, "VST")) -#ans <- transform(y$counts, phi) -ans <- transform(to_matrix(counts_raw), phi) - -# convert to count table -ans <- as.data.frame(ans) -ans$bin <- rownames(ans) -d <- reshape2::melt(ans, id.vars='bin', measure.vars=colnames(ans), variable.name='cell', value.name = 'tot_count_corr') - -sum2 <- apply(ans[,1:ncol(ans)-1], MARGIN = 2, FUN = sum) -sum2 <- data.table::data.table(cell = names(sum2), sum2 = sum2) -sum_counts <- merge(sum_counts, sum2, by = c('cell'), all.x = T) -sum_counts$f <- sum_counts$sum / sum_counts$sum2 - -d <- merge(d, sum_counts[,c('cell', 'f')], by = c('cell'), all.x = T) -d$tot_count_corr <- as.numeric(d$tot_count_corr) * d$f -d <- d[,c('cell', 'bin', 'tot_count_corr')] - -bins <- stringr::str_split_fixed(d$bin, '_', 3) -colnames(bins) <- c('chrom', 'start', 'end') -d[c('chrom', 'start', 'end')] <- bins -d$start <- as.numeric(d$start) -d$end <- as.numeric(d$end) - -merged.raw <- merge(counts_raw, d[c('chrom', 'start', 'end', 'cell', 'tot_count_corr')], - by=c('chrom', 'start', 'end', 'cell'), all.x=T) - -fil <- apply(merged.raw, MARGIN = 1, FUN = (function(x) any(is.na(x)))) -#merged.raw[fil, 'tot_count_corr'] <- 0 -merged <- data.table::data.table(merged.raw) - -# adjust W and C counts to corrected tot_counts -merged$tot_count_corr <- as.numeric(merged$tot_count_corr) -merged$ratio <- merged$tot_count_corr / merged$tot_count -merged$w <- merged$w * merged$ratio -merged$c <- merged$c * merged$ratio 
-merged$w[is.na(merged$w)] <- 0 -merged$c[is.na(merged$c)] <- 0 -merged$tot_count <- merged$tot_count_corr - -message('saving...') -df <- data.table::data.table(merged[,c('chrom', 'start', 'end', 'sample', 'cell', 'w', 'c', 'class', 'tot_count')]) - -#s1 <- aggregate(counts_raw$tot_count, by= list(counts_raw$cell), FUN = sum) -#s2 <- aggregate(df$tot_count, by= list(df$cell), FUN = sum) -#s3 <- merge(s1, s2, by=c('Group.1'), all.x=T) -#s3$d <- s3['x.x'] - s3['x.y'] -#df - -#args[2] <- '/data/r-workspace/strandseq_utils/vst_test.txt.gz' -# data.table::fwrite(df, args[2]) -data.table::fwrite(df, snakemake@output[["counts_vst"]]) - -#scTRIPmultiplot::multiplot(args[2], chromosome = 'chr8', cell_id = 'MNIx260521x01PE20505') -#scTRIPmultiplot::multiplot(args[1], chromosome = 'chr8', cell_id = 'MNIx260521x01PE20505') -#path <- '/data/strandseq_datasets/H22WGAFX3_MNIx260521.200000.VST.txt.gz' -#scTRIPmultiplot::multiplot(path, chromosome = 'chr8', cell_id = 'MNIx260521x01PE20505') + +# estimate dispersion by residual variance + +opt <- optimize(disp_score, counts = counts, transform = transform, interval = c(0.00001, 1)) +phi <- opt$minimum +message(paste("Estimated dispersion - phi: ", phi)) + +# correction +corr_counts <- transform_data(counts, transform, phi) +corr_counts <- data.table::data.table(corr_counts[, c("chrom", "start", "end", "sample", "cell", "w", "c", "class", "tot_count")]) + +rescale_data <- function(counts_original, counts_transformed) { + rescaled <- counts_transformed + rescaled_med <- aggregate(rescaled$tot_count, list(rescaled$cell), FUN = median) + original_med <- aggregate(counts_original$tot_count, list(counts_original$cell), FUN = median) + m <- merge(x = original_med, y = rescaled_med, by = "Group.1", suffixes = c("_raw", "_norm")) + m[["f"]] <- m[["x_raw"]] / m[["x_norm"]] + + rescaled <- merge(rescaled, m[c("Group.1", "f")], by.x = "cell", by.y = "Group.1") + + rescaled$tot_count <- rescaled$tot_count * rescaled$f + rescaled$w <- rescaled$w * rescaled$f + rescaled$c <- rescaled$c * rescaled$f + return(rescaled) + +} + +if (rescale == TRUE) { + corr_counts <- rescale_data(counts_raw, corr_counts) +} + + +message("saving...") +data.table::fwrite(corr_counts, save_path) + +if (plot) { + library(ggplot2) + library(ggpubr) + + merge_bins <- function(df, bin_size = 3e6) { + df <- df[with(df, order(cell, chrom, start))] + + df$bin_group <- df$start %/% bin_size + + + w <- aggregate(df$w, by = list(df$cell, df$chrom, df$bin_group), FUN = sum) + c <- aggregate(df$c, by = list(df$cell, df$chrom, df$bin_group), FUN = sum) + s <- aggregate(df$start, by = list(df$cell, df$chrom, df$bin_group), FUN = function(x) x[[1]]) + e <- aggregate(df$end, by = list(df$cell, df$chrom, df$bin_group), FUN = function(x) x[[length(x)]]) + cl <- aggregate(df$class, by = list(df$cell, df$chrom, df$bin_group), FUN = function(x) names(sort(table(x), decreasing = TRUE))[[1]]) + + m <- merge(w, c, by = c("Group.1", "Group.2", "Group.3")) + m <- merge(m, s, by = c("Group.1", "Group.2", "Group.3")) + m <- merge(m, e, by = c("Group.1", "Group.2", "Group.3")) + m <- merge(m, cl, by = c("Group.1", "Group.2", "Group.3")) + colnames(m) <- c("cell", "chrom", "bin_group", "w", "c", "start", "end", "class") + m$sample <- unique(df$sample)[[1]] + m <- data.table::as.data.table(m) + m <- m[with(m, order(cell, chrom, start))] + + m$tot_count <- m$w + m$c + return(m) + } + + wf_plot <- function(m) { + m$wf <- m$w / m$tot_count + + p1 <- ggplot(m, aes(x = tot_count, y = wf)) + + geom_point(size = 1, alpha = .1, 
shape = 16) +
+      xlab("tot count") +
+      ylab("watson fraction") +
+      ylim(0, 1)
+    return(p1)
+  }
+
+  p1 <- ggplot(counts_raw, aes(x = tot_count)) +
+    geom_histogram(bins = 256) +
+    ggtitle("raw") +
+    xlab("read count") +
+    ylab("bin count")
+
+  p2 <- ggplot(corr_counts, aes(x = tot_count)) +
+    geom_histogram(bins = 256) +
+    ggtitle(paste(chosen_transform, "VST")) +
+    xlab("read count") +
+    ylab("bin count")
+
+  m <- merge_bins(counts_raw)
+  p3 <- wf_plot(m) + ggtitle("raw")
+
+  n <- merge_bins(corr_counts)
+  p4 <- wf_plot(n) + ggtitle(paste(chosen_transform, "VST"))
+
+
+  corr_plot <- ggarrange(p1, p2, p3, p4)
+
+  ggsave(snakemake@output[["plot"]], corr_plot, width = 12, height = 6)
+}
\ No newline at end of file
diff --git a/workflow/scripts/utils/make_log_useful.py b/workflow/scripts/utils/make_log_useful.py
index ef62fc84..bbe56250 100644
--- a/workflow/scripts/utils/make_log_useful.py
+++ b/workflow/scripts/utils/make_log_useful.py
@@ -1,8 +1,10 @@
 import sys, os
 
 
-def make_log_useful(log_path, status, config):
-
+def make_log_useful(log_path, status, config, config_definitions):
+    logs_processed_dir = "/".join(log_path.split("/")[:-1]) + "/processed_logs_for_mail/"
+    os.makedirs(logs_processed_dir, exist_ok=True)
+    log_path_new = "/".join(log_path.split("/")[:-1]) + "/processed_logs_for_mail/" + log_path.split("/")[-1]
     error_buffer = []
     record = 0
     with open(log_path, "r") as logfile:
@@ -25,12 +27,8 @@ def make_log_useful(log_path, status, config):
         else:
             continue
 
-    with open(log_path, "w") as logfile:
-        _ = logfile.write("\n".join(error_buffer))
-        _ = logfile.write("\n\n")
-
     my_env = dict(os.environ)
-    with open(log_path, "a") as logfile:
+    with open(log_path_new, "w") as logfile:
         _ = logfile.write("=======[{}]=======\n".format(status))
         _ = logfile.write("\n===[{}]===\n".format("Infrastructure information"))
         _ = logfile.write("Host: {}\n".format(my_env.get("HOST", "N/A")))
@@ -41,20 +39,49 @@ def make_log_useful(log_path, status, config):
         _ = logfile.write("Screen: {}\n".format(my_env.get("STY", "N/A")))
         _ = logfile.write("Conda ENV: {}\n".format(my_env.get("CONDA_DEFAULT_ENV", "N/A")))
 
-        _ = logfile.write("\n===[{}]===\n".format("Workflow information"))
-        _ = logfile.write("smk-wf-catalog/mosacaitcher-pipeline v{version}\n".format(version=str(config["version"])))
-        _ = logfile.write("Folder to processed : {}\n".format(str(config["data_location"])))
-        _ = logfile.write("Multistep normalisation module : {}\n".format(str(config["multistep_normalisation"])))
-        _ = logfile.write("Read Counts HGSVC-based normalization : {}\n".format(str(config["hgsvc_based_normalized_counts"])))
-        _ = logfile.write("Binning window size : {}\n".format(str(config["window"])))
-        _ = logfile.write("Ashleys-QC preprocessing pipeline : {}\n".format(str(config["ashleys_pipeline"])))
-        _ = logfile.write("BAM folder old format (all/selected) : {}\n".format(str(config["input_bam_legacy"])))
-        _ = logfile.write("List of chromosomes processed : {}\n".format(str(config["chromosomes"])))
-        _ = logfile.write("Reference genome selected : {}\n".format(config["reference"]))
-        _ = logfile.write("\n")
+        # Define the categories
+        categories = {
+            "General": ["email", "data_location", "reference", "samples_to_process"],
+            "Ashleys-QC parameters": [
+                "input_bam_legacy",
+                "ashleys_pipeline",
+                "ashleys_pipeline_only",
+                "ashleys_threshold",
+                "hand_selection",
+                
"MultiQC", + ], + "Counts processing": ["blacklist_regions", "window"], + "Normalization": [ + "multistep_normalisation", + "multistep_normalisation_for_SV_calling", + "hgsvc_based_normalized_counts", + ], + "Advanced Settings": ["ashleys_threshold", "window", "chromosomes", "chromosomes_to_exclude"], + "Genecore": ["genecore", "genecore_date_folder", "genecore_prefix"], + "Downstream Modules": ["arbigent", "arbigent_bed_file", "scNOVA"], + } + + _ = logfile.write("\n=========================\n===[CONFIG PARAMETERS]===\n=========================\n") + # Log the configuration + for category, keys in categories.items(): + _ = logfile.write("\n===[{}]===\n".format(category)) + for key in keys: + if key in config: + _ = logfile.write("{}: {}\n".format(key, config[key])) + + if error_buffer: + with open(log_path_new, "a") as logfile: + _ = logfile.write("\nErrors recorded (can be related to OOM & NFS issues)") + _ = logfile.write("\n".join(error_buffer)) + _ = logfile.write("\n\n") - return + return log_path_new # if __name__ == "__main__": -# make_log_useful(sys.argv[1], sys.argv[2], sys.argv[3]) +# config = yaml.safe_load(open("config/config.yaml", "r")) +# config_definitions = yaml.safe_load(open("config/config_metadata.yaml"), "r") +# print(config) +# log_path = sys.argv[1] +# status = "SUCCESS" +# make_log_useful(log_path, status, config, config_definitions) From 7c1b7f23185a09182415dace9c3d4983e522d114 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 22 Aug 2023 15:28:52 +0200 Subject: [PATCH 2/5] pre-commit --- .pre-commit-config.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..065f372f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,21 @@ +repos: + - hooks: + - id: trailing-whitespace + - id: check-added-large-files + - id: check-yaml + # - args: + # - --branch + # - master + # id: no-commit-to-branch + repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.4.0 + - hooks: + - id: snakefmt + repo: https://github.com/snakemake/snakefmt + rev: 0.4.0 + - hooks: + - id: commitizen + stages: + - commit-msg + repo: https://github.com/commitizen-tools/commitizen + rev: v2.17.12 From 643c72438c6dc7ab944b9181249e750f90a72c20 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 22 Aug 2023 15:29:17 +0200 Subject: [PATCH 3/5] 2.2.1: fixes --- config/config.yaml | 4 +- .../Dockerfile-2.2.1.dockerfile | 227 ++++++++++++++++++ github-actions-runner/docker_procedure.md | 13 +- workflow/Snakefile | 4 +- workflow/rules/common.smk | 4 +- 5 files changed, 244 insertions(+), 8 deletions(-) create mode 100644 github-actions-runner/Dockerfile-2.2.1.dockerfile diff --git a/config/config.yaml b/config/config.yaml index 9571162c..78dc3d51 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,10 +3,10 @@ # -------------------------------------------------------- # MosaiCatcher version -version: 2.2.0 +version: 2.2.1 # Ashleys-QC pipeline version -ashleys_pipeline_version: 2.2.0 +ashleys_pipeline_version: 2.2.1 # Email for notifications about the pipeline's status email: "" diff --git a/github-actions-runner/Dockerfile-2.2.1.dockerfile b/github-actions-runner/Dockerfile-2.2.1.dockerfile new file mode 100644 index 00000000..d5f5feb1 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.1.dockerfile @@ -0,0 +1,227 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL 
io.github.snakemake.conda_env_hash="77eaa388d65d5205b87324fb0adb89561bc0e532a328995990a1d580aeb894ae" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - 
pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba clean --all -y \ No newline at end of file diff --git a/github-actions-runner/docker_procedure.md b/github-actions-runner/docker_procedure.md index c2661b4b..f3ea5c61 100644 --- a/github-actions-runner/docker_procedure.md +++ b/github-actions-runner/docker_procedure.md @@ -34,7 +34,16 @@ conda activate snakemake snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True 
MultiQC=True --profile workflow/snakemake_profiles/local/conda/ -c 8 --forceall -n
 snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda_singularity/ -c 8 --forceall
 
+# Solved in
 
-# Solved in
+https://github.com/weber8thomas/workspace-images/tree/docker-apptainer-mamba
 
-https://github.com/weber8thomas/workspace-images/tree/docker-apptainer-mamba
\ No newline at end of file
+# Dockerfile creation
+
+snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda/ -c 8 --containerize > Dockerfile
+
+# Docker commands
+
+docker login -u weber8thomas
+docker build --platform=linux/amd64 -t weber8thomas/mosaicatcher-pipeline:VERSION .
+docker push weber8thomas/mosaicatcher-pipeline:VERSION
diff --git a/workflow/Snakefile b/workflow/Snakefile
index e8e15c2b..980f9ca0 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -25,9 +25,9 @@ if config["ashleys_pipeline"] is True:
         github(
             "friendsofstrandseq/ashleys-qc-pipeline",
             path="workflow/Snakefile",
-            # tag=str(config["ashleys_pipeline_version"])
+            tag=str(config["ashleys_pipeline_version"])
             # branch="main",
-            branch="dev",
+            # branch="dev",
         )
     config: config
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index fbbb1e4d..999140ba 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -285,7 +285,7 @@ class HandleInput:
 
         # List of folders/files to not consider (restrict to samples only)
         l_to_process = [e for e in os.listdir(thisdir) if e not in exclude and e.endswith(".zip") is False]
-        print(l_to_process)
+        # print(l_to_process)
         if config["samples_to_process"]:
             l_to_process = [e for e in l_to_process if e in config["samples_to_process"]]
@@ -327,7 +327,7 @@ class HandleInput:
         complete_df = complete_df.sort_values(by=["Cell", "File"]).reset_index(
             drop=True
         )
-        print(complete_df)
+        # print(complete_df)
 
         return complete_df
 

From 9971816fadb237bcef87f707a8e80a918638807a Mon Sep 17 00:00:00 2001
From: Thomas Weber
Date: Tue, 22 Aug 2023 15:37:48 +0200
Subject: [PATCH 4/5] 2.2.1: fixes


From 05c67537c11f6e146ce345dddf7e3c82bcae9a72 Mon Sep 17 00:00:00 2001
From: Thomas Weber
Date: Tue, 22 Aug 2023 15:41:46 +0200
Subject: [PATCH 5/5] 2.2.1: fixes

---
 .gitpod.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitpod.yml b/.gitpod.yml
index 34ef39b6..76eafb53 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -31,6 +31,11 @@ tasks:
 
   # MOSAICATCHER
 
+  - name: Change cache location
+    command: |
+      mkdir -p ~/my_mamba_cache
+      export CONDA_PKGS_DIRS=~/my_mamba_cache
+
   - name: Create snakemake env
     command: mamba create -n snakemake -c bioconda -c conda-forge snakemake -y
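As a closing note on the normalisation chain introduced in PATCH 1/5: `library_size_normalisation.R` implements a median-of-ratios scaling (log counts per bin, per-bin mean as pseudo-reference, per-cell median of the log ratios as size factor). A minimal standalone sketch of that computation on a toy bin-by-cell matrix (toy data only; the script itself additionally filters low-count cells and reshapes from the long count table):

```r
# Median-of-ratios size factors, as in library_size_normalisation.R
set.seed(1)
m <- matrix(rpois(400, lambda = rep(c(20, 40, 60, 80), each = 100)), ncol = 4)

log_m <- log(m)
ref <- rowMeans(log_m)                    # per-bin pseudo-reference (mean of log counts)
keep <- is.finite(ref)                    # drop bins with a zero count in any cell
size_factors <- exp(apply(log_m[keep, , drop = FALSE] - ref[keep], 2, median))
scaled <- sweep(m, 2, size_factors, `/`)  # counts rescaled to a common library size
```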