From 20c6ffa9ec1993e9e3cbf2267dcec29ea5faca10 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 22 Aug 2023 13:24:31 +0200 Subject: [PATCH 1/5] 2.2.1: fixes --- .gitignore | 1 + .gitpod.Dockerfile | 38 +++ .gitpod.yml | 65 +++- .gitpod.yml.bak | 4 + config/config_metadata.yaml | 15 +- github-actions-runner/docker_procedure.md | 40 +++ workflow/Snakefile | 8 +- workflow/envs/rtools.yaml | 5 + workflow/report/workflow.rst | 33 +- workflow/rules/common.smk | 21 +- workflow/scripts/GC/GC_correction.R | 307 +++++++----------- workflow/scripts/GC/counts_scaling.R | 32 -- workflow/scripts/GC/gc.R | 38 --- .../scripts/GC/library_size_normalisation.R | 115 +++++++ .../GC/variance_stabilizing_transformation.R | 302 ++++++++--------- workflow/scripts/utils/make_log_useful.py | 67 ++-- 16 files changed, 631 insertions(+), 460 deletions(-) create mode 100644 .gitpod.Dockerfile create mode 100644 .gitpod.yml.bak create mode 100644 github-actions-runner/docker_procedure.md delete mode 100644 workflow/scripts/GC/counts_scaling.R delete mode 100644 workflow/scripts/GC/gc.R create mode 100644 workflow/scripts/GC/library_size_normalisation.R diff --git a/.gitignore b/.gitignore index dc307c6a..5e501cbe 100644 --- a/.gitignore +++ b/.gitignore @@ -214,3 +214,4 @@ workflow/data/arbigent/scTRIP_segmentation.bed .tests/data_CHR17/RPE-BM510/bam/*.bam.raw .tests/data_CHR17/RPE-BM510/bam/*.bam.sort .tests/external_data/chr17.fa.log +LOGS_DEV/ diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile new file mode 100644 index 00000000..04dfb23b --- /dev/null +++ b/.gitpod.Dockerfile @@ -0,0 +1,38 @@ +FROM gitpod/workspace-full:2023-03-24-02-48-18 + +ENV RETRIGGER=4 + +ENV BUILDKIT_VERSION=0.11.6 +ENV BUILDKIT_FILENAME=buildkit-v${BUILDKIT_VERSION}.linux-amd64.tar.gz +ENV DAZZLE_VERSION=0.1.17 + +USER root + +# Install dazzle, buildkit and pre-commit +RUN curl -sSL https://github.com/moby/buildkit/releases/download/v${BUILDKIT_VERSION}/${BUILDKIT_FILENAME} | tar -xvz -C /usr +RUN curl -sSL https://github.com/gitpod-io/dazzle/releases/download/v${DAZZLE_VERSION}/dazzle_${DAZZLE_VERSION}_Linux_x86_64.tar.gz | tar -xvz -C /usr/local/bin +RUN curl -sSL https://github.com/mvdan/sh/releases/download/v3.5.1/shfmt_v3.5.1_linux_amd64 -o /usr/bin/shfmt \ + && chmod +x /usr/bin/shfmt +RUN install-packages shellcheck \ + && pip3 install pre-commit +RUN curl -sSL https://github.com/mikefarah/yq/releases/download/v4.22.1/yq_linux_amd64 -o /usr/bin/yq && chmod +x /usr/bin/yq + +# Apptainer installation +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:apptainer/ppa && \ + apt-get update && \ + apt-get install -y apptainer + +# Mambaforge installation +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \ + sh Mambaforge-Linux-x86_64.sh -b -p /home/gitpod/mambaforge && \ + /home/gitpod/mambaforge/bin/mamba create -n snakemake -c bioconda -c conda-forge snakemake && \ + /home/gitpod/mambaforge/bin/mamba init && \ + echo "source /home/gitpod/mambaforge/bin/activate" >> /home/gitpod/.bashrc && \ + echo "conda activate snakemake" >> /home/gitpod/.bashrc + +# Clean up to reduce image size +RUN apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + rm Mambaforge-Linux-x86_64.sh \ No newline at end of file diff --git a/.gitpod.yml b/.gitpod.yml index 18970d9a..34ef39b6 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,4 +1,65 @@ -image: condaforge/mambaforge:latest +image: + file: .gitpod.Dockerfile + +ports: + - port: 5000 + 
onOpen: ignore
+
 tasks:
+  - name: install pre-commit git hooks
+    init: pre-commit install
+  - name: start up buildkitd
+    command: |
+      sudo /usr/bin/buildkitd --debug --config ./buildkitd.toml --group gitpod
+  - name: start a local docker registry
+    command: |
+      mkdir -p /workspace/registry
+      docker run -p 5000:5000 --name registry --rm -v /workspace/registry:/var/lib/registry registry:2
+    openMode: split-right
+  - name: dazzle build and test
+    command: |
+      gp ports await 5000
+      REPO=localhost:5000/dazzle
+      echo "To build specific chunks and combine them 'time ./build-chunk.sh -c chunk1 -c chunk2:variant1.2.3 -n combo'"
+      echo "To build all the chunks and combinations 'time ./build-all.sh'"
+      echo "To build a specific combination 'time ./build-combo.sh combo'"
+      echo "To list image chunks 'dazzle project image-name $REPO'"
+      echo "To list hashes for image chunks 'dazzle project hash $REPO'"
+      echo "To print the combined image manifest 'dazzle project manifest $REPO'"
+
+    openMode: tab-after
+
+  # MOSAICATCHER
   - name: Create snakemake env
-    command: mamba create -n snakemake -c bioconda -c conda-forge snakemake=7.18.2 -y
+    command: mamba create -n snakemake -c bioconda -c conda-forge snakemake -y
+
+  # - name: Run mosaicatcher
+  #   command: |
+  #     snakemake --configfile .tests/config/simple_config.yaml \
+  #       --config ashleys_pipeline=True MultiQC=True \
+  #       --profile workflow/snakemake_profiles/local/conda/ \
+  #       --conda-frontend mamba \
+  #       --cores 8 --forceall
+
+  # - name: Generate Dockerfile
+  #   command: |
+  #     snakemake --configfile .tests/config/simple_config.yaml \
+  #       --config ashleys_pipeline=True MultiQC=True \
+  #       --profile workflow/snakemake_profiles/local/conda/ \
+  #       --conda-frontend mamba \
+  #       --cores 8 --containerize
+
+vscode:
+  extensions:
+    - ms-azuretools.vscode-docker
+    - timonwong.shellcheck
+
+github:
+  prebuilds:
+    master: true
+    branches: true
+    pullRequests: true
+    pullRequestsFromForks: true
+    addCheck: true
+    addComment: true
diff --git a/.gitpod.yml.bak b/.gitpod.yml.bak
new file mode 100644
index 00000000..18970d9a
--- /dev/null
+++ b/.gitpod.yml.bak
@@ -0,0 +1,4 @@
+image: condaforge/mambaforge:latest
+tasks:
+  - name: Create snakemake env
+    command: mamba create -n snakemake -c bioconda -c conda-forge snakemake=7.18.2 -y
diff --git a/config/config_metadata.yaml b/config/config_metadata.yaml
index 16669821..054f84a5 100644
--- a/config/config_metadata.yaml
+++ b/config/config_metadata.yaml
@@ -23,7 +23,7 @@ MultiQC:
   type: bool
   required: False
   default: False
-multistep_normalisation_analysis:
+multistep_normalisation:
   desc: "Enable / Disable multistep normalisation"
   type: bool
   required: False
@@ -38,11 +38,6 @@ ashleys_threshold:
   type: bool
   required: False
   default: 0.5
-plate_orientation:
-  desc: "Plate orientation for GC analysis (landscape/portrait)"
-  type: string
-  required: False
-  default: landscape
 window:
   desc: "Mosaic bin window size"
   type: int
@@ -73,7 +68,7 @@ samples_to_process:
  type: list
  required: True
  default: "[]"
-normalized_counts:
+hgsvc_based_normalized_counts:
  desc: Normalize or not mosaic counts
  type: bool
  required: False
@@ -116,3 +111,9 @@ scNOVA:
  required: False
  default: False
  lint_check: False
+genecore_prefix:
+  desc: "Genecore shared input folder prefix"
+  type: str
+  required: False
+  default: "/g/korbel/shared/genecore"
+  lint_check: False
diff --git a/github-actions-runner/docker_procedure.md b/github-actions-runner/docker_procedure.md
new file mode 100644
index 00000000..c2661b4b
--- /dev/null
+++ b/github-actions-runner/docker_procedure.md
@@ -0,0 +1,40 @@
+- Push 
latest changes to dev +- Open gitpod.io for repo +- Run snakemake for test dataset with all options + +``` +snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda_singularity/ -c 8 --forceall +``` + +- Run same command with --containerize +- Copy and paste Dockerfile into github-actions-runner folder of repo +- Open gitpod.io with docker/apptainer/mamba + +# To solve by preinstalling apptainer and mamba into base image + +Apptainer install: + +``` +sudo apt update +sudo apt install -y software-properties-common +sudo add-apt-repository -y ppa:apptainer/ppa +sudo apt update +sudo apt install -y apptainer +``` + +Mamba install: + +wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh +sh Mambaforge-Linux-x86_64.sh +/home/gitpod/mambaforge/bin/mamba create -n snakemake -c bioconda -c conda-forge snakemake +/home/gitpod/mambaforge/bin/mamba init +source ~/.bashrc +conda activate snakemake + +snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda/ -c 8 --forceall -n +snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda_singularity/ -c 8 --forceall + + +# Solved in + +https://github.com/weber8thomas/workspace-images/tree/docker-apptainer-mamba \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 4faa35cd..e8e15c2b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -2,8 +2,8 @@ from snakemake.utils import min_version min_version("7.5.0") - -configfile: "config/config.yaml" +configfile_location = "config/config.yaml" +configfile: configfile_location report: "report/workflow.rst" @@ -25,9 +25,9 @@ if config["ashleys_pipeline"] is True: github( "friendsofstrandseq/ashleys-qc-pipeline", path="workflow/Snakefile", - tag=str(config["ashleys_pipeline_version"]) + # tag=str(config["ashleys_pipeline_version"]) # branch="main", - # branch="dev", + branch="dev", ) config: config diff --git a/workflow/envs/rtools.yaml b/workflow/envs/rtools.yaml index e1580794..59c95e3d 100644 --- a/workflow/envs/rtools.yaml +++ b/workflow/envs/rtools.yaml @@ -48,3 +48,8 @@ dependencies: - r-tidyr - r-ggbeeswarm - r-pheatmap + # GC_correction + - r-ggpubr + - bioconductor-edger + # SOLVE R lib issue + - r-stringi=1.7.12 diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst index 5404e21d..ce78a056 100644 --- a/workflow/report/workflow.rst +++ b/workflow/report/workflow.rst @@ -3,25 +3,47 @@ MosaiCatcher v2 is a `Snakemake `__ pipeline that aims to detect Structural variants from single-cell Strand-seq data. 
-**Versions used:**
+**Versions used and general parameters:**
 
 * MosaiCatcher version used: {{ snakemake.config["version"] }}
+* Ashleys version used (if enabled): {{ snakemake.config["ashleys_pipeline_version"] }}
+* Samples processed in the folder: {{ snakemake.config["samples_to_process"] }}
 
 **Input/Output options:**
 
 * Folder processed: {{ snakemake.config["data_location"] }}
+* Publishdir defined: {{ snakemake.config["publishdir"] }}
+* Input BAM legacy (bam & selected; mutually exclusive with ashleys_pipeline): {{ snakemake.config["input_bam_legacy"] }}
 
-**Main options:**
+**Ashleys-QC parameters (if enabled):**
 
 * Ashleys-QC preprocessing pipeline enabled: {{ snakemake.config["ashleys_pipeline"] }}
 * Ashleys-QC preprocessing pipeline version used: {{ snakemake.config["ashleys_pipeline_version"] }}
-* BAM folder legacy format (all/selected) enabled: {{ snakemake.config["input_bam_legacy"] }}
+* Ashleys-QC preprocessing pipeline only (to stop after QC): {{ snakemake.config["ashleys_pipeline_only"] }}
+* Ashleys-QC threshold: {{ snakemake.config["ashleys_threshold"] }}
+* MultiQC enabled (triggers FastQC, samtools idxstats & flagstats): {{ snakemake.config["MultiQC"] }}
+
 
 **Counts option:**
 
 * Multistep normalisation enabled: {{ snakemake.config["multistep_normalisation"] }}
+* Multistep normalisation min reads / bin: {{ snakemake.config["multistep_normalisation_options"]["min_reads_bin"] }}
+* Multistep normalisation number of subsamples: {{ snakemake.config["multistep_normalisation_options"]["n_subsample"] }}
+* Multistep normalisation min reads / cell: {{ snakemake.config["multistep_normalisation_options"]["min_reads_cell"] }}
 * Read Counts normalization enabled: {{ snakemake.config["hgsvc_based_normalized_counts"] }}
 * Binning window size: {{ snakemake.config["window"] }}
+* Blacklist regions: {{ snakemake.config["blacklist_regions"] }}
+
+**SV calling:**
+
+* Multistep normalisation for SV calling enabled: {{ snakemake.config["multistep_normalisation_for_SV_calling"] }}
+
+**Downstream analysis modules:**
+
+* Arbitrary Genotyping (ArbiGent): {{ snakemake.config["arbigent"] }}
+* ArbiGent BED file: {{ snakemake.config["arbigent_bed_file"] }}
+* scNOVA: {{ snakemake.config["scNOVA"] }}
+
 
 **Reference genome & Chromosomes options:**
 
@@ -29,6 +51,11 @@ MosaiCatcher v2 is a `Snakemake `__ pipeline that a
 * Reference genome selected: {{ snakemake.config["reference"] }}
 * Reference FASTA file: {{ snakemake.config["references_data"][snakemake.config["reference"]]["reference_file_location"] }}
 
+**Other options:**
+
+* Split QC plot into individual files: {{ snakemake.config["split_qc_plot"] }}
+
+
 MosaiCatcher git repository: https://github.com/friendsofstrandseq/mosaicatcher-pipeline
 
 *Please cite:*
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index cb0dcdd3..fbbb1e4d 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,5 +1,6 @@
 import collections
 import pandas as pd
+import yaml
 
 # from scripts.utils import handle_input, make_log_useful, pipeline_aesthetic_start
 from scripts.utils import make_log_useful, pipeline_aesthetic_start
@@ -283,7 +284,8 @@
         complete_df_list = list()
 
         # List of folders/files to not consider (restrict to samples only)
-        l_to_process = [e for e in os.listdir(thisdir) if e not in exclude]
+        l_to_process = [e for e in os.listdir(thisdir) if e not in exclude and e.endswith(".zip") is False]
+        print(l_to_process)
         if config["samples_to_process"]:
             l_to_process = [e for e in l_to_process if e in 
config["samples_to_process"]] @@ -325,6 +327,7 @@ class HandleInput: complete_df = complete_df.sort_values(by=["Cell", "File"]).reset_index( drop=True ) + print(complete_df) return complete_df @@ -511,20 +514,22 @@ def get_mem_mb_heavy(wildcards, attempt): return mem_avail[attempt - 1] * 1000 -def onsuccess_fct(log): - make_log_useful.make_log_useful(log, "SUCCESS", config) +def onsuccess_fct(log): + config_metadata = config_definitions = yaml.safe_load(open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r")) + log_path_new = make_log_useful.make_log_useful(log, "SUCCESS", config, config_metadata) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {{log}}'.format( - config["version"], config["data_location"], config["email"] + 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {}'.format( + config["version"], config["data_location"], config["email"], log_path_new ) ) def onerror_fct(log): - make_log_useful.make_log_useful(log, "ERROR", config) + config_metadata = config_definitions = yaml.safe_load(open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r")) + log_path_new = make_log_useful.make_log_useful(log, "ERROR", config, config_metadata) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {{log}}'.format( - config["version"], config["data_location"], config["email"] + 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {}'.format( + config["version"], config["data_location"], config["email"], log_path_new ) ) diff --git a/workflow/scripts/GC/GC_correction.R b/workflow/scripts/GC/GC_correction.R index 030c7a48..50e7a107 100644 --- a/workflow/scripts/GC/GC_correction.R +++ b/workflow/scripts/GC/GC_correction.R @@ -1,60 +1,54 @@ -# fetch arguments -args = commandArgs(trailingOnly = T) - -# # checking arguments -# if (length(args) < 3) { -# message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz optional(plots.pdf)") -# stop() -# } -# for (a in seq(2)) { -# if (!file.exists(args[a])) { -# message(paste(args[a], "does not exists or cannot be found.")) -# stop() -# } -# } - -#args[1] <- '/data/r-workspace/strandseq_utils/H3JNHAFX3_MNIphotoconverted_200000_fixed.norm.txt.gz' -#args[2] <- '/data/r-workspace/strandseq_utils/GC_matrix_200000.txt' -print(snakemake@params[["gc_matrix"]]) +# SET ARGUMENTS +input_path <- snakemake@input[["counts_scaled"]] +gc_path <- snakemake@params[["gc_matrix"]] +save_path <- snakemake@output[["counts_scaled_gc"]] +plot <- TRUE +min_reads <- snakemake@params[["gc_min_reads"]] # <- 5 +n_subsample <- snakemake@params[["gc_n_subsample"]] # <- 1000 + +print(gc_path) + # open files -counts <- data.table::fread(snakemake@input[["counts_vst"]], header = T) -# counts <- data.table::fread(args[1], header = T) -GC_matrix <- data.table::fread(snakemake@params[["gc_matrix"]], header = T) -# GC_matrix <- data.table::fread(args[2], header = T) -save_path <- snakemake@output[["counts_vst_gc"]] -# save_path <- args[3] +counts <- data.table::fread(input_path, header = T) +GC_matrix <- data.table::fread(gc_path, header = T) + +# reformat GC_matrix +# find column containing GC counts and rename to 'GC%' + +idx <- which(grepl("GC", colnames(GC_matrix), fixed = TRUE)) +colnames(GC_matrix)[[idx]] <- "GC%" # check GC plots -# plot <- ifelse(is.na(args[4]), FALSE, TRUE) -plot <- FALSE -# if (plot) { -# # import libraries -# library(ggplot2) -# 
} +if (plot) { + # import libraries + library(ggplot2) + library(ggpubr) +} + # check files -if (!all(c('cell', 'chrom', 'start', 'end', 'w', 'c') %in% colnames(counts))) { +if (!all(c("cell", "chrom", "start", "end", "w", "c") %in% colnames(counts))) { message("count file does not contain required columns: 'cell', 'chrom', 'start', 'end', 'w', 'c'") message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz") stop() } -if (!all(c('chrom', 'start', 'end', 'GC%') %in% colnames(GC_matrix))) { +if (!all(c("chrom", "start", "end", "GC%") %in% colnames(GC_matrix))) { message("GC_matrix file does not contain required columns: 'chrom', 'start', 'end', 'GC%'") message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz") stop() } -if (! (all(unique(counts$chrom) %in% unique(GC_matrix$chrom)) & - all(unique(counts$start) %in% unique(GC_matrix$start)) & - all(unique(counts$end) %in% unique(GC_matrix$end)))) { +if (!(all(unique(counts$chrom) %in% unique(GC_matrix$chrom)) & + all(unique(counts$start) %in% unique(GC_matrix$start)) & + all(unique(counts$end) %in% unique(GC_matrix$end)))) { message("bin features ('crhom', 'start', 'end') do not match between count file and GC matrix") message("make sure to choose files with identical bin sizes") } # green light message -message(paste("\ncount file:", args[1])) -message(paste("GC matrix file:", args[2])) -message(paste("savepath:", args[3])) +# message(paste("\ncount file:", args[1])) +# message(paste("GC matrix file:", args[2])) +# message(paste("savepath:", args[3])) message("preprocessing...\n") @@ -67,181 +61,98 @@ counts$cell <- as.factor(counts$cell) # convert strandseq count file to count matrix counts$tot_count <- counts$c + counts$w -count_matrix <- reshape2::dcast(counts, chrom+start+end ~ cell, value.var = "tot_count") - -# add GC fractions to count_matrix not to lose matching order -count_matrix <- merge(count_matrix, GC_matrix[,c('chrom', 'start', 'end', 'GC%')], by=c('chrom', 'start', 'end'), all.x = T) -count_matrix$`GC%` <- as.numeric(count_matrix$`GC%`) - -# filter unavailable GC fractions -fil <- apply(count_matrix, MARGIN = 1, FUN = (function(x) any(is.na(x)))) -count_matrix.filt <- count_matrix[!fil,] -message(paste((dim(count_matrix)[1]-dim(count_matrix.filt)[1]), 'bins out of', dim(count_matrix.filt)[1], 'with NA values')) - ###################### # GC bias correction # ###################### -# set up the matrix -count_matrix.corrected <- data.frame(count_matrix.filt) - -chrom_ids <- gsub('chr', '', unique(count_matrix.corrected$chrom)) -autosomes <- !sapply(as.numeric(chrom_ids), is.na) -chr_order <- c(order(as.numeric(chrom_ids[autosomes])), sum(autosomes) + order(chrom_ids[!autosomes])) - -count_matrix.corrected$chrom <- factor(count_matrix.corrected$chrom, levels=paste('chr', chrom_ids[chr_order], sep='')) - -# collect plots -plots_meta <- data.frame() -plots <- list() -n <- 1 - -# correction chromosome by chromosome -for (chr in unique(count_matrix.corrected$chrom)) { - - # subset one chromosome - message(paste('correction', chr)) - count_matrix.sub <- count_matrix.corrected[count_matrix.corrected$chrom == chr,] - - # correction cell by cell - for (cell_n in seq(4,(ncol(count_matrix.sub)-1))) { - - # get counts and GC percentage for the current cell - cell_id <- colnames(count_matrix.sub)[cell_n] - cell_counts_raw <- as.numeric(count_matrix.sub[,(cell_id)]) - GC.frac_raw <- as.numeric(count_matrix.sub[,"GC."]) - - # filter out count 0 for better fitting - df <- 
data.frame(cbind(cell_counts_raw, GC.frac_raw)) - df <- df[df$cell_counts_raw > 0 & df$GC.frac_raw >.3, ] - cell_counts <- df$cell_counts_raw - GC.frac <- df$GC.frac_raw - - # logarithm of the counts centered on the mean - log.norm.counts_raw <- log2( (cell_counts_raw+1) / mean(cell_counts_raw+1) ) - log.norm.counts <- log2( (cell_counts+1) / mean(cell_counts+1) ) - - # check bins - if (length(GC.frac) < 10 | length(log.norm.counts) < 10) { - message(paste('skipping', chr, 'number of bins < 10')) - next - } - - # fit curve before correction - fit_before <- smooth.spline(GC.frac, log.norm.counts, df = 6) - if(plot) { - #plot_title <- 'before correction' - #plot(GC.frac, log.norm.counts, main=plot_title, xlab="GC%", ylab="log2(counts/mean)") - #lines(fit_before, col="red", lwd=2) - plotdf <- data.frame(log.norm.counts, GC.frac) - plotline <- data.frame(fit_before$x, fit_before$y) - - p <- ggplot(plotdf, aes(GC.frac, log.norm.counts)) + - geom_point(size=1, alpha=.2) + - ggtitle(paste(chr, "before")) + - geom_line(data = plotline, aes(fit_before.x, fit_before.y), color="red") - plots_meta[n, 1:4] <- list("index" = n, "chrom" = chr, "cell" = cell_id, "type" = "before") - plots[[n]] <- p - n <- n+1 - } - - # correct values for raw (pre-filtering) data - pred <- predict(fit_before, GC.frac_raw) - log.norm.corrected <- log.norm.counts_raw - pred$y - - # convert log of normalized counts to counts - counts.corrected <- (2 ^ log.norm.corrected) * mean(count_matrix.sub[,cell_id]+1) -1 - - # fit curve after correction - if(plot) { - # curve fit after correction is needed only for plotting - fit_after <- smooth.spline(GC.frac_raw, log.norm.corrected, df = 6) - #plot_title <- 'after correction' - #plot(GC.frac_raw, log.norm.corrected, main=plot_title, xlab="GC%", ylab="log2(counts/mean)") - #lines(fit_after, col="red", lwd=2) - - plotdf <- data.frame(log.norm.corrected, GC.frac_raw) - plotline <- data.frame(fit_after$x, fit_after$y) - p <- ggplot(plotdf, aes(GC.frac_raw, log.norm.corrected)) + - geom_point(size=1, alpha=.2) + - ggtitle(paste(chr, "after")) + - geom_line(data = plotline, aes(fit_after.x, fit_after.y), color="red") - plots_meta[n, 1:4] <- list("index" = n, "chrom" = chr, "cell" = cell_id, "type" = "after") - plots[[n]] <- p - n <- n+1 - } - - # write data to the table - count_matrix.corrected[count_matrix.corrected$chrom == chr, cell_id] <- counts.corrected - } -} +counts <- merge(counts, GC_matrix[, c("chrom", "start", "GC%")], by = c("chrom", "start"), all.x = T) -################### -# POST PROCESSING # -################### +# filter data for subsampling +c <- counts[counts$tot_count >= min_reads] +if (dim(c)[[1]] == 0) { + stop(paste("there are no bins with more than", min_reads, "reads")) +} +c$`GC%` <- as.numeric(c$`GC%`) +c$log_count_norm <- log(c$tot_count) - log(median(c$tot_count)) +not.na <- !is.na(c$`GC%`) +s <- c[not.na] + +# subsample from quantiles +s$GC_bin <- cut(s$`GC%`, breaks = c(quantile(s$`GC%`, probs = seq(0, 1, by = 1 / 10))), labels = seq(1, 10, by = 1), include.lowest = TRUE) + +subsample <- data.frame() +for (i in seq(10)) { + sbin <- s[s$GC_bin == i] + m <- min(dim(sbin)[1], n_subsample) + sa <- sbin[sample(nrow(sbin), size = m), ] + subsample <- rbind(subsample, sa) +} -# table of corrected data -counts.corrected <- reshape2::melt(count_matrix.corrected, id.vars = c("chrom", "start", "end"), - measure.vars = colnames(count_matrix.corrected)[seq(4,(ncol(count_matrix.corrected)-1))], - value.name = "GC_tot_count", variable.name='cell') 
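The per-chromosome `smooth.spline` correction deleted above is replaced, in the hunk that follows, by a single lowess fit of log-normalized counts against GC fraction on a quantile-binned subsample. As a quick illustration of that approach, here is a minimal self-contained sketch on synthetic data (all variable names, the bias shape, and the `+1` pseudocount are illustrative only; the script itself works on filtered per-bin Strand-seq counts):

```r
# Toy lowess-based GC correction, mirroring the logic introduced below:
# fit log(count / median) against GC fraction, then divide the fitted bias out.
set.seed(1)
gc <- runif(5000, 0.30, 0.60)            # per-bin GC fraction
bias <- exp(-8 * (gc - 0.45)^2)          # synthetic GC bias curve
cnt <- rpois(5000, lambda = 50 * bias)   # GC-biased read counts

log_norm <- log(cnt + 1) - log(median(cnt + 1))
fit <- lowess(gc, log_norm)              # smooth bias estimate on a sorted grid

# look up the fitted bias at the closest grid point for every bin
idx <- sapply(gc, function(g) which.min(abs(fit$x - g)))
corrected <- exp(log_norm - fit$y[idx]) * median(cnt + 1)
```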
+############################# +# lowess fit and correction # +############################# +# lowess fit +z <- lowess(subsample$`GC%`, subsample$log_count_norm) + +# ################ +# # SAVING PLOTS # +# ################ + +# adjust tot count to closest predicted GC value +idxs <- sapply(as.numeric(counts$`GC%`), FUN = function(a) { + which.min(abs(z$x - a)) +}) +idxs[lapply(idxs, length) == 0] <- NA +counts$pred <- z$y[unlist(idxs)] +counts$tot_count_gc <- log(counts$tot_count / median(counts$tot_count)) - counts$pred +counts$tot_count_gc <- exp(counts$tot_count_gc) * median(counts$tot_count) -# force cell column to factor -counts$cell <- as.factor(counts$cell) -counts.corrected$cell <- as.factor(counts.corrected$cell) +if (plot) { + sidxs <- sapply(as.numeric(subsample$`GC%`), FUN = function(a) { + which.min(abs(z$x - a)) + }) + sidxs[lapply(sidxs, length) == 0] <- NA -# merge corrected data to original data -counts.full <- merge(counts, counts.corrected, by=c("chrom", "start", "end", "cell"), all.x=T) -# ratio of the corrected and raw counts -counts.full$ratio <- counts.full$GC_tot_count/counts.full$tot_count -counts.full[is.na(counts.full$ratio)]$ratio <- 1 -counts.full[is.infinite(counts.full$ratio)]$ratio <- 0 + subsample$pred <- z$y[unlist(sidxs)] + subsample$tot_count_gc <- log(subsample$tot_count / median(subsample$tot_count)) - subsample$pred + subsample$tot_count_gc <- exp(subsample$tot_count_gc) * median(subsample$tot_count) -# adjust watson, crick and tot_counts -counts.full$w <- counts.full$w * counts.full$ratio -counts.full$c <- counts.full$c * counts.full$ratio -counts.full$tot_count <- counts.full$w + counts.full$c + z$y2 <- exp(z$y) * median(subsample$tot_count) + ymin <- min(cbind(subsample$tot_count, subsample$tot_count_gc)) + ymax <- max(cbind(subsample$tot_count, subsample$tot_count_gc)) -# saving -message("saving...\n") -data.table::fwrite(counts.full[,c('chrom', 'start', 'end', 'sample', 'cell', 'w', 'c', 'class', 'tot_count')], save_path) + p1 <- ggplot(subsample, aes(`GC%`, tot_count)) + + geom_point(size = 1, alpha = .2) + + ggtitle("raw") + + ylim(ymin, ymax) + + xlab("GC_content") + + ylab("read count") + + geom_line(data = as.data.frame(z), aes(x, y2), color = "red") + p2 <- ggplot(subsample, aes(`GC%`, tot_count_gc)) + + geom_point(size = 1, alpha = .2) + + ggtitle("gc corrected") + + ylim(ymin, ymax) + + xlab("GC content") + + ylab("read count") + + corr_plot <- ggarrange(p1, p2) -################ -# SAVING PLOTS # -################ -if (plot) { - - # define layout matrix - layout_matrix <- matrix(seq(1,48), 6,8) - # last extra squares for the dummy - layout_matrix[5:6,8] <- 47 - - # collect plots in right order - n <-1 - index_order <- c() - for (cell_id in unique(plots_meta$cell)) { - # add dummy plot with cell name - plots[[length(plots) + n]] <- ggplot() + - annotate("text", size=4, x=0, y=0, label = cell_id, angle=-90) + - theme_void() - - # retrieve indexes - subdf <- plots_meta[plots_meta$cell == cell_id,] - index_order <- c(index_order, as.numeric(rownames(subdf)), length(plots)) - n <- n+1 - } - - message("saving plots, please wait...") - # generate the grid - grid <- gridExtra::marrangeGrob(grobs = plots[index_order], - ncol = 8, nrow = 6, - top="GC correction plots", - layout_matrix = layout_matrix) - # save plots - ggsave(args[4], grid, width=14, height=10) + ggsave(snakemake@output[["plot"]], corr_plot, width = 12, height = 6) } + +# adjust w, c and fill NAs +counts$w <- (counts$w * counts$tot_count / counts$tot_count_gc) +counts$c <- 
(counts$c * counts$tot_count / counts$tot_count_gc) +counts$w[is.na(counts$w)] <- 0 +counts$c[is.na(counts$c)] <- 0 +counts$tot_count[is.na(counts$tot_count)] <- 0 + +output <- counts[, c("chrom", "start", "end", "sample", "cell", "w", "c", "tot_count", "class")] + +data.table::fwrite(output, save_path) \ No newline at end of file diff --git a/workflow/scripts/GC/counts_scaling.R b/workflow/scripts/GC/counts_scaling.R deleted file mode 100644 index 88361319..00000000 --- a/workflow/scripts/GC/counts_scaling.R +++ /dev/null @@ -1,32 +0,0 @@ -# fetch arguments -args <- commandArgs(trailingOnly = T) - -# args[1] <- '/data/r-workspace/strandseq_utils/test_data/H3JNHAFX3_MNIphotoconverted.200000.VST.GC.txt.gz' - -# open counts -# counts <- data.table::fread(args[1]) -counts <- data.table::fread(snakemake@input[["counts_vst_gc"]]) - - -# calculate scale factor essentially checks the order of magnitude (log10) -# and rescales to the 100s -scale_factor <- (floor(log10(median(counts$w + counts$c))) - 2) * -1 - -message(paste("scale factor:", scale_factor)) - -# scaling -counts$w <- as.integer(counts$w * 10^scale_factor) -counts$c <- as.integer(counts$c * 10^scale_factor) - -if ("tot_count" %in% colnames(counts)) { - counts$tot_count <- as.integer(counts$w + counts$c) -} - -# filename -# tokens <- strsplit(args[1], "/")[[1]] -# basename <- tokens[length(tokens)] -# basename <- strsplit(args[1], ".txt.gz")[[1]] - -# save -# data.table::fwrite(counts, args[2]) -data.table::fwrite(counts, snakemake@output[["counts_vst_gc_scaled"]]) diff --git a/workflow/scripts/GC/gc.R b/workflow/scripts/GC/gc.R deleted file mode 100644 index 4c632ef9..00000000 --- a/workflow/scripts/GC/gc.R +++ /dev/null @@ -1,38 +0,0 @@ -library(ggplot2) -library(reshape2) - -# zcat out.tsv.gz | grep "^GC" > scripts/gc.table - -args = commandArgs(trailingOnly=TRUE) -# x = read.table(args[1], header=T) -x = read.table(snakemake@input[["table"]], header = T) -df = dcast(x, GCcontent ~ Sample) -refcol = (1:ncol(df))[colnames(df) == "Reference"] -if (refcol == 2){gc_mean = sum(df[,1]*df[,3])} -if (refcol == 3){gc_mean = sum(df[,1]*df[,2])} - -if (refcol == 3) { datacol = 2; } else { datacol = 3; } -df$observed = df[,datacol] / df[,refcol] -df$expected = 1.0 -df = df[df$GCcontent > 0.2 & df$GCcontent < 0.7,] -sse = sum((df$observed - df$expected)^2) -df = melt(df, id.vars=colnames(df[,1:3])) - -# png(args[2]) -png(snakemake@output[["gcdist_plot"]]) -p = ggplot(data=x, aes(x=GCcontent, y=fractionOfReads)) -p = p + geom_freqpoly(aes(color=Sample), stat="identity") -p = p + xlab("GC content") -p = p + ylab("Normalized count") -p = p + ggtitle(paste0("GC content mean = ", gc_mean)) -p -dev.off() - -# png(args[3]) -png(snakemake@output[["gcdevi_plot"]]) -p = ggplot(data=df, aes(x=GCcontent, y=value)) + geom_line(aes(color=variable)) -p = p + xlab("GC content") -p = p + ylab("Sample to reference") -p = p + ggtitle(paste0("SSE = ", sse)) -p -dev.off() \ No newline at end of file diff --git a/workflow/scripts/GC/library_size_normalisation.R b/workflow/scripts/GC/library_size_normalisation.R new file mode 100644 index 00000000..dba917e0 --- /dev/null +++ b/workflow/scripts/GC/library_size_normalisation.R @@ -0,0 +1,115 @@ +# Median of ratios normalization + +# fetch arguments +# args <- commandArgs(trailingOnly = T) + +# checking arguments +# if (length(args) != 2) { +# message("Usage: Rscript GC_correction.R count-file.txt.gz gc-matrix.txt output.txt.gz") +# stop() +# } +# if (!file.exists(args[1])) { +# message(paste(args[1], "does not exists or 
cannot be found.")) +# stop() +# } + +# args[1] <- '/data/projects/strandseq_segmentation/dataset/counts/H3JNHAFX3_MNIphotoconverted_200000_fixed.txt.gz' + +# open files +# counts <- data.table::fread(args[1]) +counts <- data.table::fread(snakemake@input[["counts"]], header = T) +save_path <- snakemake@output[["counts_scaled"]] +# save_path <- args[3] +# min_reads <- snakemake@params[["gc_min_reads"]] + +info_raw <- data.table::fread(snakemake@input[["info_raw"]], skip = 13, header = T, sep = "\t") +# info_raw <- data.table::fread(args[2]) + +min_reads <- min(info_raw[info_raw$pass1 == 1, ]$good) - 1 + +################# +# Preprocessing # +################# + +# force cell column to factor +counts$cell <- as.factor(counts$cell) + +# add tot counts +counts$tot_count <- counts$c + counts$w + + +# filter cells with too low counts +counts_bycell <- as.data.frame(aggregate(counts$tot_count, by = list(Category = counts$cell), FUN = sum)) +sel_cells <- counts_bycell[counts_bycell$x >= as.integer(min_reads), "Category"] +if (length(sel_cells) == 0) { + stop(paste("there are no cells with more than", min_reads, "total reads")) +} +counts <- counts[counts$cell %in% sel_cells, ] + +# convert to bin matrix +count_matrix_raw <- reshape2::dcast(counts, chrom + start + end ~ cell, value.var = "tot_count") +count_matrix <- as.data.frame(count_matrix_raw) + +################# +# Normalization # +################# + +message(paste("library size normalization for", snakemake@input[["counts"]])) +# take the log of counts +count_matrix[4:ncol(count_matrix)] <- log(count_matrix[4:ncol(count_matrix)]) + +# calculate mean of the log counts per bin +count_matrix$mean_log_count <- apply(count_matrix[4:ncol(count_matrix)], MARGIN = 1, FUN = mean, na.rm = FALSE) + +# filter out infinity +count_matrix <- count_matrix[!is.infinite(count_matrix$mean_log_count), ] +if (dim(count_matrix)[[1]] == 0) { + stop("there are no common non-zero bins available across all cells.") +} + +# log of counts over mean per bin +count_matrix[4:ncol(count_matrix)] <- count_matrix[4:ncol(count_matrix)] - count_matrix$mean_log_count + +# median per cell of the per log of counts/mean per bin is the scaling factor +scaling_factors <- apply(count_matrix[4:ncol(count_matrix)], MARGIN = 2, FUN = median) + +# raise e to scaling factor +scaling_factors <- exp(scaling_factors) + +# rescale libraries +scaled_matrix <- data.table::data.table(count_matrix_raw) + +for (i in colnames(scaled_matrix)[4:ncol(scaled_matrix)]) { + scaled_matrix[[i]] <- scaled_matrix[[i]] / scaling_factors[i] +} + +################## +# Postprocessing # +################## + +# wide to long +norm_tot_counts <- reshape2::melt(scaled_matrix, + id.vars = c("chrom", "start", "end"), + measure.vars = colnames(scaled_matrix)[4:ncol(scaled_matrix)], + value.name = "norm_tot_count", variable.name = "cell" +) + +# merge with counts +cols <- colnames(counts) +counts <- merge(counts, norm_tot_counts, by = c("chrom", "start", "end", "cell"), all.x = TRUE) + +# adjust W and C counts to normalized counts +counts$ratio <- counts$norm_tot_count / counts$tot_count +counts$ratio[is.na(counts$ratio)] <- 0 +counts$w <- counts$w * counts$ratio +counts$c <- counts$c * counts$ratio +counts$tot_count <- counts$norm_tot_count +# fill na +counts$w[is.na(counts$w)] <- 0 +counts$c[is.na(counts$c)] <- 0 +counts$tot_count[is.na(counts$tot_count)] <- 0 + + +# saving +message("saving...\n") +data.table::fwrite(counts[, ..cols], save_path) diff --git a/workflow/scripts/GC/variance_stabilizing_transformation.R 
b/workflow/scripts/GC/variance_stabilizing_transformation.R index eb0996a5..2b71a1d3 100644 --- a/workflow/scripts/GC/variance_stabilizing_transformation.R +++ b/workflow/scripts/GC/variance_stabilizing_transformation.R @@ -1,105 +1,36 @@ -library(edgeR) -library(ggplot2) - -# fetch arguments -args = commandArgs(trailingOnly = T) - -filter <- ifelse('filter' %in% args, TRUE, FALSE) -chosen_transform <- ifelse('anscombe' %in% args, 'anscombe', - ifelse('laubschner' %in% args, 'laubschner', 'anscombe')) - +# set arguments +input_path <- snakemake@input[["counts_scaled_gc"]] +save_path <- snakemake@output[["counts_scaled_gc_vst"]] +chosen_transform <- "anscombe" +plot <- TRUE +rescale <- TRUE + +# PRE-PROCESSING DATA # open count file -#args[1] <- '/data/strandseq_datasets/lib_test/HCLLVAFX3_iPSCs.200000.LIB.txt.gz' -# counts_raw <- data.table::fread(args[1]) -counts_raw <- data.table::fread(snakemake@input[["counts"]]) +counts_raw <- data.table::fread(input_path) # force cell column to factor counts_raw$cell <- as.factor(counts_raw$cell) # fuse bin coordinates -counts_raw$bin <- paste(counts_raw$chrom, counts_raw$start, counts_raw$end, sep='_') +counts_raw$bin <- paste(counts_raw$chrom, counts_raw$start, counts_raw$end, sep = "_") # add tot counts counts_raw$tot_count <- counts_raw$c + counts_raw$w -# sum of counts per cell -sum_counts <- aggregate(counts_raw$tot_count, by= list(counts_raw$cell), FUN = sum) -colnames(sum_counts) <- c('cell', 'sum') - - -# centromere read spikes exclusion - -exclude_centromeres <- function(counts_raw, exclusion_list) { - exclusion_list <- GenomicRanges::GRanges(seqnames = exclusion_list$chrom, ranges = IRanges::IRanges(start = exclusion_list$start, end=exclusion_list$end)) - counts_iranges <- GenomicRanges::GRanges(seqnames = counts_raw$chrom, ranges = IRanges::IRanges(start = counts_raw$start, end=counts_raw$end)) - - overlaps <- data.table::as.data.table(GenomicRanges::findOverlaps(exclusion_list, counts_iranges)) - counts <- counts_raw - counts <- counts[-overlaps$subjectHits,] - return(counts) -} - -#args[3] <- '/data/r-workspace/strandseq_utils/large_centromere_blacklist.csv' - -if (!is.na(args[3])) { - message('blacklisting...') - counts <- exclude_centromeres(counts_raw, data.table::fread(args[3])) -} else { - counts <- counts_raw -} +counts <- counts_raw # convert to bin count matrix to_matrix <- function(counts) { + # fuse bin coordinates + counts$bin <- paste(counts$chrom, counts$start, counts$end, sep = "_") mat_tot <- reshape2::dcast(counts, bin ~ cell, value.var = "tot_count") rownames(mat_tot) <- mat_tot$bin - mat_tot <- mat_tot[,2:ncol(mat_tot)] - - return(mat_tot) -} - -mat_tot <- to_matrix(counts) - -generate_dgelist <- function(mat_tot, filter) { - - # create the DGEList object - # all samples belong to group 1 - y.raw <- DGEList(as.matrix(mat_tot), group = rep(1, ncol(mat_tot))) - - # filter out bins with too low counts to be informative - if (filter) { - keep.exprs <- edgeR::filterByExpr(y.raw, group=y.raw$samples$group) - y <- y.raw[keep.exprs, keep.lib.sizes=FALSE] - message(paste('filtering out', (dim(y.raw)[1]-dim(y)[1]), 'bins out of', dim(y.raw)[1], "due to low count")) - } else { - y <- y.raw - message('no filtering') - } - - # calculate library size and composition normalization - y <- calcNormFactors(y) - - return(y) -} + mat_tot <- mat_tot[, 2:ncol(mat_tot)] -estimate_dispersion <- function(y) { - - # estimate common dispersion - # default settings for DGEList objects - message('Estimating common dispersion with edgeR...') - disp 
<- estimateDisp(y, design=NULL, prior.df=NULL, trend.method="locfit", tagwise=TRUE, - span=NULL, min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE, - winsor.tail.p=c(0.05,0.1), tol=1e-06) - phi <- disp$common.dispersion - message(paste("phi =",phi)) - write(paste(args[1], phi, sep="\t"), './log.txt', append=TRUE) - - return(phi) + return(mat_tot) } -y <- generate_dgelist(mat_tot, filter) -phi <- estimate_dispersion(y) - - # VARIANCE STABILIZING TRANSFORMATION # TRANSFORMS @@ -107,77 +38,152 @@ phi <- estimate_dispersion(y) # Anscombe, 1948 # Laubschner, 1961 anscombe_transform <- function(x, phi) { - a <- x + (3/8) - b <- (1/phi) - (3/4) - c <- sqrt(a/b) - y <- sinh(c) + a <- x + (3 / 8) + b <- (1 / phi) - (3 / 4) + c <- sqrt(a / b) + y <- asinh(c) return(y) } laubscher_transform <- function(x, phi) { a <- sqrt(phi) - b <- sinh( sqrt( x/phi ) ) - c <- sqrt( phi-1 ) + b <- asinh(sqrt(x / phi)) + c <- sqrt(phi - 1) d <- anscombe_transform(x, phi) - y <- a*b + c*d + y <- a * b + c * d return(y) } -transform_list <- list('anscombe'= anscombe_transform, 'laubscher'= laubscher_transform) +transform_data <- function(counts, transform, phi) { + cols <- colnames(counts) + counts$tot_count_corr <- transform(counts$tot_count, phi) + counts$f <- counts$tot_count_corr / counts$tot_count + counts$w <- counts$w * counts$f + counts$c <- counts$c * counts$f + counts$tot_count <- counts$tot_count_corr + counts <- counts[, ..cols] + + return(counts) +} +transform_list <- list("anscombe" = anscombe_transform, "laubscher" = laubscher_transform) transform <- transform_list[[chosen_transform]] + +disp_score <- function(counts, transform, phi, design = NULL) { + counts$tot_count <- transform(counts$tot_count, phi) + mat <- to_matrix(counts) + # if multiple samples are present design matrix can be used + if (is.null(design)) { + design <- matrix(1, ncol = 1, nrow = ncol(mat)) + } + res <- as.matrix(mat) %*% MASS::Null(design) + rsd <- sqrt(rowMeans(res * res)) + score <- sd(rsd) / mean(rsd) + return(score) +} + message(paste("Transforming data with", chosen_transform, "VST")) -#ans <- transform(y$counts, phi) -ans <- transform(to_matrix(counts_raw), phi) - -# convert to count table -ans <- as.data.frame(ans) -ans$bin <- rownames(ans) -d <- reshape2::melt(ans, id.vars='bin', measure.vars=colnames(ans), variable.name='cell', value.name = 'tot_count_corr') - -sum2 <- apply(ans[,1:ncol(ans)-1], MARGIN = 2, FUN = sum) -sum2 <- data.table::data.table(cell = names(sum2), sum2 = sum2) -sum_counts <- merge(sum_counts, sum2, by = c('cell'), all.x = T) -sum_counts$f <- sum_counts$sum / sum_counts$sum2 - -d <- merge(d, sum_counts[,c('cell', 'f')], by = c('cell'), all.x = T) -d$tot_count_corr <- as.numeric(d$tot_count_corr) * d$f -d <- d[,c('cell', 'bin', 'tot_count_corr')] - -bins <- stringr::str_split_fixed(d$bin, '_', 3) -colnames(bins) <- c('chrom', 'start', 'end') -d[c('chrom', 'start', 'end')] <- bins -d$start <- as.numeric(d$start) -d$end <- as.numeric(d$end) - -merged.raw <- merge(counts_raw, d[c('chrom', 'start', 'end', 'cell', 'tot_count_corr')], - by=c('chrom', 'start', 'end', 'cell'), all.x=T) - -fil <- apply(merged.raw, MARGIN = 1, FUN = (function(x) any(is.na(x)))) -#merged.raw[fil, 'tot_count_corr'] <- 0 -merged <- data.table::data.table(merged.raw) - -# adjust W and C counts to corrected tot_counts -merged$tot_count_corr <- as.numeric(merged$tot_count_corr) -merged$ratio <- merged$tot_count_corr / merged$tot_count -merged$w <- merged$w * merged$ratio -merged$c <- merged$c * merged$ratio 
-merged$w[is.na(merged$w)] <- 0 -merged$c[is.na(merged$c)] <- 0 -merged$tot_count <- merged$tot_count_corr - -message('saving...') -df <- data.table::data.table(merged[,c('chrom', 'start', 'end', 'sample', 'cell', 'w', 'c', 'class', 'tot_count')]) - -#s1 <- aggregate(counts_raw$tot_count, by= list(counts_raw$cell), FUN = sum) -#s2 <- aggregate(df$tot_count, by= list(df$cell), FUN = sum) -#s3 <- merge(s1, s2, by=c('Group.1'), all.x=T) -#s3$d <- s3['x.x'] - s3['x.y'] -#df - -#args[2] <- '/data/r-workspace/strandseq_utils/vst_test.txt.gz' -# data.table::fwrite(df, args[2]) -data.table::fwrite(df, snakemake@output[["counts_vst"]]) - -#scTRIPmultiplot::multiplot(args[2], chromosome = 'chr8', cell_id = 'MNIx260521x01PE20505') -#scTRIPmultiplot::multiplot(args[1], chromosome = 'chr8', cell_id = 'MNIx260521x01PE20505') -#path <- '/data/strandseq_datasets/H22WGAFX3_MNIx260521.200000.VST.txt.gz' -#scTRIPmultiplot::multiplot(path, chromosome = 'chr8', cell_id = 'MNIx260521x01PE20505') + +# estimate dispersion by residual variance + +opt <- optimize(disp_score, counts = counts, transform = transform, interval = c(0.00001, 1)) +phi <- opt$minimum +message(paste("Estimated dispersion - phi: ", phi)) + +# correction +corr_counts <- transform_data(counts, transform, phi) +corr_counts <- data.table::data.table(corr_counts[, c("chrom", "start", "end", "sample", "cell", "w", "c", "class", "tot_count")]) + +rescale_data <- function(counts_original, counts_transformed) { + rescaled <- counts_transformed + rescaled_med <- aggregate(rescaled$tot_count, list(rescaled$cell), FUN = median) + original_med <- aggregate(counts_original$tot_count, list(counts_original$cell), FUN = median) + m <- merge(x = original_med, y = rescaled_med, by = "Group.1", suffixes = c("_raw", "_norm")) + m[["f"]] <- m[["x_raw"]] / m[["x_norm"]] + + rescaled <- merge(rescaled, m[c("Group.1", "f")], by.x = "cell", by.y = "Group.1") + + rescaled$tot_count <- rescaled$tot_count * rescaled$f + rescaled$w <- rescaled$w * rescaled$f + rescaled$c <- rescaled$c * rescaled$f + return(rescaled) + +} + +if (rescale == TRUE) { + corr_counts <- rescale_data(counts_raw, corr_counts) +} + + +message("saving...") +data.table::fwrite(corr_counts, save_path) + +if (plot) { + library(ggplot2) + library(ggpubr) + + merge_bins <- function(df, bin_size = 3e6) { + df <- df[with(df, order(cell, chrom, start))] + + df$bin_group <- df$start %/% bin_size + + + w <- aggregate(df$w, by = list(df$cell, df$chrom, df$bin_group), FUN = sum) + c <- aggregate(df$c, by = list(df$cell, df$chrom, df$bin_group), FUN = sum) + s <- aggregate(df$start, by = list(df$cell, df$chrom, df$bin_group), FUN = function(x) x[[1]]) + e <- aggregate(df$end, by = list(df$cell, df$chrom, df$bin_group), FUN = function(x) x[[length(x)]]) + cl <- aggregate(df$class, by = list(df$cell, df$chrom, df$bin_group), FUN = function(x) names(sort(table(x), decreasing = TRUE))[[1]]) + + m <- merge(w, c, by = c("Group.1", "Group.2", "Group.3")) + m <- merge(m, s, by = c("Group.1", "Group.2", "Group.3")) + m <- merge(m, e, by = c("Group.1", "Group.2", "Group.3")) + m <- merge(m, cl, by = c("Group.1", "Group.2", "Group.3")) + colnames(m) <- c("cell", "chrom", "bin_group", "w", "c", "start", "end", "class") + m$sample <- unique(df$sample)[[1]] + m <- data.table::as.data.table(m) + m <- m[with(m, order(cell, chrom, start))] + + m$tot_count <- m$w + m$c + return(m) + } + + wf_plot <- function(m) { + m$wf <- m$w / m$tot_count + + p1 <- ggplot(m, aes(x = tot_count, y = wf)) + + geom_point(size = 1, alpha = .1, 
shape = 16) +
+      xlab("tot count") +
+      ylab("watson fraction") +
+      ylim(0, 1)
+    return(p1)
+  }
+
+  p1 <- ggplot(counts_raw, aes(x = tot_count)) +
+    geom_histogram(bins = 256) +
+    ggtitle("raw") +
+    xlab("read count") +
+    ylab("bin count")
+
+  p2 <- ggplot(corr_counts, aes(x = tot_count)) +
+    geom_histogram(bins = 256) +
+    ggtitle(paste(chosen_transform, "VST")) +
+    xlab("read count") +
+    ylab("bin count")
+
+  m <- merge_bins(counts_raw)
+  p3 <- wf_plot(m) + ggtitle("raw")
+
+  n <- merge_bins(corr_counts)
+  p4 <- wf_plot(n) + ggtitle(paste(chosen_transform, "VST"))
+
+
+  corr_plot <- ggarrange(p1, p2, p3, p4)
+
+  ggsave(snakemake@output[["plot"]], corr_plot, width = 12, height = 6)
+}
\ No newline at end of file
diff --git a/workflow/scripts/utils/make_log_useful.py b/workflow/scripts/utils/make_log_useful.py
index ef62fc84..bbe56250 100644
--- a/workflow/scripts/utils/make_log_useful.py
+++ b/workflow/scripts/utils/make_log_useful.py
@@ -1,8 +1,10 @@
 import sys, os
 
 
-def make_log_useful(log_path, status, config):
-
+def make_log_useful(log_path, status, config, config_definitions):
+    logs_processed_dir = "/".join(log_path.split("/")[:-1]) + "/processed_logs_for_mail/"
+    os.makedirs(logs_processed_dir, exist_ok=True)
+    log_path_new = "/".join(log_path.split("/")[:-1]) + "/processed_logs_for_mail/" + log_path.split("/")[-1]
     error_buffer = []
     record = 0
     with open(log_path, "r") as logfile:
@@ -25,12 +27,8 @@ def make_log_useful(log_path, status, config):
         else:
             continue
 
-    with open(log_path, "w") as logfile:
-        _ = logfile.write("\n".join(error_buffer))
-        _ = logfile.write("\n\n")
-
     my_env = dict(os.environ)
-    with open(log_path, "a") as logfile:
+    with open(log_path_new, "w") as logfile:
         _ = logfile.write("=======[{}]=======\n".format(status))
         _ = logfile.write("\n===[{}]===\n".format("Infrastructure information"))
         _ = logfile.write("Host: {}\n".format(my_env.get("HOST", "N/A")))
@@ -41,20 +39,49 @@ def make_log_useful(log_path, status, config):
         _ = logfile.write("Screen: {}\n".format(my_env.get("STY", "N/A")))
         _ = logfile.write("Conda ENV: {}\n".format(my_env.get("CONDA_DEFAULT_ENV", "N/A")))
 
-        _ = logfile.write("\n===[{}]===\n".format("Workflow information"))
-        _ = logfile.write("smk-wf-catalog/mosacaitcher-pipeline v{version}\n".format(version=str(config["version"])))
-        _ = logfile.write("Folder to processed : {}\n".format(str(config["data_location"])))
-        _ = logfile.write("Multistep normalisation module : {}\n".format(str(config["multistep_normalisation"])))
-        _ = logfile.write("Read Counts HGSVC-based normalization : {}\n".format(str(config["hgsvc_based_normalized_counts"])))
-        _ = logfile.write("Binning window size : {}\n".format(str(config["window"])))
-        _ = logfile.write("Ashleys-QC preprocessing pipeline : {}\n".format(str(config["ashleys_pipeline"])))
-        _ = logfile.write("BAM folder old format (all/selected) : {}\n".format(str(config["input_bam_legacy"])))
-        _ = logfile.write("List of chromosomes processed : {}\n".format(str(config["chromosomes"])))
-        _ = logfile.write("Reference genome selected : {}\n".format(config["reference"]))
-        _ = logfile.write("\n")
+        # Define the categories
+        categories = {
+            "General": ["email", "data_location", "reference", "samples_to_process"],
+            "Ashleys-QC parameters": [
+                "input_bam_legacy",
+                "ashleys_pipeline",
+                "ashleys_pipeline_only",
+                "ashleys_threshold",
+                "hand_selection",
+                
"MultiQC", + ], + "Counts processing": ["blacklist_regions", "window"], + "Normalization": [ + "multistep_normalisation", + "multistep_normalisation_for_SV_calling", + "hgsvc_based_normalized_counts", + ], + "Advanced Settings": ["ashleys_threshold", "window", "chromosomes", "chromosomes_to_exclude"], + "Genecore": ["genecore", "genecore_date_folder", "genecore_prefix"], + "Downstream Modules": ["arbigent", "arbigent_bed_file", "scNOVA"], + } + + _ = logfile.write("\n=========================\n===[CONFIG PARAMETERS]===\n=========================\n") + # Log the configuration + for category, keys in categories.items(): + _ = logfile.write("\n===[{}]===\n".format(category)) + for key in keys: + if key in config: + _ = logfile.write("{}: {}\n".format(key, config[key])) + + if error_buffer: + with open(log_path_new, "a") as logfile: + _ = logfile.write("\nErrors recorded (can be related to OOM & NFS issues)") + _ = logfile.write("\n".join(error_buffer)) + _ = logfile.write("\n\n") - return + return log_path_new # if __name__ == "__main__": -# make_log_useful(sys.argv[1], sys.argv[2], sys.argv[3]) +# config = yaml.safe_load(open("config/config.yaml", "r")) +# config_definitions = yaml.safe_load(open("config/config_metadata.yaml"), "r") +# print(config) +# log_path = sys.argv[1] +# status = "SUCCESS" +# make_log_useful(log_path, status, config, config_definitions) From 7c1b7f23185a09182415dace9c3d4983e522d114 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 22 Aug 2023 15:28:52 +0200 Subject: [PATCH 2/5] pre-commit --- .pre-commit-config.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..065f372f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,21 @@ +repos: + - hooks: + - id: trailing-whitespace + - id: check-added-large-files + - id: check-yaml + # - args: + # - --branch + # - master + # id: no-commit-to-branch + repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.4.0 + - hooks: + - id: snakefmt + repo: https://github.com/snakemake/snakefmt + rev: 0.4.0 + - hooks: + - id: commitizen + stages: + - commit-msg + repo: https://github.com/commitizen-tools/commitizen + rev: v2.17.12 From 643c72438c6dc7ab944b9181249e750f90a72c20 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 22 Aug 2023 15:29:17 +0200 Subject: [PATCH 3/5] 2.2.1: fixes --- config/config.yaml | 4 +- .../Dockerfile-2.2.1.dockerfile | 227 ++++++++++++++++++ github-actions-runner/docker_procedure.md | 13 +- workflow/Snakefile | 4 +- workflow/rules/common.smk | 4 +- 5 files changed, 244 insertions(+), 8 deletions(-) create mode 100644 github-actions-runner/Dockerfile-2.2.1.dockerfile diff --git a/config/config.yaml b/config/config.yaml index 9571162c..78dc3d51 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,10 +3,10 @@ # -------------------------------------------------------- # MosaiCatcher version -version: 2.2.0 +version: 2.2.1 # Ashleys-QC pipeline version -ashleys_pipeline_version: 2.2.0 +ashleys_pipeline_version: 2.2.1 # Email for notifications about the pipeline's status email: "" diff --git a/github-actions-runner/Dockerfile-2.2.1.dockerfile b/github-actions-runner/Dockerfile-2.2.1.dockerfile new file mode 100644 index 00000000..d5f5feb1 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.1.dockerfile @@ -0,0 +1,227 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL 
io.github.snakemake.conda_env_hash="77eaa388d65d5205b87324fb0adb89561bc0e532a328995990a1d580aeb894ae" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/dev/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - 
pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba clean --all -y \ No newline at end of file diff --git a/github-actions-runner/docker_procedure.md b/github-actions-runner/docker_procedure.md index c2661b4b..f3ea5c61 100644 --- a/github-actions-runner/docker_procedure.md +++ b/github-actions-runner/docker_procedure.md @@ -34,7 +34,16 @@ conda activate snakemake snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True 
MultiQC=True --profile workflow/snakemake_profiles/local/conda/ -c 8 --forceall -n
 snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda_singularity/ -c 8 --forceall
 
+# Solved in
 
-# Solved in
+https://github.com/weber8thomas/workspace-images/tree/docker-apptainer-mamba
 
-https://github.com/weber8thomas/workspace-images/tree/docker-apptainer-mamba
\ No newline at end of file
+# Dockerfile creation
+
+snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipeline=True MultiQC=True --profile workflow/snakemake_profiles/local/conda/ -c 8 --containerize > Dockerfile
+
+# Docker commands
+
+docker login -u weber8thomas
+docker build --platform=linux/amd64 -t weber8thomas/mosaicatcher-pipeline:VERSION .
+docker push weber8thomas/mosaicatcher-pipeline:VERSION
diff --git a/workflow/Snakefile b/workflow/Snakefile
index e8e15c2b..980f9ca0 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -25,9 +25,9 @@ if config["ashleys_pipeline"] is True:
         github(
             "friendsofstrandseq/ashleys-qc-pipeline",
             path="workflow/Snakefile",
-            # tag=str(config["ashleys_pipeline_version"])
+            tag=str(config["ashleys_pipeline_version"])
             # branch="main",
-            branch="dev",
+            # branch="dev",
         )
     config: config
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index fbbb1e4d..999140ba 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -285,7 +285,7 @@ class HandleInput:
 
         # List of folders/files to not consider (restrict to samples only)
         l_to_process = [e for e in os.listdir(thisdir) if e not in exclude and e.endswith(".zip") is False]
-        print(l_to_process)
+        # print(l_to_process)
         if config["samples_to_process"]:
             l_to_process = [e for e in l_to_process if e in config["samples_to_process"]]
@@ -327,7 +327,7 @@ class HandleInput:
         complete_df = complete_df.sort_values(by=["Cell", "File"]).reset_index(
             drop=True
         )
-        print(complete_df)
+        # print(complete_df)
 
         return complete_df
 

From 9971816fadb237bcef87f707a8e80a918638807a Mon Sep 17 00:00:00 2001
From: Thomas Weber
Date: Tue, 22 Aug 2023 15:37:48 +0200
Subject: [PATCH 4/5] 2.2.1: fixes


From 05c67537c11f6e146ce345dddf7e3c82bcae9a72 Mon Sep 17 00:00:00 2001
From: Thomas Weber
Date: Tue, 22 Aug 2023 15:41:46 +0200
Subject: [PATCH 5/5] 2.2.1: fixes

---
 .gitpod.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitpod.yml b/.gitpod.yml
index 34ef39b6..76eafb53 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -31,6 +31,11 @@ tasks:
 
   # MOSAICATCHER
 
+  - name: Change cache location
+    command: |
+      mkdir -p ~/my_mamba_cache
+      export CONDA_PKGS_DIRS=~/my_mamba_cache
+
   - name: Create snakemake env
     command: mamba create -n snakemake -c bioconda -c conda-forge snakemake -y
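As a closing note on the normalisation chain introduced in PATCH 1/5: `library_size_normalisation.R` implements a median-of-ratios scaling (log counts per bin, per-bin mean as pseudo-reference, per-cell median of the log ratios as size factor). A minimal standalone sketch of that computation on a toy bin-by-cell matrix (toy data only; the script itself additionally filters low-count cells and reshapes from the long count table):

```r
# Median-of-ratios size factors, as in library_size_normalisation.R
set.seed(1)
m <- matrix(rpois(400, lambda = rep(c(20, 40, 60, 80), each = 100)), ncol = 4)

log_m <- log(m)
ref <- rowMeans(log_m)                    # per-bin pseudo-reference (mean of log counts)
keep <- is.finite(ref)                    # drop bins with a zero count in any cell
size_factors <- exp(apply(log_m[keep, , drop = FALSE] - ref[keep], 2, median))
scaled <- sweep(m, 2, size_factors, `/`)  # counts rescaled to a common library size
```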