
Merge pull request #38 from friendsofstrandseq/dev
Version 2.2.0
weber8thomas authored Aug 9, 2023
2 parents fd8f3f3 + 83de295 commit caba655
Showing 11 changed files with 385 additions and 207 deletions.
11 changes: 11 additions & 0 deletions afac/plot_plate.R.ipynb
@@ -0,0 +1,11 @@
{
"cells": [],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
66 changes: 66 additions & 0 deletions afac/plot_plate.dev.R
@@ -0,0 +1,66 @@
library(platetools)
library(ggplot2)
library(viridis)
library(dplyr)

## collect ASHLEYS prediction and count files
ashleys_data <- read.table(file = "/scratch/tweber/DATA/TMP/labels384.tsv", sep = "\t", header = TRUE)
plate_type <- nrow(ashleys_data) # infer plate size (96 or 384) from the number of cells
# ashleys_data = read.table(file = snakemake@input[["labels"]] , sep = '\t', header = TRUE)
ashleys_data <- dplyr::arrange(ashleys_data, cell)
colnames(ashleys_data)[1] <- "ashleys_id"

# plate_type <- snakemake@params[["plate_type"]] # assuming that the plate type (96 or 384) is passed in as a parameter

Well_position <- character()

if (plate_type == 96) {
for (i in 1:12)
{
for (j in 1:8)
{
tmp <- paste0(LETTERS[j], i)
Well_position <- c(Well_position, tmp)
}
}
} else if (plate_type == 384) {
for (i in 1:24)
{
for (j in 1:16)
{
tmp <- paste0(LETTERS[j], i)
Well_position <- c(Well_position, tmp)
}
}
}

pdf("TEST_ashleys_plate_predictions.pdf")
# pdf(snakemake@output[["predictions"]])

ashleys_data$Well_position <- Well_position

raw_map(
data = ashleys_data$prediction,
well = ashleys_data$Well_position,
plate = plate_type
) +
scale_fill_distiller(type = "div", palette = "RdYlGn", direction = 1) +
ggtitle(paste0("Sample: TEST | ASHLEYS binary predictions (cutoff=0.5)"))
# ggtitle(paste0("Sample: ", snakemake@wildcards[["sample"]], " | ASHLEYS binary predictions (cutoff=", snakemake@config[["ashleys_threshold"]], ")"))

dev.off()

pdf("TEST_ashleys_plate_probabilities.pdf")
# pdf(snakemake@output[["probabilities"]])
ashleys_data$Well_position <- Well_position

raw_map(
data = ashleys_data$probability,
well = ashleys_data$Well_position,
plate = plate_type
) +
scale_fill_distiller(type = "div", palette = "RdYlGn", direction = 1) +
ggtitle(paste0("Sample: ", "TEST", " | ASHLEYS probabilities"))
# ggtitle(paste0("Sample: ", snakemake@wildcards[["sample"]], " | ASHLEYS probabilities"))

dev.off()
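
raw_map() pairs each value with a well through the Well_position vector, which the script builds column-major (A1, B1, ..., H1, A2, ...); matching it against cells sorted by name assumes the cell naming follows that same order. A minimal Python sketch of the ordering, handy for sanity-checking a labels file outside R (the function name and asserts are illustrative, not part of the pipeline):

import string

def well_positions(plate_type: int) -> list[str]:
    """Column-major well order (A1, B1, ..., H1, A2, ...), mirroring plot_plate.dev.R."""
    if plate_type == 96:
        n_cols, n_rows = 12, 8
    elif plate_type == 384:
        n_cols, n_rows = 24, 16
    else:
        raise ValueError(f"unsupported plate type: {plate_type}")
    return [f"{string.ascii_uppercase[row]}{col}"
            for col in range(1, n_cols + 1)
            for row in range(n_rows)]

assert well_positions(96)[:3] == ["A1", "B1", "C1"]
assert len(well_positions(384)) == 384
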
80 changes: 59 additions & 21 deletions afac/watchdog_ashleys.py
@@ -4,6 +4,9 @@
from watchdog.events import FileSystemEventHandler
from datetime import datetime
import logging
import json
import pandas as pd
import threading


os.makedirs("watchdog/logs", exist_ok=True)
@@ -22,6 +25,17 @@
# Set the path you want to watch
path_to_watch = sys.argv[1]

data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS"
publishdir_location = "/g/korbel/WORKFLOW_RESULTS"
genecore_prefix = path_to_watch
profile_slurm = ["--profile", "workflow/snakemake_profiles/HPC/slurm_EMBL/"]
profile_dry_run = ["--profile", "workflow/snakemake_profiles/local/conda/"]
dry_run_options = ["-c", "1", "-n", "-q"]
snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_latest/bin/snakemake"
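# Moved out of execute_command() to module scope so that both the live event
# handler and the hourly rescan operate on the same configuration.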

# plates_processing_status = pd.read_csv("watchdog/processing_status.json", sep="\t")
# print(plates_processing_status)


# Define the event handler
class MyHandler(FileSystemEventHandler):
@@ -30,6 +44,21 @@ def on_created(self, event):
logging.info(f"Directory {event.src_path} has been created!")
self.process_new_directory(event.src_path)

def check_unprocessed_folder(self):
unwanted = ["._.DS_Store", ".DS_Store", "config"]
list_runs_processed = sorted([e for e in os.listdir(data_location) if e not in unwanted])
total_list_runs = sorted([e for e in os.listdir(path_to_watch) if e not in unwanted])
unprocessed_plates = set(total_list_runs).difference(list_runs_processed)
for plate in unprocessed_plates:
# if plate not in plates_processing_status["plate"].values.tolist():
# plates_processing_status_plate_dict = collections.defaultdict(dict)
nb_txt_gz_files = len(glob.glob(f"{path_to_watch}/{plate}/*.txt.gz"))
if nb_txt_gz_files == 576:
print(f"PROCESSING {path_to_watch}/{plate}")
self.process_new_directory(f"{path_to_watch}/{plate}")
else:
print(f"Not possible to process {path_to_watch}/{plate}, containing {nb_txt_gz_files} txt.gz files")

def process_new_directory(self, directory_path):
"""Process the new directory, check for .txt.gz files and execute snakemake command if conditions are met."""

@@ -86,13 +115,7 @@ def execute_command(self, directory_path, prefix):

# Change directory and run the snakemake command
date_folder = directory_path.split("/")[-1]
data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS"
publishdir_location = "/g/korbel/WORKFLOW_RESULTS"
genecore_prefix = path_to_watch
profile_slurm = ["--profile", "workflow/snakemake_profiles/HPC/slurm_EMBL/"]
profile_dry_run = ["--profile", "workflow/snakemake_profiles/local/conda/"]
dry_run_options = ["-c", "1", "-n", "-q"]
snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_latest/bin/snakemake"

cmd = [
f"{snakemake_binary}",
"--config",
@@ -158,22 +181,37 @@ def run_second_command(self, cmd, profile_slurm, data_location, date_folder):
subprocess.run(["chmod", "-R", "777", f"{data_location}/{date_folder}"])


# Create the event handler
event_handler = MyHandler()
def main():
# Create the event handler
event_handler = MyHandler()

# Create an observer
observer = Observer()

# Create an observer
observer = Observer()
# Assign the observer to the path and the event handler
observer.schedule(event_handler, path_to_watch, recursive=False)

# Assign the observer to the path and the event handler
observer.schedule(event_handler, path_to_watch, recursive=False)
# Start the observer
observer.start()

# Start the periodical directory scanning in a separate thread
def periodic_scan():
while True:
event_handler.check_unprocessed_folder()
time.sleep(3600) # Scan the directory every hour

    scan_thread = threading.Thread(target=periodic_scan, daemon=True)  # daemon thread, so Ctrl-C still stops the process
    scan_thread.start()

try:
while True:
logging.info("Waiting for new plate ...")
time.sleep(3600)
except KeyboardInterrupt:
observer.stop()

# Start the observer
observer.start()
observer.join()

try:
while True:
time.sleep(3)
except KeyboardInterrupt:
observer.stop()

observer.join()
if __name__ == "__main__":
main()
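
The gist of the main() rewrite is pairing a filesystem Observer with a periodic rescan thread, so plates whose creation events were missed are still picked up on the next hourly sweep. Stripped of pipeline specifics, the pattern reduces to the sketch below (the watched path, class name, and handler bodies are placeholders):

import time
import threading
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class PlateHandler(FileSystemEventHandler):
    def on_created(self, event):
        print(f"new entry: {event.src_path}")  # react to live filesystem events

    def rescan(self):
        pass  # compare watched vs. processed directories, as above

handler = PlateHandler()
observer = Observer()
observer.schedule(handler, "/tmp/watched", recursive=False)
observer.start()

def periodic():
    while True:
        handler.rescan()
        time.sleep(3600)  # hourly safety net

threading.Thread(target=periodic, daemon=True).start()  # daemon: Ctrl-C still exits

try:
    while True:
        time.sleep(3600)
except KeyboardInterrupt:
    observer.stop()
observer.join()
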
136 changes: 77 additions & 59 deletions config/config.yaml
@@ -1,59 +1,39 @@
version: 2.1.3
# Option to display all potential options - listed in config_metadata.yaml
list_commands: False
## Data location - MUST BE AN ABSOLUTE PATH (due to snakemake-symlink issues) - PLEASE MODIFY IT
# input_bam_location: ".tests/data_CHR17"
data_location: ".tests/data_CHR17"
# Reference genome used by BWA to map FASTQ files
# reference: sandbox.zenodo.org/record/1074721/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
# Enable / Disable download of external files (1000G SNV & Fasta ref genome)
dl_external_files: False
# Enable / Disable multistep normalisation analysis
multistep_normalisation: False
# Ashleys-qc binary classification threshold
ashleys_threshold: 0.5
# Enable / Disable FastQC analysis
MultiQC: False
# To be informed of pipeline status
# --------------------------------------------------------
# Ashleys-QC pipeline Configuration
# --------------------------------------------------------
version: 2.2.0

# Email for notifications about the pipeline's status
email: ""
# Others
abs_path: "/"
############################################################################
# ADVANCED PARAMETERS
############################################################################

reference: "hg38"
# List of samples to process if multiple are specified
samples_to_process: []

references_data:
"hg38":
reference_fasta: "workflow/data/ref_genomes/hg38.fa"
"hg19":
reference_fasta: "workflow/data/ref_genomes/hg19.fa"
"T2T":
reference_fasta: "workflow/data/ref_genomes/T2T.fa"
"mm10":
reference_fasta: "workflow/data/ref_genomes/mm10.fa"

# Boolean parameters
## Is the pipeline called used as a submodule in mosaicatcher-pipeline?
mosaicatcher_pipeline: False
## Enable/Disable hand selection through Jupyter Notebook
hand_selection: False
# --------------------------------------------------------
# Data location & I/O
# --------------------------------------------------------

# Window size used by mosaic binning algorithm
window: 200000
# Absolute path to the data location (modify as needed)
data_location: ".tests/data_CHR17"

plottype_counts:
- "raw"
- "normalised"
# Directory to publish important data (e.g., stats, plots, counts). Leave empty if not required.
publishdir: ""

# --------------------------------------------------------
# Reference Data Configuration
# --------------------------------------------------------

alfred_plots:
- "dist"
- "devi"
# Reference genome used by BWA to map FASTQ files
reference: "hg38"

plate_orientation: landscape
# Reference genome files' location
references_data:
"hg38": { reference_fasta: "workflow/data/ref_genomes/hg38.fa" }
"T2T": { reference_fasta: "workflow/data/ref_genomes/T2T.fa" }
"hg19": { reference_fasta: "workflow/data/ref_genomes/hg19.fa" }
"mm10": { reference_fasta: "workflow/data/ref_genomes/mm10.fa" }

# Chromosomes list to process
# List of chromosomes to process
chromosomes:
- chr1
- chr2
@@ -80,26 +60,64 @@ chromosomes:
- chrX
- chrY

# Specify any chromosomes to exclude from processing
chromosomes_to_exclude: []

# GENECORE
# --------------------------------------------------------
# Quality Control Configuration
# --------------------------------------------------------

# Threshold for Ashleys-qc binary classification
ashleys_threshold: 0.5

# Enable or disable FastQC/MultiQC analysis
MultiQC: False

# --------------------------------------------------------
# Counts Configuration
# --------------------------------------------------------

# Enable or disable multistep normalization analysis
multistep_normalisation: False

# Advanced parameters for multi-step normalisation
multistep_normalisation_options:
min_reads_bin: 5
n_subsample: 1000
min_reads_cell: 100000

# Window size used by the mosaic binning algorithm
window: 200000

# Enable or disable hand selection through the Jupyter Notebook
hand_selection: False

# --------------------------------------------------------
# GENECORE Configuration
# --------------------------------------------------------

genecore: False
samples_to_process: []
genecore_date_folder: ""
# genecore_prefix: "/g/korbel/shared/genecore"
genecore_prefix: "/g/korbel/STOCKS/Data/Assay/sequencing/2023"
genecore_regex_element: "PE20"

##### DEV only
# --------------------------------------------------------
# Internal Parameters
# --------------------------------------------------------

# Is the pipeline used as a submodule in mosaicatcher-pipeline?
mosaicatcher_pipeline: False

# Overwrite ASHLEYS predictions for GitHub & smoke-test dataset purposes
use_light_data: False

# If specified, will copy important data (stats, plots, counts file) to a second place
publishdir: ""
# For snakemake linting
abs_path: "/"

# Multi-step normalisation advanced parameters
multistep_normalisation_options:
min_reads_bin: 5
n_subsample: 1000
min_reads_cell: 100000
# Type of plots for counts
plottype_counts:
- "raw"
- "normalised"

# Option to display all potential commands (as listed in config_metadata.yaml)
list_commands: False
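
A quick way to sanity-check the reorganized file — for instance, that the selected reference has a FASTA path registered under references_data — is a few lines of Python (assumes PyYAML is installed; the asserts are illustrative):

import yaml  # PyYAML, assumed available

with open("config/config.yaml") as fh:
    cfg = yaml.safe_load(fh)

# The chosen reference name must have an entry in references_data
fasta = cfg["references_data"][cfg["reference"]]["reference_fasta"]
print(f'{cfg["reference"]} -> {fasta}')

assert cfg["version"] == "2.2.0"
assert 0.0 <= cfg["ashleys_threshold"] <= 1.0
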