
Merge pull request #38 from friendsofstrandseq/dev
Version 2.2.0
weber8thomas authored Aug 9, 2023
2 parents fd8f3f3 + 83de295 commit caba655
Showing 11 changed files with 385 additions and 207 deletions.
11 changes: 11 additions & 0 deletions afac/plot_plate.R.ipynb
@@ -0,0 +1,11 @@
{
"cells": [],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
66 changes: 66 additions & 0 deletions afac/plot_plate.dev.R
@@ -0,0 +1,66 @@
library(platetools)
library(ggplot2)
library(viridis)
library(dplyr)

## collect ASHLEYS prediction and count files
ashleys_data <- read.table(file = "/scratch/tweber/DATA/TMP/labels384.tsv", sep = "\t", header = TRUE)
plate_type <- nrow(ashleys_data) # infer plate size (96 or 384) from the number of cells
# ashleys_data = read.table(file = snakemake@input[["labels"]] , sep = '\t', header = TRUE)
ashleys_data <- dplyr::arrange(ashleys_data, cell)
colnames(ashleys_data)[1] <- "ashleys_id"

# plate_type <- snakemake@params[["plate_type"]] # assuming that the plate type (96 or 384) is passed in as a parameter

Well_position <- character()

if (plate_type == 96) {
for (i in 1:12)
{
for (j in 1:8)
{
tmp <- paste0(LETTERS[j], i)
Well_position <- c(Well_position, tmp)
}
}
} else if (plate_type == 384) {
for (i in 1:24)
{
for (j in 1:16)
{
tmp <- paste0(LETTERS[j], i)
Well_position <- c(Well_position, tmp)
}
}
}

pdf("TEST_ashleys_plate_predictions.pdf")
# pdf(snakemake@output[["predictions"]])

ashleys_data$Well_position <- Well_position

raw_map(
data = ashleys_data$prediction,
well = ashleys_data$Well_position,
plate = plate_type
) +
scale_fill_distiller(type = "div", palette = "RdYlGn", direction = 1) +
ggtitle(paste0("Sample: TEST | ASHLEYS binary predictions (cutoff=0.5)"))
# ggtitle(paste0("Sample: ", snakemake@wildcards[["sample"]], " | ASHLEYS binary predictions (cutoff=", snakemake@config[["ashleys_threshold"]], ")"))

dev.off()

pdf("TEST_ashleys_plate_probabilities.pdf")
# pdf(snakemake@output[["probabilities"]])
ashleys_data$Well_position <- Well_position

raw_map(
data = ashleys_data$probability,
well = ashleys_data$Well_position,
plate = plate_type
) +
scale_fill_distiller(type = "div", palette = "RdYlGn", direction = 1) +
ggtitle(paste0("Sample: ", "TEST", " | ASHLEYS probabilities"))
# ggtitle(paste0("Sample: ", snakemake@wildcards[["sample"]], " | ASHLEYS probabilities"))

dev.off()
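
raw_map() pairs each value with a well through the Well_position vector, which the script builds column-major (A1, B1, ..., H1, A2, ...); matching it against cells sorted by name assumes the cell naming follows that same order. A minimal Python sketch of the ordering, handy for sanity-checking a labels file outside R (the function name and asserts are illustrative, not part of the pipeline):

import string

def well_positions(plate_type: int) -> list[str]:
    """Column-major well order (A1, B1, ..., H1, A2, ...), mirroring plot_plate.dev.R."""
    if plate_type == 96:
        n_cols, n_rows = 12, 8
    elif plate_type == 384:
        n_cols, n_rows = 24, 16
    else:
        raise ValueError(f"unsupported plate type: {plate_type}")
    return [f"{string.ascii_uppercase[row]}{col}"
            for col in range(1, n_cols + 1)
            for row in range(n_rows)]

assert well_positions(96)[:3] == ["A1", "B1", "C1"]
assert len(well_positions(384)) == 384
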
80 changes: 59 additions & 21 deletions afac/watchdog_ashleys.py
@@ -4,6 +4,9 @@
from watchdog.events import FileSystemEventHandler
from datetime import datetime
import logging
import json
import pandas as pd
import threading


os.makedirs("watchdog/logs", exist_ok=True)
@@ -22,6 +25,17 @@
# Set the path you want to watch
path_to_watch = sys.argv[1]

data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS"
publishdir_location = "/g/korbel/WORKFLOW_RESULTS"
genecore_prefix = path_to_watch
profile_slurm = ["--profile", "workflow/snakemake_profiles/HPC/slurm_EMBL/"]
profile_dry_run = ["--profile", "workflow/snakemake_profiles/local/conda/"]
dry_run_options = ["-c", "1", "-n", "-q"]
snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_latest/bin/snakemake"
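# Moved out of execute_command() to module scope so that both the live event
# handler and the hourly rescan operate on the same configuration.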

# plates_processing_status = pd.read_csv("watchdog/processing_status.json", sep="\t")
# print(plates_processing_status)


# Define the event handler
class MyHandler(FileSystemEventHandler):
@@ -30,6 +44,21 @@ def on_created(self, event):
logging.info(f"Directory {event.src_path} has been created!")
self.process_new_directory(event.src_path)

def check_unprocessed_folder(self):
unwanted = ["._.DS_Store", ".DS_Store", "config"]
list_runs_processed = sorted([e for e in os.listdir(data_location) if e not in unwanted])
total_list_runs = sorted([e for e in os.listdir(path_to_watch) if e not in unwanted])
unprocessed_plates = set(total_list_runs).difference(list_runs_processed)
for plate in unprocessed_plates:
# if plate not in plates_processing_status["plate"].values.tolist():
# plates_processing_status_plate_dict = collections.defaultdict(dict)
nb_txt_gz_files = len(glob.glob(f"{path_to_watch}/{plate}/*.txt.gz"))
if nb_txt_gz_files == 576:
print(f"PROCESSING {path_to_watch}/{plate}")
self.process_new_directory(f"{path_to_watch}/{plate}")
else:
print(f"Not possible to process {path_to_watch}/{plate}, containing {nb_txt_gz_files} txt.gz files")

def process_new_directory(self, directory_path):
"""Process the new directory, check for .txt.gz files and execute snakemake command if conditions are met."""

@@ -86,13 +115,7 @@ def execute_command(self, directory_path, prefix):

# Change directory and run the snakemake command
date_folder = directory_path.split("/")[-1]
data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS"
publishdir_location = "/g/korbel/WORKFLOW_RESULTS"
genecore_prefix = path_to_watch
profile_slurm = ["--profile", "workflow/snakemake_profiles/HPC/slurm_EMBL/"]
profile_dry_run = ["--profile", "workflow/snakemake_profiles/local/conda/"]
dry_run_options = ["-c", "1", "-n", "-q"]
snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_latest/bin/snakemake"

cmd = [
f"{snakemake_binary}",
"--config",
@@ -158,22 +181,37 @@ def run_second_command(self, cmd, profile_slurm, data_location, date_folder):
subprocess.run(["chmod", "-R", "777", f"{data_location}/{date_folder}"])


# Create the event handler
event_handler = MyHandler()
def main():
# Create the event handler
event_handler = MyHandler()

# Create an observer
observer = Observer()

# Create an observer
observer = Observer()
# Assign the observer to the path and the event handler
observer.schedule(event_handler, path_to_watch, recursive=False)

# Assign the observer to the path and the event handler
observer.schedule(event_handler, path_to_watch, recursive=False)
# Start the observer
observer.start()

# Start the periodical directory scanning in a separate thread
def periodic_scan():
while True:
event_handler.check_unprocessed_folder()
time.sleep(3600) # Scan the directory every hour

    scan_thread = threading.Thread(target=periodic_scan, daemon=True)  # daemon thread, so Ctrl-C still stops the process
    scan_thread.start()

try:
while True:
logging.info("Waiting for new plate ...")
time.sleep(3600)
except KeyboardInterrupt:
observer.stop()

# Start the observer
observer.start()
observer.join()

try:
while True:
time.sleep(3)
except KeyboardInterrupt:
observer.stop()

observer.join()
if __name__ == "__main__":
main()
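
The gist of the main() rewrite is pairing a filesystem Observer with a periodic rescan thread, so plates whose creation events were missed are still picked up on the next hourly sweep. Stripped of pipeline specifics, the pattern reduces to the sketch below (the watched path, class name, and handler bodies are placeholders):

import time
import threading
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class PlateHandler(FileSystemEventHandler):
    def on_created(self, event):
        print(f"new entry: {event.src_path}")  # react to live filesystem events

    def rescan(self):
        pass  # compare watched vs. processed directories, as above

handler = PlateHandler()
observer = Observer()
observer.schedule(handler, "/tmp/watched", recursive=False)
observer.start()

def periodic():
    while True:
        handler.rescan()
        time.sleep(3600)  # hourly safety net

threading.Thread(target=periodic, daemon=True).start()  # daemon: Ctrl-C still exits

try:
    while True:
        time.sleep(3600)
except KeyboardInterrupt:
    observer.stop()
observer.join()
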
136 changes: 77 additions & 59 deletions config/config.yaml
@@ -1,59 +1,39 @@
version: 2.1.3
# Option to display all potential options - listed in config_metadata.yaml
list_commands: False
## Data location - MUST BE AN ABSOLUTE PATH (due to snakemake-symlink issues) - PLEASE MODIFY IT
# input_bam_location: ".tests/data_CHR17"
data_location: ".tests/data_CHR17"
# Reference genome used by BWA to map FASTQ files
# reference: sandbox.zenodo.org/record/1074721/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
# Enable / Disable download of external files (1000G SNV & Fasta ref genome)
dl_external_files: False
# Enable / Disable multistep normalisation analysis
multistep_normalisation: False
# Ashleys-qc binary classification threshold
ashleys_threshold: 0.5
# Enable / Disable FastQC analysis
MultiQC: False
# To be informed of pipeline status
# --------------------------------------------------------
# Ashleys-QC pipeline Configuration
# --------------------------------------------------------
version: 2.2.0

# Email for notifications about the pipeline's status
email: ""
# Others
abs_path: "/"
############################################################################
# ADVANCED PARAMETERS
############################################################################

reference: "hg38"
# List of samples to process if multiple are specified
samples_to_process: []

references_data:
"hg38":
reference_fasta: "workflow/data/ref_genomes/hg38.fa"
"hg19":
reference_fasta: "workflow/data/ref_genomes/hg19.fa"
"T2T":
reference_fasta: "workflow/data/ref_genomes/T2T.fa"
"mm10":
reference_fasta: "workflow/data/ref_genomes/mm10.fa"

# Boolean parameters
## Is the pipeline called used as a submodule in mosaicatcher-pipeline?
mosaicatcher_pipeline: False
## Enable/Disable hand selection through Jupyter Notebook
hand_selection: False
# --------------------------------------------------------
# Data location & I/O
# --------------------------------------------------------

# Window size used by mosaic binning algorithm
window: 200000
# Absolute path to the data location (modify as needed)
data_location: ".tests/data_CHR17"

plottype_counts:
- "raw"
- "normalised"
# Directory to publish important data (e.g., stats, plots, counts). Leave empty if not required.
publishdir: ""

# --------------------------------------------------------
# Reference Data Configuration
# --------------------------------------------------------

alfred_plots:
- "dist"
- "devi"
# Reference genome used by BWA to map FASTQ files
reference: "hg38"

plate_orientation: landscape
# Reference genome files' location
references_data:
"hg38": { reference_fasta: "workflow/data/ref_genomes/hg38.fa" }
"T2T": { reference_fasta: "workflow/data/ref_genomes/T2T.fa" }
"hg19": { reference_fasta: "workflow/data/ref_genomes/hg19.fa" }
"mm10": { reference_fasta: "workflow/data/ref_genomes/mm10.fa" }

# Chromosomes list to process
# List of chromosomes to process
chromosomes:
- chr1
- chr2
@@ -80,26 +60,64 @@ chromosomes:
- chrX
- chrY

# Specify any chromosomes to exclude from processing
chromosomes_to_exclude: []

# GENECORE
# --------------------------------------------------------
# Quality Control Configuration
# --------------------------------------------------------

# Threshold for Ashleys-qc binary classification
ashleys_threshold: 0.5

# Enable or disable FastQC/MultiQC analysis
MultiQC: False

# --------------------------------------------------------
# Counts Configuration
# --------------------------------------------------------

# Enable or disable multistep normalization analysis
multistep_normalisation: False

# Advanced parameters for multi-step normalisation
multistep_normalisation_options:
min_reads_bin: 5
n_subsample: 1000
min_reads_cell: 100000

# Window size used by the mosaic binning algorithm
window: 200000

# Enable or disable hand selection through the Jupyter Notebook
hand_selection: False

# --------------------------------------------------------
# GENECORE Configuration
# --------------------------------------------------------

genecore: False
samples_to_process: []
genecore_date_folder: ""
# genecore_prefix: "/g/korbel/shared/genecore"
genecore_prefix: "/g/korbel/STOCKS/Data/Assay/sequencing/2023"
genecore_regex_element: "PE20"

##### DEV only
# --------------------------------------------------------
# Internal Parameters
# --------------------------------------------------------

# Is the pipeline used as a submodule in mosaicatcher-pipeline?
mosaicatcher_pipeline: False

# Overwrite ASHLEYS predictions for GitHub & smoke-test dataset purposes
use_light_data: False

# If specified, will copy important data (stats, plots, counts file) to a second place
publishdir: ""
# For snakemake linting
abs_path: "/"

# Multi-step normalisation advanced parameters
multistep_normalisation_options:
min_reads_bin: 5
n_subsample: 1000
min_reads_cell: 100000
# Type of plots for counts
plottype_counts:
- "raw"
- "normalised"

# Option to display all potential commands (as listed in config_metadata.yaml)
list_commands: False
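
A quick way to sanity-check the reorganized file — for instance, that the selected reference has a FASTA path registered under references_data — is a few lines of Python (assumes PyYAML is installed; the asserts are illustrative):

import yaml  # PyYAML, assumed available

with open("config/config.yaml") as fh:
    cfg = yaml.safe_load(fh)

# The chosen reference name must have an entry in references_data
fasta = cfg["references_data"][cfg["reference"]]["reference_fasta"]
print(f'{cfg["reference"]} -> {fasta}')

assert cfg["version"] == "2.2.0"
assert 0.0 <= cfg["ashleys_threshold"] <= 1.0
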