Commit 1361e67: changes for version 1.3.3

ibn-salem committed Apr 29, 2021
1 parent 882cacc

Showing 5 changed files with 41 additions and 41 deletions.

README.md (24 changes: 9 additions & 15 deletions)

````diff
@@ -49,22 +49,16 @@ Install python modules (we strongly recommend installation via conda):
 conda install -c bioconda pysam=0.15.2 star=2.6.1b star-fusion=1.5.0 bowtie2=2.3.4.3 bx-python=0.8.2 crossmap=0.2.7
 ```
 
-
-- R (>= 3.5.1)
+- R (>= 3.6.0)
 - R packages:
-  - optparse
-  - tidyverse
-  - randomForest
-  - Biostrings
-  - GenomicRanges
-  - BSgenome
-  - bindrcpp
+  - optparse (1.6.4)
+  - tidyverse (1.3.0)
+  - randomForest (4.6-14)
 
 Install packages within R by
 
 ```
-install.packages(c("optparse", "tidyverse", "randomForest", "Biostrings","BiocManager","BSgenome","optparse"))
-BiocManager::install("GenomicRanges") #bioconductor package
+install.packages(c("optparse", "tidyverse", "randomForest"))
 ```
 
 ## Usage
@@ -86,10 +80,10 @@ processing.py \
 
 Before executing the example command
 
-- [ ] rename `build_env.sh.smaple` into `build_env.sh` and configure content.
-- [ ] rename `config.py.smaple` into `config.py` and configure content.
-- [ ] rename `blacklist.txt.sample` into `blacklist.txt`.
+- rename `build_env.sh.smaple` into `build_env.sh` and configure content.
+- rename `config.py.smaple` into `config.py` and configure content.
+- rename `blacklist.txt.sample` into `blacklist.txt`.
 
 ```
 python processing.py -i test_case/SRR1659960_05pc_* -o test_easyfuse_1.3.1/
-```
+```
````

config.py.sample (4 changes: 2 additions & 2 deletions)

```diff
@@ -9,7 +9,7 @@ import os
 # 3) Which reference data shall be used (ref_trans_version & ref_genome_build)
 # 4) To whom shall slurm mails be sent to (receiver)
 
-__version__ = "1.3.2"
+version = "1.3.4"
 
 pipeline_name = "EasyFuse"
 
@@ -162,4 +162,4 @@ other_files = {
     "soapfuse_cfg": "/path/to/soapfuse_config/config_h<release>.txt",
     "soapfuse_cfg_mm10": "/path/to/soapfuse_config/config_m<release>.txt",
     "easyfuse_model": os.path.join(module_dir, "data", "model", "Fusion_modeling_FFPE_deploy_v01.model_full_data.EasyFuse_model.rds")
-}
+}
```

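For orientation, a minimal sketch of the shape `config.py` takes after this change. Field names mirror the `cfg.*` references in the `processing.py` hunks below; all values are placeholders, not the shipped sample defaults:

```python
# config.py (sketch): placeholder values, not the shipped sample defaults.
import os

version = "1.3.4"
pipeline_name = "EasyFuse"

# scheduling fields read as cfg.queueing_system, cfg.partition, cfg.user, cfg.time_limit
queueing_system = "slurm"  # "slurm", "pbs", or anything else for local runs
partition = "allNodes"
user = "jdoe"
time_limit = "30-00:00:0"

module_dir = os.path.dirname(os.path.realpath(__file__))

other_files = {
    "infusion_cfg": "/path/to/infusion_config/infusion.cfg",
    "easyfuse_model": os.path.join(
        module_dir, "data", "model",
        "Fusion_modeling_FFPE_deploy_v01.model_full_data.EasyFuse_model.rds"),
}
```
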
misc/queueing.py (9 changes: 6 additions & 3 deletions)

```diff
@@ -23,6 +23,8 @@ def get_jobs_by_name(name, system="slurm"):
         return get_jobs_by_name_slurm(name)
     elif system == "pbs":
         return get_jobs_by_name_pbs(name)
+    else:
+        return []
 
 def get_jobs_by_name_pbs(name):
     jobs = []
@@ -70,12 +72,13 @@ def submit(job_name, cmd, cores, mem_usage, output_results_folder, dependencies,
     elif sched == "pbs":
         _submit_pbs(job_name, cmd, cores, mem_usage, output_results_folder, dependencies, module_file)
     else:
-        _submit_nonqueue(cmd, module_file)
+        _submit_nonqueue(job_name, cmd, module_file)
 
-def _submit_nonqueue(cmd, module_file=""):
+def _submit_nonqueue(job_name, cmd, module_file=""):
 #    if module_file:
 #        cmd = " && ".join(["source " + module_file, " ".join(cmd)]).split(" ")
 #    print(cmd)
+    print("Running {}".format(job_name))
     print("CMD: {}".format(cmd))
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
     (stdoutdata, stderrdata) = p.communicate()
     print(stdoutdata)
```

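The new `else` branch is not cosmetic: callers do `dependency.extend(Queueing.get_jobs_by_name(...))`, and `list.extend(None)` raises a `TypeError`, so the old fall-through (which implicitly returned `None`) broke any run without a slurm/pbs scheduler. A minimal sketch of the fixed dispatch, with the scheduler-specific lookups stubbed out:

```python
def get_jobs_by_name_slurm(name):
    return []  # stub: the real function queries slurm for jobs matching `name`

def get_jobs_by_name_pbs(name):
    return []  # stub: the real function queries pbs for jobs matching `name`

def get_jobs_by_name(name, system="slurm"):
    if system == "slurm":
        return get_jobs_by_name_slurm(name)
    elif system == "pbs":
        return get_jobs_by_name_pbs(name)
    else:
        return []  # local execution: nothing is queued, hence no dependencies

# Before the fix, this raised "TypeError: 'NoneType' object is not iterable"
# whenever the queueing system was neither slurm nor pbs:
dependency = []
dependency.extend(get_jobs_by_name("Fetchdata-sample1", system="none"))
print(dependency)  # []
```
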
processing.py (43 changes: 24 additions & 19 deletions)

```diff
@@ -77,7 +77,7 @@ def run(self, tool_num_cutoff):
         # urla - note: would be happy to get the dependencies with a stacked LC, but is atm to complicated for me ^^
         dependency = []
         for sample in sample_list:
-            dependency.extend(Queueing.get_jobs_by_name("Fetchdata-{}".format(sample)))
+            dependency.extend(Queueing.get_jobs_by_name("Fetchdata-{}".format(sample), cfg.queueing_system))
         modelling_string = ""
         if cfg.other_files["easyfuse_model"]:
             modelling_string = " --model_predictions"
@@ -109,6 +109,7 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff):
         # kallisto_index_path = indices["kallisto"]
         # pizzly_cache_path = "{}.pizzlyCache.txt".format(genes_gtf_path)
         starfusion_index_path = indices["starfusion"]
+        fusioncatcher_index_path = indices["fusioncatcher"]
         infusion_cfg_path = other_files["infusion_cfg"]
         # starchip_param_path = other_files["starchip_param"]
 
@@ -133,6 +134,9 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff):
         infusion_path = os.path.join(fusion_path, "infusion")
         soapfuse_path = os.path.join(fusion_path, "soapfuse")
         fetchdata_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id), "fetchdata")
+        fastqc_1 = os.path.join(qc_path, sample_id + "_R1_fastqc", "fastqc_data.txt")
+        fastqc_2 = os.path.join(qc_path, sample_id + "_R2_fastqc", "fastqc_data.txt")
+
 
         for folder in [
             output_results_path,
@@ -163,9 +167,9 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff):
         # Define cmd strings for each program
         # urla: mapsplice requires gunzip'd read files and process substitutions don't seem to work in slurm scripts...
         #       process substitution do somehow not work from this script - c/p the command line to the terminal, however, works w/o issues?!
-        cmd_fastqc = "{} --nogroup --extract -t 6 -o {} {} {}".format(cmds["fastqc"], qc_path, fq1, fq2)
-        cmd_qc_parser = "{} -i {}/*/fastqc_data.txt -o {}".format(os.path.join(module_dir, "misc", "qc_parser.py"), qc_path, qc_table_path)
-        cmd_skewer = "{} -q {} -i {} {} -o {}".format(os.path.join(module_dir, "tool_wrapper", "skewer_wrapper.py"), qc_table_path, fq1, fq2, skewer_path)
+        cmd_fastqc = "{0} --nogroup --extract -t 6 -o {1} {2} {3}".format(cmds["fastqc"], qc_path, fq1, fq2)
+        cmd_qc_parser = "{0} -i {1} {2} -o {3}".format(os.path.join(module_dir, "misc", "qc_parser.py"), fastqc_1, fastqc_2, qc_table_path)
+        cmd_skewer = "{0} -q {1} -i {2} {3} -o {4}".format(os.path.join(module_dir, "tool_wrapper", "skewer_wrapper.py"), qc_table_path, fq1, fq2, skewer_path)
 
         fq0 = ""
         if "QC" in tools:
@@ -192,12 +196,12 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff):
         cmd_star = "{0} --genomeDir {1} --outFileNamePrefix waiting_for_output_string --runThreadN waiting_for_cpu_number --runMode alignReads --readFilesIn {2} {3} --readFilesCommand zcat --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {4} --alignIntronMax {4} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM SortedByCoordinate --chimOutType Junctions SeparateSAMold --chimOutJunctionFormat 1".format(cmds["star"], star_index_path, fq1, fq2, cfg.max_dist_proper_pair)
         # (3) Mapslice
         # urla: the "keep" parameter requires gunzip >= 1.6
-        cmd_extr_fastq1 = "gunzip {0} --keep".format(fq1)
-        cmd_extr_fastq2 = "gunzip {0} --keep".format(fq2)
+        cmd_extr_fastq1 = "gunzip --keep {0}".format(fq1)
+        cmd_extr_fastq2 = "gunzip --keep {0}".format(fq2)
         # Added python interpreter to circumvent external hardcoded shell script
         cmd_mapsplice = "python {0} --chromosome-dir {1} -x {2} -1 {3} -2 {4} --threads waiting_for_cpu_number --output {5} --qual-scale phred33 --bam --seglen 20 --min-map-len 40 --gene-gtf {6} --fusion".format(cmds["mapsplice"], genome_chrs_path, bowtie_index_path, fq1[:-3], fq2[:-3], mapsplice_path, genes_gtf_path)
         # (4) Fusiocatcher
-        cmd_fusioncatcher = "{0} --input {1} --output {2} -p waiting_for_cpu_number".format(cmds["fusioncatcher"], ",".join([fq1, fq2]), fusioncatcher_path)
+        cmd_fusioncatcher = "{0} --input {1} --data {2} --output {3} -p waiting_for_cpu_number".format(cmds["fusioncatcher"], ",".join([fq1, fq2]), fusioncatcher_index_path, fusioncatcher_path)
         # star-fusion and star-chip can be run upon a previous star run (this MUST NOT be the star_filter run, but the star_expression run)
         # (5)
         cmd_starfusion = "{0} --chimeric_junction {1} --genome_lib_dir {2} --CPU waiting_for_cpu_number --output_dir {3}".format(cmds["starfusion"], "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starfusion_index_path, starfusion_path)
```

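The `cmd_*` templates above embed the literal placeholders `waiting_for_output_string` and `waiting_for_cpu_number` because output paths and CPU counts are only fixed at submission time; the loop in the next hunk substitutes them per tool. A self-contained sketch of that convention (the tool command, path, and values are hypothetical):

```python
# Sketch of the placeholder substitution used for the cmd_* templates.
cmd_some_tool = ("some_fusion_tool --threads waiting_for_cpu_number "
                 "--output waiting_for_output_string "
                 "--input sample1_R1.fastq.gz sample1_R2.fastq.gz")

exe_path = "/path/to/Sample_sample1/fusion/some_tool"  # hypothetical per-tool dir
cpu = 6

cmd = (cmd_some_tool
       .replace("waiting_for_output_string", exe_path)
       .replace("waiting_for_cpu_number", str(cpu)))
print(cmd)
```
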
processing.py (continued):

```diff
@@ -317,36 +321,37 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff):
                 exe_cmds[i] = exe_cmds[i].replace("waiting_for_output_string", exe_path[i]).replace("waiting_for_cpu_number", str(cpu))
                 cmd = " && ".join([exe_cmds[i], cmd_samples + tool])
                 # Managing slurm dependencies
+                que_sys = cfg.queueing_system
                 if tool == "Pizzly":
-                    dependency = Queueing.get_jobs_by_name("Kallisto-{0}".format(sample_id))
+                    dependency = Queueing.get_jobs_by_name("Kallisto-{0}".format(sample_id), que_sys)
                 elif tool == "Starfusion" or tool == "Starchip":
-                    dependency = Queueing.get_jobs_by_name("Star-{0}".format(sample_id))
+                    dependency = Queueing.get_jobs_by_name("Star-{0}".format(sample_id), que_sys)
                 elif tool == "Fetchdata":
-                    dependency = Queueing.get_jobs_by_name(sample_id)
+                    dependency = Queueing.get_jobs_by_name(sample_id, que_sys)
                 elif tool == "Assembly":
-                    dependency = Queueing.get_jobs_by_name("Fetchdata-{0}".format(sample_id))
+                    dependency = Queueing.get_jobs_by_name("Fetchdata-{0}".format(sample_id), que_sys)
                 elif tool == "ReadFilter":
-                    dependency = Queueing.get_jobs_by_name("QC-{0}".format(sample_id))
+                    dependency = Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys)
                 # else:
-                dependency.extend(Queueing.get_jobs_by_name("Readfilter-{0}".format(sample_id)))
-                dependency.extend(Queueing.get_jobs_by_name("QC-{0}".format(sample_id)))
+                dependency.extend(Queueing.get_jobs_by_name("Readfilter-{0}".format(sample_id), que_sys))
+                dependency.extend(Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys))
                 self.logger.debug("Submitting slurm job: CMD - {0}; PATH - {1}; DEPS - {2}".format(cmd, exe_path[i], dependency))
                 self.submit_job(uid, cmd, cpu, mem, exe_path[i], dependency, "")
             else:
                 self.logger.info("Skipping {0} as it is not selected for execution (Selected are: {1})".format(tool, tools))
 
     def submit_job(self, uid, cmd, cores, mem_usage, output_results_folder, dependencies, mail):
         """Submit job to slurm scheduling"""
-        already_running = Queueing.get_jobs_by_name(uid)
+        que_sys = cfg.queueing_system
+        already_running = Queueing.get_jobs_by_name(uid, que_sys)
         if not already_running:
             # urla: for compatibility reasons (and to be independent of shell commands), concatenated commands are splitted again,
             #       dependencies within the splitted groups updated and everything submitted sequentially to the queueing system
             module_file = os.path.join(cfg.module_dir, "build_env.sh")
-            que_sys = cfg.queueing_system
 
             for i, cmd_split in enumerate(cmd.split(" && ")):
                 if not que_sys in ["slurm", "pbs"]:
                     cmd_split = cmd_split.split(" ")
-                dependencies.extend(Queueing.get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1)))
+                dependencies.extend(Queueing.get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1), que_sys))
                 Queueing.submit("{0}_CMD{1}".format(uid, i), cmd_split, cores, mem_usage, output_results_folder, dependencies, cfg.partition, cfg.user, cfg.time_limit, mail, module_file, que_sys)
                 time.sleep(0.5)
         else:
@@ -365,7 +370,7 @@ def main():
 
     # if version is request, print it and exit
     if args.version:
-        print(cfg.version)
+        print(cfg.__version__)
         sys.exit(0)
 
     script_call = "python {} -i {} -o {}".format(os.path.realpath(__file__), " ".join([os.path.abspath(x) for x in args.input_paths]), os.path.abspath(args.output_folder))
```

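The urla comment in `submit_job` describes the mechanism: a command string that was glued together with `" && "` is split back apart, and each piece is submitted as job `{uid}_CMD{i}` depending on `{uid}_CMD{i-1}`, so the pieces still run sequentially on a scheduler. A standalone sketch of that chaining; `submit_one` and `get_jobs_by_name` are hypothetical stand-ins for `Queueing.submit` and `Queueing.get_jobs_by_name`:

```python
def submit_chained(uid, cmd, que_sys, submit_one, get_jobs_by_name):
    """Split an ' && '-joined command and submit each part as its own job,
    where part i waits for part i-1 (sketch of the submit_job loop above)."""
    dependencies = []
    for i, cmd_split in enumerate(cmd.split(" && ")):
        if que_sys not in ["slurm", "pbs"]:
            cmd_split = cmd_split.split(" ")  # local runs need an argv list
        # pick up the previous piece of this chain as a dependency
        dependencies.extend(get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1), que_sys))
        submit_one("{0}_CMD{1}".format(uid, i), cmd_split, dependencies)

# Hypothetical stand-ins to show the call pattern:
submit_chained(
    "QC-sample1",
    "fastqc reads_R1.fastq.gz && skewer -q table.csv reads_R1.fastq.gz",
    "none",
    submit_one=lambda name, cmd, deps: print("submit", name, cmd, deps),
    get_jobs_by_name=lambda name, sys: [],
)
```
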
summarize_data.py (2 changes: 0 additions & 2 deletions)

```diff
@@ -11,8 +11,6 @@
 import time
 import argparse
 
-import pandas as pd
-import seaborn as sns
 from join_data import DataJoining
 from misc.samples import SamplesDB
 import misc.io_methods as IOMethods
```
