diff --git a/config.py.sample b/config.py.sample
new file mode 100644
index 0000000..224b1fc
--- /dev/null
+++ b/config.py.sample
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+
+import getpass
+import os
+
+# Required user input:
+# 1) Which fusion prediction tools should be executed (tools)
+# 2) Which post-processing steps should be executed (fd_tools)
+# 3) Which reference data shall be used (ref_trans_version & ref_genome_build)
+# 4) To whom slurm mails shall be sent (receiver)
+
+__version__ = "1.3.1"
+
+pipeline_name = "EasyFuse"
+
+#tools=Readfilter,Mapsplice,Fusioncatcher,Star,Starfusion,Infusion,Fetchdata,Summary
+tools = ("QC",
+         "Readfilter",
+         "Fusioncatcher",
+         "Star",
+         "Starfusion",
+         "Infusion",
+         "Mapsplice",
+         "Soapfuse",
+         "Fetchdata",
+         "Summary")
+
+fusiontools = ("Fusioncatcher",
+               "Starfusion",
+               "Infusion",
+               "Mapsplice",
+               "Soapfuse")
+
+#fd_tools=Fusiongrep,Contextseq,Starindex,Staralign,Bamindex,Requantify
+fd_tools = ("Fusiongrep",
+            "Contextseq",
+            "Starindex",
+            "ReadFilter2",
+            "ReadFilter2b",
+            "StaralignBest",
+            "BamindexBest",
+            "RequantifyBest")
+
+sender = "sender@mail.com"
+receiver = "receiver@mail.com"
+min_read_len_perc = 0.75
+max_dist_proper_pair = 200000
+cis_near_distance = 1000000
+model_pred_threshold = 0.75
+tsl_filter = "4,5,NA"
+requant_mode = ["best"]
+context_seq_len = 400
+ref_genome_build = "hg38"
+ref_trans_version = "ensembl"
+queueing_system = "slurm"
+time_limit = "30-00:00:0"
+partition = "Compute"
+user = getpass.getuser()
+module_dir = os.path.dirname(os.path.realpath(__file__))
+#logfile=/data/urla_progs/TronData/ngs_pipelines/easyfuse/fusion.log
+#fusion_db=/data/urla_progs/TronData/ngs_pipelines/easyfuse/fusion.db
+
+# Define resource usage (cpu: number of threads, mem: RAM in GB):
+resources = {
+    "qc": {
+        "cpu": 6,
+        "mem": 10
+    },
+    "readfilter": {
+        "cpu": 6,
+        "mem": 50
+    },
+    "star": {
+        "cpu": 6,
+        "mem": 40
+    },
+    "kallisto": {
+        "cpu": 6,
+        "mem": 10
+    },
+    "mapsplice": {
+        "cpu": 6,
+        "mem": 30
+    },
+    "fusioncatcher": {
+        "cpu": 6,
+        "mem": 30
+    },
+    "starfusion": {
+        "cpu": 6,
+        "mem": 30
+    },
+    "starchip": {
+        "cpu": 12,
+        "mem": 30
+    },
+    "infusion": {
+        "cpu": 6,
+        "mem": 30
+    },
+    "soapfuse": {
+        "cpu": 6,
+        "mem": 20
+    },
+    "classification": {
+        "cpu": 1,
+        "mem": 16
+    },
+    "fetchdata": {
+        "cpu": 12,
+        "mem": 50
+    },
+    "summary": {
+        "cpu": 1,
+        "mem": 16
+    }
+}
+
+# execution command for individual programs (what you write here should be identical to what is typed in the console)
+commands = {
+    # for qc
+    "fastqc": "/path/to/fastqc_bin",
+    "skewer": "/path/to/skewer_bin",
+    # for processing
+    "mapsplice": "/path/to/mapsplice.py",
+    "fusioncatcher": "/path/to/fusioncatcher_bin",
+    "starfusion": "/path/to/starfusion_bin",
+    "infusion": "/path/to/infusion_bin",
+    "soapfuse": "/path/to/SOAPfuse-RUN.pl",
+    # for processing and fetch data
+    "star": "/path/to/star_bin",
+    "samtools": "/path/to/samtools_bin"
+    # for liftover
+    #"crossmap": "test"
+}
+
+# full path to reference files
+references = {
+    # wget http://ftp.ensembl.org/pub/release-86/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
+    "genome_fasta": "/path/to/Homo_sapiens.GRCh38.dna.primary_assembly.fa",
+    "genome_fastadir": "/projects/data/human/ensembl/GRCh38.86/fasta",
+    "genome_sizes": "/projects/data/human/ensembl/GRCh38.86/STAR_idx/chrNameLength.txt",
+    # wget http://ftp.ensembl.org/pub/release-86/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
+    "genes_fasta":
"/projects/data/human/ensembl/GRCh38.86/Homo_sapiens.GRCh38.cdna.all.fa", + # wget http://ftp.ensembl.org/pub/release-86/gtf/homo_sapiens/Homo_sapiens.GRCh38.86.chr.gtf.gz + "genes_gtf": "/path/to/Homo_sapiens.GRCh38.86.gtf", + "genes_adb": "/projects/data/human/ensembl/GRCh38.86/Homo_sapiens.GRCh38.86.gff3.db", + "genes_tsl": "/projects/data/human/ensembl/GRCh38.86/Homo_sapiens.GRCh38.86.gtf.tsl" +} + +# full path to program indices +indices = { + "star": "/projects/data/human/ensembl/GRCh38.86/STAR_idx/", + "bowtie": "/projects/data/human/ensembl/GRCh38.86/bowtie_index/hg38", + "starfusion": "/projects/data/human/ensembl/GRCh38.86/starfusion_index/", + "fusioncatcher": "/projects/data/human/ensembl/GRCh38.86/fusioncatcher_index/" +} + +# full path to program specific config files (these are just general files which need no user modification) +other_files = { + "infusion_cfg": "/projects/data/human/ensembl/GRCh38.86/infusion_index/infusion.cfg", + "soapfuse_cfg": "/code/SOAPfuse/1.27/config/config_h86.txt", + "soapfuse_cfg_mm10": "/code/SOAPfuse/1.27/config/config_m95.txt", + "easyfuse_model": os.path.join(module_dir, "data", "model", "Fusion_modeling_IVAC_BNT_v16.model.requant_and_boundary_org.randomForest.rds") +} diff --git a/fetchdata.py b/fetchdata.py index 547a36a..e81c696 100755 --- a/fetchdata.py +++ b/fetchdata.py @@ -10,32 +10,29 @@ @version: 20190118 """ -from __future__ import print_function import sys import os import os.path import math import argparse -import misc.queue as Queueing -from misc.config import Config +import misc.queueing as Queueing from misc.samples import SamplesDB from misc.logger import Logger import misc.io_methods as IOMethods +import config as cfg # pylint: disable=line-too-long # yes they are partially, but I do not consider this to be relevant here class Fetching(object): """Run, monitor and schedule fastq processing for fusion gene prediction""" - def __init__(self, scratch_path, fetchdata_path, sample_id, config): + def __init__(self, scratch_path, fetchdata_path, sample_id): """Parameter initiation and work folder creation.""" - self.cfg = config self.scratch_path = scratch_path self.fetchdata_path = fetchdata_path self.sample_id = sample_id #self.tools = Samples(os.path.join(scratch_path, os.path.pardir, os.path.pardir, "samples.csv")).get_tool_list_from_state(self.sample_id) - self.samples = SamplesDB(os.path.join(scratch_path, os.path.pardir, os.path.pardir, "samples.db")) + self.samples = SamplesDB(os.path.join(scratch_path, os.path.pardir, "samples.db")) self.logger = Logger(os.path.join(self.fetchdata_path, "fetchdata.log")) - self.easyfuse_path = os.path.dirname(os.path.realpath(__file__)) def get_pseudo_genome_adjustments_for_star(self, num_len_file): # wrong pylint error due to long name => pylint: disable=C0103 """Return the genome size of an associated fasta file calculated by urla_GetFusionSequence_latest.R""" @@ -54,12 +51,20 @@ def get_pseudo_genome_adjustments_for_star(self, num_len_file): # wrong pylint e def get_input_read_count_from_star(star_out_bam): """Parses a star output log file to get input read counts from the fastq origin""" log_file = "{}Log.final.out".format(star_out_bam.rstrip("Aligned.out.bam")) + if not os.path.exists(log_file): + return -1 with open(log_file, "r") as star_log: for line in star_log: if line.split("|")[0].strip() == "Number of input reads": return int(line.split("|")[1].strip()) return -1 + @staticmethod + def get_input_read_count_from_fastq(fastq): + """Parses input FASTQ to get read count""" + 
+        result = subprocess.getoutput("zcat {} | wc -l".format(fastq))
+        return int(result) // 4
+
     def run(self, fusion_support, fq1, fq2):
         """Identification of fastq files and initiation of processing"""
         # print sample id
@@ -68,21 +73,23 @@ def run(self, fusion_support, fq1, fq2):
         self.logger.info("Fetching in sample {}".format(self.sample_id))
         if not fq1 or not fq2:
             self.logger.debug("Either ReadFile 1 or 2 or both are missing, trying to get original files from samples.csv")
+            self.logger.debug(self.sample_id)
+            self.logger.debug(self.samples.db_path)
             (fq1, fq2) = self.samples.get_fastq_files(self.sample_id)
-        self.execute_pipeline((fq1, fq2), fusion_support)
+        self.execute_pipeline(fq1, fq2, fusion_support)
 
     # urla: there are a lot of local variables declared in the following method.
     #       Although this could be reduced quite strongly, readability would be strongly reduced as well
     # pylint:disable=R0914
-    def execute_pipeline(self, (fq1, fq2), fusion_support):
+    def execute_pipeline(self, fq1, fq2, fusion_support):
         """Create sample specific subfolder structure and run tools on fastq files"""
         # Genome/Gene references to use
-        ref_trans = self.cfg.get('general', 'ref_trans_version')
-        ref_genome = self.cfg.get('general', 'ref_genome_build')
-        genome_fasta_path = self.cfg.get('references', ref_trans + '_genome_fasta_' + ref_genome)
-        genes_adb_path = self.cfg.get('references', ref_trans + '_genes_adb_' + ref_genome)
-        genes_tsl_path = self.cfg.get('references', ref_trans + '_genes_tsl_' + ref_genome)
+        ref_trans = cfg.ref_trans_version
+        ref_genome = cfg.ref_genome_build
+        genome_fasta_path = cfg.references["genome_fasta"]
+        genes_adb_path = cfg.references["genes_adb"]
+        genes_tsl_path = cfg.references["genes_tsl"]
 
         fetchdata_current_path = os.path.join(self.fetchdata_path, "fd_{}_tool".format(fusion_support))
         detected_fusions_path = os.path.join(fetchdata_current_path, "fetched_fusions")
@@ -98,53 +105,65 @@ def execute_pipeline(self, (fq1, fq2), fusion_support):
         for folder in [
                 fetchdata_current_path,
-                detected_fusions_path, context_seq_path,
-                star_genome_path, star_align_path,
+                detected_fusions_path,
+                context_seq_path,
+                star_genome_path,
+                star_align_path,
                 classification_path
         ]:
             IOMethods.create_folder(folder, self.logger)
 
         # processing steps to perform
-        tools = self.cfg.get('general', 'fd_tools').split(",")
+        tools = list(cfg.fd_tools)
+        fusion_tools = cfg.tools
+        module_dir = cfg.module_dir
+        cmds = cfg.commands
         # In case of a liftover, some reference and path must be changed accordingly
         cmd_contextseq_org = ""
         if "Liftover" in tools:
             tools.insert(2, "ContextSeqBak")
             # for read grepping, we need the original reference on which the first mapping was performed
-            cmd_contextseq_org = "python {0} --detected_fusions {1}.bak --annotation_db {2} --out_csv {3}.bak --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(os.path.join(self.easyfuse_path, "fusionannotation.py"), detected_fusions_file, genes_adb_path, context_seq_file, genome_fasta_path, genes_tsl_path, self.cfg.get('general', 'cis_near_distance'), self.cfg.get('general', 'context_seq_len'), self.cfg.get('general', 'tsl_filter'))
+            cmd_contextseq_org = "python {0} --detected_fusions {1}.bak --annotation_db {2} --out_csv {3}.bak --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(os.path.join(module_dir, "fusionannotation.py"), detected_fusions_file, genes_adb_path, context_seq_file, genome_fasta_path, genes_tsl_path, cfg.cis_near_distance,
cfg.context_seq_len, cfg.tsl_filter) # now, references need to be updated according to the target liftover - crossmap_chain = self.cfg.get('liftover', 'crossmap_chain') + crossmap_chain = cfg.liftover["crossmap_chain"] ref_genome_dest = os.path.basename(crossmap_chain).replace(".", "_").split("_")[2].lower() self.logger.debug("Creating a copy of the detected fusions file due to selection of liftover. Old ({0}) data will be kept in \"{1}.bak\"".format(ref_genome, detected_fusions_file)) - genome_fasta_path = self.cfg.get('references', ref_trans + '_genome_fasta_' + ref_genome_dest) - genes_adb_path = self.cfg.get('references', ref_trans + '_genes_adb_' + ref_genome_dest) + genome_fasta_path = cfg.references["genome_fasta_hg37"] + genes_adb_path = cfg.references["genes_adb_hg37"] # urla - note: tmp hack to get original star input reads for normalization with open(os.path.join(classification_path, "Star_org_input_reads.txt"), "w") as infile: - infile.write(str(self.get_input_read_count_from_star(os.path.join(filtered_reads_path, "{}_Aligned.out.bam".format(self.sample_id))))) + read_count = self.get_input_read_count_from_star(os.path.join(filtered_reads_path, "{}_Aligned.out.bam".format(self.sample_id))) + if read_count == -1: + read_count = self.get_input_read_count_from_fastq(fq1) + infile.write(str(read_count)) # Define cmd strings for each program - cmd_fusiondata = "{0} -i {1} -o {2} -s {3} -t {4} -f {5} -l {6}".format(os.path.join(self.easyfuse_path, "fusiontoolparser.py"), self.scratch_path, detected_fusions_path, self.sample_id, fusion_support, self.cfg.get('general', 'fusiontools'), self.logger.get_path()) - cmd_liftover = "{0} -i {1} -c {2} -l {3}".format(os.path.join(self.easyfuse_path, "misc", "liftover.py"), detected_fusions_file, self.cfg.get_path(), self.logger.get_path()) - cmd_contextseq = "{0} --detected_fusions {1} --annotation_db {2} --out_csv {3} --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(os.path.join(self.easyfuse_path, "fusionannotation.py"), detected_fusions_file, genes_adb_path, context_seq_file, genome_fasta_path, genes_tsl_path, self.cfg.get('general', 'cis_near_distance'), self.cfg.get('general', 'context_seq_len'), self.cfg.get('general', 'tsl_filter')) - cpu, _ = self.cfg.get("resources", "fetchdata").split(",") - cmd_starindex = "{0} --runMode genomeGenerate --runThreadN {1} --limitGenomeGenerateRAM 48000000000 --genomeChrBinNbits waiting_for_bin_size_input --genomeSAindexNbases waiting_for_sa_idx_input --genomeDir {2} --genomeFastaFiles {3}".format(self.cfg.get('commands', 'star_cmd'), cpu, star_genome_path, "{0}{1}".format(context_seq_file, ".fasta")) - cmd_staralign_fltr = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}fltr_ --limitBAMsortRAM 48000000000".format(self.cfg.get('commands', 'star_cmd'), star_genome_path, fq1, fq2, cpu, star_align_file) - cmd_bamindex_fltr = "{0} index {1}fltr_Aligned.sortedByCoord.out.bam".format(self.cfg.get('commands', 'samtools_cmd'), star_align_file) - cmd_requantify_fltr = "{0} -i {1}fltr_Aligned.sortedByCoord.out.bam -o {2}_fltr.tdt -d 10".format(os.path.join(self.easyfuse_path, "requantify.py"), star_align_file, classification_file) + cmd_fusiondata = "{0} -i {1} -o {2} -s {3} -t {4} -f {5} -l {6}".format(os.path.join(module_dir, "fusiontoolparser.py"), 
self.scratch_path, detected_fusions_path, self.sample_id, fusion_support, ",".join(cfg.fusiontools), self.logger.get_path()) + cmd_liftover = "{0} -i {1} -l {2}".format(os.path.join(module_dir, "misc", "liftover.py"), detected_fusions_file, self.logger.get_path()) + cmd_contextseq = "{0} --detected_fusions {1} --annotation_db {2} --out_csv {3} --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(os.path.join(module_dir, "fusionannotation.py"), detected_fusions_file, genes_adb_path, context_seq_file, genome_fasta_path, genes_tsl_path, cfg.cis_near_distance, cfg.context_seq_len, cfg.tsl_filter) + cpu = cfg.resources["fetchdata"]["cpu"] +# mem = cfg.resources["fetchdata"]["mem"] + cmd_starindex = "{0} --runMode genomeGenerate --runThreadN {1} --limitGenomeGenerateRAM 48000000000 --genomeChrBinNbits waiting_for_bin_size_input --genomeSAindexNbases waiting_for_sa_idx_input --genomeDir {2} --genomeFastaFiles {3}".format(cmds["star"], cpu, star_genome_path, "{0}{1}".format(context_seq_file, ".fasta")) + cmd_staralign_fltr = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}fltr_ --limitBAMsortRAM 48000000000".format(cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file) + cmd_bamindex_fltr = "{0} index {1}fltr_Aligned.sortedByCoord.out.bam".format(cmds["samtools"], star_align_file) + cmd_requantify_fltr = "{0} -i {1}fltr_Aligned.sortedByCoord.out.bam -o {2}_fltr.tdt -d 10".format(os.path.join(module_dir, "requantify.py"), star_align_file, classification_file) (fq1, fq2) = self.samples.get_fastq_files(self.sample_id) - cmd_staralign_org = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}org_ --limitBAMsortRAM 48000000000".format(self.cfg.get('commands', 'star_cmd'), star_genome_path, fq1, fq2, cpu, star_align_file) - cmd_bamindex_org = "{0} index {1}org_Aligned.sortedByCoord.out.bam".format(self.cfg.get('commands', 'samtools_cmd'), star_align_file) - cmd_requantify_org = "{0} -i {1}org_Aligned.sortedByCoord.out.bam -o {2}_org.tdt -d 10".format(os.path.join(self.easyfuse_path, "requantify.py"), star_align_file, classification_file) + cmd_staralign_org = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}org_ --limitBAMsortRAM 48000000000".format(cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file) + cmd_bamindex_org = "{0} index {1}org_Aligned.sortedByCoord.out.bam".format(cmds["samtools"], star_align_file) + cmd_requantify_org = "{0} -i {1}org_Aligned.sortedByCoord.out.bam -o {2}_org.tdt -d 10".format(os.path.join(module_dir, "requantify.py"), star_align_file, classification_file) # for testing, based on debug. 
should be removed if merged to original - cmd_read_filter2 = "{0} --input {1}_Aligned.out.bam --input2 {2}.debug --output {1}_Aligned.out.filtered2.bam".format(os.path.join(self.easyfuse_path, "getRequantReads.py"), os.path.join(filtered_reads_path, self.sample_id), context_seq_file) + cmd_read_filter2 = "{0} --input {1}_Aligned.out.bam --input2 {2}.debug --output {1}_Aligned.out.filtered2.bam".format(os.path.join(module_dir, "getRequantReads.py"), os.path.join(filtered_reads_path, self.sample_id), context_seq_file) # re-define fastq's if filtering is on (default) - fq0 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace("R1", "R0").replace(".fastq.gz", "_filtered2_singles.fastq.gz")) - fq1 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace(".fastq.gz", "_filtered2.fastq.gz")) - fq2 = os.path.join(filtered_reads_path, os.path.basename(fq2).replace(".fastq.gz", "_filtered2.fastq.gz")) - cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads {5} {4}_Aligned.out.filtered2.bam".format(self.cfg.get('commands', 'samtools_cmd'), fq0, fq1, fq2, os.path.join(filtered_reads_path, self.sample_id), cpu) - cmd_staralign_best = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}best_ --limitBAMsortRAM 48000000000".format(self.cfg.get('commands', 'star_cmd'), star_genome_path, fq1, fq2, cpu, star_align_file) - cmd_bamindex_best = "{0} index {1}best_Aligned.sortedByCoord.out.bam".format(self.cfg.get('commands', 'samtools_cmd'), star_align_file) - cmd_requantify_best = "{0} -i {1}best_Aligned.sortedByCoord.out.bam -o {2}_best.tdt -d 10".format(os.path.join(self.easyfuse_path, "requantify.py"), star_align_file, classification_file) + fq0 = "" + if "Readfilter" in fusion_tools: + fq0 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace("R1", "R0").replace(".fastq.gz", "_filtered2_singles.fastq.gz")) + fq1 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace(".fastq.gz", "_filtered2.fastq.gz")) + fq2 = os.path.join(filtered_reads_path, os.path.basename(fq2).replace(".fastq.gz", "_filtered2.fastq.gz")) + cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads {5} {4}_Aligned.out.filtered2.bam".format(cmds["samtools"], fq0, fq1, fq2, os.path.join(filtered_reads_path, self.sample_id), cpu) + # allow soft-clipping? Specificity? 
--alignEndsType EndToEnd + cmd_staralign_best = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}best_ --limitBAMsortRAM 48000000000".format(cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file) + cmd_bamindex_best = "{0} index {1}best_Aligned.sortedByCoord.out.bam".format(cmds["samtools"], star_align_file) + cmd_requantify_best = "{0} -i {1}best_Aligned.sortedByCoord.out.bam -o {2}_best.tdt -d 10".format(os.path.join(module_dir, "requantify.py"), star_align_file, classification_file) @@ -205,7 +224,7 @@ def execute_pipeline(self, (fq1, fq2), fusion_support): ] # create and submit slurm job if the tool is requested and hasn't been run before - module_file = self.cfg.get('commands', 'module_file') + module_file = os.path.join(module_dir, "build_env.sh") for i, tool in enumerate(exe_tools, 0): if tool in tools: if not exe_dependencies[i] or os.path.exists(exe_dependencies[i]): @@ -229,7 +248,6 @@ def main(): parser.add_argument('-i', '--input', dest='input', help='Data input directory.', required=True) parser.add_argument('-o', '--output', dest='output', help='Data output directory.', required=True) parser.add_argument('-s', '--sample', dest='sample', help='Specify the sample to process.', required=True) - parser.add_argument('-c', '--config', dest='config', help='Specify config file.', default="", required=True) parser.add_argument('--fq1', dest='fq1', help='Input read1 file for re-quantification', default="", required=False) parser.add_argument('--fq2', dest='fq2', help='Input read2 file for re-quantification', default="", required=False) parser.add_argument('--fusion_support', dest='fusion_support', help='Number of fusion tools which must predict the fusion event', default=1) @@ -242,7 +260,7 @@ def main(): # 4: get expression? # ... 
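+    # (Editor's note, not part of the original patch) fusion_support is the
+    # minimum number of fusion prediction tools that must agree on an event.
+    # The loop below runs one Fetching pass per support level, e.g.
+    # --fusion_support 2 produces result sets for events predicted by >=1
+    # and by >=2 tools.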
     for i in range(1, int(args.fusion_support) + 1):
-        proc = Fetching(args.input, args.output, args.sample, Config(args.config))
+        proc = Fetching(args.input, args.output, args.sample)
         proc.run(i, args.fq1, args.fq2)
 
 if __name__ == '__main__':
diff --git a/fusionannotation.py b/fusionannotation.py
index c27f9af..bad7aa9 100755
--- a/fusionannotation.py
+++ b/fusionannotation.py
@@ -7,19 +7,20 @@
     3) Reconstruct possible fusion transcripts as well as the respective wildtype background
     4) Translate transcripts
     5) Select regions of interest for neopeptide searches
-    6) Collect information on the kind of fusion which might be relevant for downstream selection processes
-    6) Filter likely artifacts based on the quality of the annotated transcripts
+    6) Collect information on the kind of fusion which might be relevant for downstream
+       selection processes
+    7) Filter likely artifacts based on the quality of the annotated transcripts
 @author: BNT (URLA)
 @version: 20190704
 """
-from __future__ import print_function, division
-import argparse
+
+from argparse import ArgumentParser
 import gffutils
 import xxhash
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import generic_dna
-from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO  # Biopython is not available for windows (where I run pylint) => pylint: disable=E0401
+from Bio.Seq import Seq  # pylint: disable=E0401
+from Bio.Alphabet import generic_dna  # pylint: disable=E0401
+from Bio.SeqRecord import SeqRecord  # pylint: disable=E0401
 
 # pylint: disable=line-too-long
 # yes they are partially, but I do not consider this to be relevant here
@@ -33,7 +34,8 @@ def __init__(self, db_file, bp_file, tsl_file):
         self.cds_seq_dict = {}
         self.trans_flags_dict = {}
 
-    def load_featuredb(self, db_in_file):
+    @staticmethod
+    def load_featuredb(db_in_file):
         """ Perform a simple check to confirm that the database input is a database """
         # urla - note: I would like to have a better check implemented before trying to load the db but currently don't know how this should look like
         if not db_in_file.endswith("db"):
         # return the db
         return gffutils.FeatureDB(db_in_file)
 
-    def breakpoints_to_dict(self, bp_in_file):
+    @staticmethod
+    def breakpoints_to_dict(bp_in_file):
         """ Read fgid and breakpoints from Detected_Fusions.csv and store the info in a dict """
         bp_dict = {}
         with open(bp_in_file, "r") as bpin:
@@ -53,7 +56,8 @@ def breakpoints_to_dict(self, bp_in_file):
             bp_dict[fgid] = (bp1, bp2)
         return bp_dict
 
-    def load_tsl_data(self, tsl_in_file):
+    @staticmethod
+    def load_tsl_data(tsl_in_file):
         """ Load data created by gtf2tsl.py into a dict """
         tsl_dict = {}
         with open(tsl_in_file, "r") as tslin:
@@ -76,7 +80,8 @@ def get_tsl(self, trans_id):
             return "6"
         return self.tsl_dict[trans_id]
 
-    def define_type(self, cis_near_distance, bp1_chr, bp1_pos, bp1_strand, bp2_chr, bp2_pos, bp2_strand):
+    @staticmethod
+    def define_type(cis_near_distance, bp1_chr, bp1_pos, bp1_strand, bp2_chr, bp2_pos, bp2_strand):
         """ Define the fusion type based on the location and orientation of the fusion partners """
         # Fusion type:
         # - "trans" (bp of fusion partners on different chromosomes, but with same strand)
@@ -109,7 +114,7 @@ def get_bp_overlapping_features(self, bp_chr, bp_pos):
         """ Get two lists with breakpoint overlapping exons and cds from the database """
         bp_exons = []
         bp_cds = []
-        for efeature in self.db.region(region="{}:{}-{}".format(bp_chr, bp_pos - 1, bp_pos + 1), completely_within=False, featuretype = ("exon",
"CDS")): + for efeature in self.db.region(region="{}:{}-{}".format(bp_chr, bp_pos - 1, bp_pos + 1), completely_within=False, featuretype=("exon", "CDS")): if efeature.featuretype == "exon": bp_exons.append(efeature) elif efeature.featuretype == "CDS": @@ -121,7 +126,8 @@ def get_bp_overlapping_features(self, bp_chr, bp_pos): bp_cds = [bp_cds[cds_transcripts.index(x)] if x in cds_transcripts else "" for x in exon_transcripts] return bp_exons, bp_cds - def get_boundary(self, bp_pos, efeature): + @staticmethod + def get_boundary(bp_pos, efeature): """ Check whether the breakpoint position is on an exon or CDS boundary """ if efeature == "": return "NA" @@ -133,7 +139,8 @@ def get_boundary(self, bp_pos, efeature): return "within" return "outside" - def get_boundary2(self, boundary1, boundary2, strand1, strand2): + @staticmethod + def get_boundary2(boundary1, boundary2, strand1, strand2): """ Check whether boundaries of both fusion partners are in the correct orientation """ # urla - note: "both" is returned to be compatible with the existing pipeline although the name is not very meaningful here if strand1 == "+": @@ -150,7 +157,7 @@ def get_boundary2(self, boundary1, boundary2, strand1, strand2): else: if boundary1 == "left_boundary" and boundary2 == "right_boundary": return "both" - + # the "and not" checks in the following ifs should be superficial, because the only way the check # could be unintendently true is if one of the boundaries is on the wrong site of the exon # If this can be excluded to 100%, the previous if/else should also be simplified strongly! @@ -168,6 +175,7 @@ def get_parents(self, efeature): gene_name = "" gene_biotype = "" description = "" + count_parents = 0 for count_parents, parent in enumerate(self.db.parents(efeature.id), 1): if parent.id.startswith("transcript:"): trans_id = parent.id @@ -177,20 +185,19 @@ def get_parents(self, efeature): gene_name = parent.attributes["Name"][0] gene_biotype = parent.attributes["biotype"][0] try: - description = parent.attributes["description"][0].replace(";"," ") + description = parent.attributes["description"][0].replace(";", " ") except KeyError: description = "NA" if count_parents > 2: - print("Warning: {0} has more than two parents. Please check that gene_id {1} and trans_id {2} are correct for the exon at position {3}-{4}.".format( - efeature.id, gene_id, trans_id, efeature.start, efeature.stop)) - return trans_id, trans_biotype, gene_id, gene_name, gene_biotype, description + print("Warning: {0} has more than two parents. 
Please check that gene_id {1} and trans_id {2} are correct for the exon at position {3}-{4}.".format(efeature.id, gene_id, trans_id, efeature.start, efeature.stop)) + return trans_id, trans_biotype, gene_name, gene_biotype, description def get_wt_codings(self, trans_id): """ Get matched exons and cds region per transcript id """ exon_pos_list = [] cds_pos_list = [] cds_frame_list = [] - for feature in self.db.children(trans_id, featuretype = ("exon", "CDS"), order_by = "start", reverse = False): + for feature in self.db.children(trans_id, featuretype=("exon", "CDS"), order_by="start", reverse=False): if feature.featuretype == "exon": exon_pos_list.append((feature.start, feature.stop)) elif feature.featuretype == "CDS": @@ -215,7 +222,8 @@ def get_wt_codings(self, trans_id): cds_frame_list2.append("NA") return exon_pos_list, cds_pos_list2, cds_frame_list2 - def check_exon_overlap(self, wt1_exon_pos_list, wt2_exon_pos_list, fusion_type): + @staticmethod + def check_exon_overlap(wt1_exon_pos_list, wt2_exon_pos_list, fusion_type): """ Returns true if exons overlap and false otherwise """ # exon position lists are always sorted and we therefore don't need any information on the strand # gene on different chromosomes @@ -226,65 +234,49 @@ def check_exon_overlap(self, wt1_exon_pos_list, wt2_exon_pos_list, fusion_type): return False return True - def get_frame(self, bp_pos, cds_pos_list, cds_frame_list, strand): + @staticmethod + def get_frame(bp_pos, cds_pos_list, cds_frame_list, strand): """ Get/Calculate the frame at the start of the cds and at the breakpoint """ # urla - note: the frame annotation is imho a little confusing in ensembl and defined as follows: # 0: The next full codon (i.e. 3bp codon) starts 0 bases from the current position => this is the first (0) base of a codon # 1: The next full codon (i.e. 3bp codon) starts 1 base from the current position => this is the third (2) base of a codon # 2: The next full codon (i.e. 
3bp codon) starts 2 bases from the current position => this is the second (1) base of a codon - pos_frame_idx = [0, 2, 1, 0, 2, 1] -# neg_frame_idx = [2, 0, 1, 2, 0, 1] - + frame_idx = [0, 2, 1, 0, 2, 1] # if the frame at bp is non-determinable, it is set to -1 frame_at_start = -1 frame_at_bp = -1 # zip pos and frame while excluding NAs - pos_frame_list = [(x,int(y)) for x,y in zip(cds_pos_list, cds_frame_list) if not x == "NA"] + pos_frame_list = [(x, int(y)) for x, y in zip(cds_pos_list, cds_frame_list) if not x == "NA"] # find breakpoint cds and get the frame at the breakpoint for cds_pos, cds_frame in pos_frame_list: - -# if cds_pos[0] <= bp_pos <= cds_pos[1]: -# tmp_frame_idx = pos_frame_idx[pos_frame_idx.index(cds_frame):pos_frame_idx.index(cds_frame)+3] -# frame_at_bp = tmp_frame_idx[(bp_pos - (cds_pos[0] - 1)) % 3] -# break - - tmp_frame_idx = pos_frame_idx[pos_frame_idx.index(cds_frame):pos_frame_idx.index(cds_frame)+3] -# if (strand == "+" and bp_pos == cds_pos[0]) or (strand == "-" and bp_pos == cds_pos[1]): -# frame_at_bp = cds_frame -# elif cds_pos[0] < bp_pos <= cds_pos[1]: -# frame_at_bp = tmp_frame_idx[(bp_pos - (cds_pos[0] - 1)) % 3] - + tmp_frame_idx = frame_idx[frame_idx.index(cds_frame):frame_idx.index(cds_frame)+3] + # on pos strand if strand == "+": -# tmp_frame_idx = pos_frame_idx[pos_frame_idx.index(cds_frame):pos_frame_idx.index(cds_frame)+3] -# frame_at_bp = tmp_frame_idx[(bp_pos - (cds_pos[0] - 1)) % 3] if bp_pos == cds_pos[0]: frame_at_bp = cds_frame # break elif cds_pos[0] < bp_pos <= cds_pos[1]: frame_at_bp = tmp_frame_idx[(bp_pos - (cds_pos[0] - 1)) % 3] # break -# frame_at_bp = pos_frame_idx[((bp_pos + 1) - cds_pos[0]) % 3 + cds_frame] + # on neg strand else: -# tmp_frame_idx = neg_frame_idx[pos_frame_idx.index(cds_frame):pos_frame_idx.index(cds_frame)+3] - if bp_pos == cds_pos[1]: frame_at_bp = cds_frame # break elif cds_pos[0] <= bp_pos < cds_pos[1]: frame_at_bp = tmp_frame_idx[(cds_pos[1] - (bp_pos - 1)) % 3] # break -# frame_at_bp = neg_frame_idx[(cds_pos[1] - (bp_pos + 1)) % 3 + cds_frame] # get the starting frame of the cds if not len(pos_frame_list) == 0: if strand == "+": frame_at_start = pos_frame_list[0][1] else: -# frame_at_start = frame_idx[(pos_frame_list[-1][0][1] - pos_frame_list[-1][0][0]) % 3 + pos_frame_list[-1][1]] frame_at_start = pos_frame_list[-1][1] # return frame at the beginning of the first cds and at the breakpoint position, both corrected according the the reading strand return frame_at_start, frame_at_bp - def get_frame2(self, frame1, frame2): + @staticmethod + def get_frame2(frame1, frame2): """ Combine frame info from bp1 and 2 """ if frame1 == -1: return "no_frame" @@ -294,7 +286,8 @@ def get_frame2(self, frame1, frame2): return "in_frame" return "out_frame" - def get_fusion_feature_list(self, bp1_pos, bp2_pos, bp1_strand, bp2_strand, bp1_feature_pos_list, bp2_feature_pos_list): + @staticmethod + def get_fusion_feature_list(bp1_pos, bp2_pos, bp1_strand, bp2_strand, bp1_feature_pos_list, bp2_feature_pos_list): """ Based on the breakpoints, the strand and the complete feature positions of the involved genes, return only those feature positions which will remain in the fusion. 
""" bp1_feature_fus_list = [] bp2_feature_fus_list = [] @@ -343,7 +336,8 @@ def fill_seq_lookup_dict(self, chrom, cds_list): if not cds_coord in self.cds_seq_dict[chrom][0]: self.cds_seq_dict[chrom][0].append(cds_coord) - def get_stranded_seq(self, sequence, strand): + @staticmethod + def get_stranded_seq(sequence, strand): """ Return the reverse complement of a sequence if the strand is negative and the unchanged sequence otherwise """ if strand == "-": return sequence.reverse_complement() @@ -353,20 +347,20 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le """ Run the annotation pipeline """ # for performance reasons (mainly seq grepping), results cannot be written on the fly and everything will therefore be stored in a list of lists header = [ - "FGID", "Fusion_Gene", "Breakpoint1", "Breakpoint2", "FTID", "context_sequence_id", "context_sequence_100_id", "type", - "exon_nr", "exon_starts", "exon_ends", "exon_boundary1", "exon_boundary2", "exon_boundary", "bp1_frame", "bp2_frame", - "frame", "context_sequence", "context_sequence_bp", "neo_peptide_sequence", "neo_peptide_sequence_bp", - - "context_sequence_wt1", "context_sequence_wt2", "context_sequence_wt1_bp", "context_sequence_wt2_bp", "context_sequence_100", "bp1_chr", - "bp1_pos", "bp1_strand", "bp2_chr", "bp2_pos", "bp2_strand", "cds_boundary1", "cds_boundary2", "wt1_exon_pos", "wt2_exon_pos", "ft1_exon_pos", - "ft2_exon_pos", "wt1_cds_pos", "wt2_cds_pos", "ft1_cds_pos", "ft2_cds_pos", "wt1_exon_seqs", "wt2_exon_seqs", "ft1_exon_seqs", "ft2_exon_seqs", - "wt1_cds_seqs", "wt2_cds_seqs", "ft1_cds_seqs", "ft2_cds_seqs", "wt1_exon_transcripts", "wt2_exon_transcripts", "ft1_exon_transcripts", - "ft2_exon_transcripts", "wt1_cds_transcripts", "wt2_cds_transcripts", "ft1_cds_transcripts", "ft2_cds_transcripts", "wt1_peptide", - "wt2_peptide", "fusion_transcript", "fusion_peptide", "wt1_is_good_transcript", "wt2_is_good_transcript", "wt1_trans_biotype", - "wt2_trans_biotype", "wt1_gene_biotype", "wt2_gene_biotype", "wt1_description", "wt2_description", "wt1_frame_at_start", "wt2_frame_at_start", - "wt1_TSL", "wt2_TSL", "wt1_exon_no", "wt2_exon_no", "ft1_exon_no", "ft2_exon_no", "wt1_cds_no", "wt2_cds_no", "ft1_cds_no", "ft2_cds_no", - "wt1_start_stop", "wt2_start_stop", "annotation_bias" - ] + "FGID", "Fusion_Gene", "Breakpoint1", "Breakpoint2", "FTID", "context_sequence_id", "context_sequence_100_id", "type", + "exon_nr", "exon_starts", "exon_ends", "exon_boundary1", "exon_boundary2", "exon_boundary", "bp1_frame", "bp2_frame", + "frame", "context_sequence", "context_sequence_bp", "neo_peptide_sequence", "neo_peptide_sequence_bp", + + "context_sequence_wt1", "context_sequence_wt2", "context_sequence_wt1_bp", "context_sequence_wt2_bp", "context_sequence_100", "bp1_chr", + "bp1_pos", "bp1_strand", "bp2_chr", "bp2_pos", "bp2_strand", "cds_boundary1", "cds_boundary2", "wt1_exon_pos", "wt2_exon_pos", "ft1_exon_pos", + "ft2_exon_pos", "wt1_cds_pos", "wt2_cds_pos", "ft1_cds_pos", "ft2_cds_pos", "wt1_exon_seqs", "wt2_exon_seqs", "ft1_exon_seqs", "ft2_exon_seqs", + "wt1_cds_seqs", "wt2_cds_seqs", "ft1_cds_seqs", "ft2_cds_seqs", "wt1_exon_transcripts", "wt2_exon_transcripts", "ft1_exon_transcripts", + "ft2_exon_transcripts", "wt1_cds_transcripts", "wt2_cds_transcripts", "ft1_cds_transcripts", "ft2_cds_transcripts", "wt1_peptide", + "wt2_peptide", "fusion_transcript", "fusion_peptide", "wt1_is_good_transcript", "wt2_is_good_transcript", "wt1_trans_biotype", + "wt2_trans_biotype", "wt1_gene_biotype", "wt2_gene_biotype", 
"wt1_description", "wt2_description", "wt1_frame_at_start", "wt2_frame_at_start", + "wt1_TSL", "wt2_TSL", "wt1_exon_no", "wt2_exon_no", "ft1_exon_no", "ft2_exon_no", "wt1_cds_no", "wt2_cds_no", "ft1_cds_no", "ft2_cds_no", + "wt1_start_stop", "wt2_start_stop", "annotation_bias" + ] header_idx = range(len(header)) header_dict = dict(zip(header, header_idx)) @@ -397,9 +391,9 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le # check whether breakpoint1 is located on a CDS boundary cds_boundary1 = self.get_boundary(bp1_pos, bp1_cfeature) # get id and additional infos from exon parents transcript and gene - wt1_trans_id, wt1_trans_biotype, wt1_gene_id, wt1_gene_name, wt1_gene_biotype, wt1_description = self.get_parents(bp1_efeature) + wt1_trans_id, wt1_trans_biotype, wt1_gene_name, wt1_gene_biotype, wt1_description = self.get_parents(bp1_efeature) # create a set to collect "suspicious findings" for transcripts - if not wt1_trans_id in self.trans_flags_dict: + if wt1_trans_id not in self.trans_flags_dict: self.trans_flags_dict[wt1_trans_id] = set() # get the complete set of exons, corresponding cds (cds at the same index are NA if there is none in the exon) and start frames of the cds wt1_exon_pos_list, wt1_cds_pos_list, wt1_cds_frame_list = self.get_wt_codings(wt1_trans_id) @@ -422,8 +416,8 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le # combined exon boundary information exon_boundary = self.get_boundary2(exon_boundary1, exon_boundary2, bp1_strand, bp2_strand) cds_boundary2 = self.get_boundary(bp2_pos, bp2_cfeature) - wt2_trans_id, wt2_trans_biotype, wt2_gene_id, wt2_gene_name, wt2_gene_biotype, wt2_description = self.get_parents(bp2_efeature) - if not wt2_trans_id in self.trans_flags_dict: + wt2_trans_id, wt2_trans_biotype, wt2_gene_name, wt2_gene_biotype, wt2_description = self.get_parents(bp2_efeature) + if wt2_trans_id not in self.trans_flags_dict: self.trans_flags_dict[wt2_trans_id] = set() # generate ftid from gene names, breakpoints and transcript ids ftid = "_".join([wt1_gene_name, bp1, wt1_trans_id.split(":")[1], wt2_gene_name, bp2, wt2_trans_id.split(":")[1]]) @@ -461,25 +455,23 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le exon_nr = len(ft1_cds_pos_list) + len(ft2_cds_pos_list) else: exon_nr = len(ft1_cds_pos_list) + len(ft2_exon_pos_list) -# exon_starts = "{}*{}".format("|".join([str(x) for x,y in bp1_cds_pos_list]), "|".join([str(x) for x,y in bp2_cds_pos_list])) # actually cds starts -# exon_ends = "{}*{}".format("|".join([str(y) for x,y in bp1_cds_pos_list]), "|".join([str(y) for x,y in bp2_cds_pos_list])) # actually cds ends wt1_is_good_transcript = self.trans_flags_dict[wt1_trans_id] wt2_is_good_transcript = self.trans_flags_dict[wt2_trans_id] results_lists.append([ - fgid_ef, fusion_gene_ef, bp1, bp2, ftid, "context_sequence_id", "context_sequence_100_id", fusion_type, - str(exon_nr), "exon_starts", "exon_ends", exon_boundary1, exon_boundary2, exon_boundary, str(wt1_frame_at_bp), str(wt2_frame_at_bp), - frame, "context_sequence", "context_sequence_bp", "neo_peptide_sequence", "neo_peptide_sequence_bp", - - "context_sequence_wt1", "context_sequence_wt2", "context_sequence_wt1_bp", "context_sequence_wt2_bp", "context_sequence_100", bp1_chr, - bp1_pos, bp1_strand, bp2_chr, bp2_pos, bp2_strand, cds_boundary1, cds_boundary2, wt1_exon_pos_list, wt2_exon_pos_list, ft1_exon_pos_list, - ft2_exon_pos_list, wt1_cds_pos_list, wt2_cds_pos_list, ft1_cds_pos_list, ft2_cds_pos_list, 
[], [], [], [], - [], [], [], [], "wt1_exon_transcripts", "wt2_exon_transcripts", "ft1_exon_transcripts", - "ft2_exon_transcripts", "wt1_cds_transcripts", "wt2_cds_transcripts", "ft1_cds_transcripts", "ft2_cds_transcripts", "wt1_peptide", - "wt2_peptide", "fusion_transcript", "fusion_peptide", wt1_is_good_transcript, wt2_is_good_transcript, wt1_trans_biotype, - wt2_trans_biotype, wt1_gene_biotype, wt2_gene_biotype, wt1_description, wt2_description, wt1_frame_at_start, wt2_frame_at_start, - wt1_tsl, wt2_tsl, len(wt1_exon_pos_list), len(wt2_exon_pos_list), len(ft1_exon_pos_list), len(ft2_exon_pos_list), len(wt1_cds_pos_list), len(wt2_cds_pos_list), len(ft1_cds_pos_list), len(ft2_cds_pos_list), - "{}:{}:{}".format(bp1_chr, wt1_exon_pos_list[0][0], wt1_exon_pos_list[-1][1]), "{}:{}:{}".format(bp2_chr, wt2_exon_pos_list[0][0], wt2_exon_pos_list[-1][1]), "annotation_bias" - ]) + fgid_ef, fusion_gene_ef, bp1, bp2, ftid, "context_sequence_id", "context_sequence_100_id", fusion_type, + str(exon_nr), "exon_starts", "exon_ends", exon_boundary1, exon_boundary2, exon_boundary, str(wt1_frame_at_bp), str(wt2_frame_at_bp), + frame, "context_sequence", "context_sequence_bp", "neo_peptide_sequence", "neo_peptide_sequence_bp", + + "context_sequence_wt1", "context_sequence_wt2", "context_sequence_wt1_bp", "context_sequence_wt2_bp", "context_sequence_100", bp1_chr, + bp1_pos, bp1_strand, bp2_chr, bp2_pos, bp2_strand, cds_boundary1, cds_boundary2, wt1_exon_pos_list, wt2_exon_pos_list, ft1_exon_pos_list, + ft2_exon_pos_list, wt1_cds_pos_list, wt2_cds_pos_list, ft1_cds_pos_list, ft2_cds_pos_list, [], [], [], [], + [], [], [], [], "wt1_exon_transcripts", "wt2_exon_transcripts", "ft1_exon_transcripts", + "ft2_exon_transcripts", "wt1_cds_transcripts", "wt2_cds_transcripts", "ft1_cds_transcripts", "ft2_cds_transcripts", "wt1_peptide", + "wt2_peptide", "fusion_transcript", "fusion_peptide", wt1_is_good_transcript, wt2_is_good_transcript, wt1_trans_biotype, + wt2_trans_biotype, wt1_gene_biotype, wt2_gene_biotype, wt1_description, wt2_description, wt1_frame_at_start, wt2_frame_at_start, + wt1_tsl, wt2_tsl, len(wt1_exon_pos_list), len(wt2_exon_pos_list), len(ft1_exon_pos_list), len(ft2_exon_pos_list), len(wt1_cds_pos_list), len(wt2_cds_pos_list), len(ft1_cds_pos_list), len(ft2_cds_pos_list), + "{}:{}:{}".format(bp1_chr, wt1_exon_pos_list[0][0], wt1_exon_pos_list[-1][1]), "{}:{}:{}".format(bp2_chr, wt2_exon_pos_list[0][0], wt2_exon_pos_list[-1][1]), "annotation_bias" + ]) # grep chromosomes and extract cds sequences from the @cds_seq_dict # urla note: the max(..) 
check in the slicing is probably superfluous as CDS annotations should not start at chromosome position 0 @@ -523,7 +515,7 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le result_list[header_dict["context_sequence"]] = self.get_stranded_seq(result_list[header_dict["ft1_exon_transcripts"]], result_list[header_dict["bp1_strand"]]) result_list[header_dict["context_sequence_wt1"]] = self.get_stranded_seq(result_list[header_dict["wt1_exon_transcripts"]], result_list[header_dict["bp1_strand"]]) result_list[header_dict["fusion_transcript"]] = self.get_stranded_seq(result_list[header_dict["ft1_cds_transcripts"]], result_list[header_dict["bp1_strand"]]) - result_list[header_dict["wt1_peptide"]] = self.get_stranded_seq(result_list[header_dict["wt1_cds_transcripts"]], result_list[header_dict["bp1_strand"]])[translation_shift1:].translate(table = trans_table) + result_list[header_dict["wt1_peptide"]] = self.get_stranded_seq(result_list[header_dict["wt1_cds_transcripts"]], result_list[header_dict["bp1_strand"]])[translation_shift1:].translate(table=trans_table) # do the same for fusion partner 2 if not last_chr == result_list[header_dict["bp2_chr"]]: @@ -549,7 +541,7 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le result_list[header_dict["fusion_transcript"]] += self.get_stranded_seq(result_list[header_dict["ft2_cds_transcripts"]], result_list[header_dict["bp2_strand"]]) else: result_list[header_dict["fusion_transcript"]] += self.get_stranded_seq(result_list[header_dict["ft2_exon_transcripts"]], result_list[header_dict["bp2_strand"]]) - result_list[header_dict["wt2_peptide"]] = self.get_stranded_seq(result_list[header_dict["wt2_cds_transcripts"]], result_list[header_dict["bp2_strand"]])[translation_shift2:].translate(table = trans_table) + result_list[header_dict["wt2_peptide"]] = self.get_stranded_seq(result_list[header_dict["wt2_cds_transcripts"]], result_list[header_dict["bp2_strand"]])[translation_shift2:].translate(table=trans_table) # bp pos in the fusion bp_in_fusion_nt = len(result_list[header_dict["ft1_cds_transcripts"]]) # the fusion starts in the fusion transcript with the end of the first part @@ -559,7 +551,7 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le bp_in_fusion_aa = ((bp_in_fusion_nt - translation_shift1)* 10 // 3) / 10 # the respective bp pos in the aa seq, whereas x.0 / x.3 / x.6 indicate that the breakpoint is at the beginning / after first base / after second base of the underlying codon # translate and trim the fusion sequence around the breakpoint - result_list[header_dict["fusion_peptide"]] = result_list[header_dict["fusion_transcript"]][translation_shift1:].translate(table = 1, to_stop=True) + result_list[header_dict["fusion_peptide"]] = result_list[header_dict["fusion_transcript"]][translation_shift1:].translate(table=1, to_stop=True) result_list[header_dict["context_sequence_100"]] = result_list[header_dict["context_sequence"]][max(0, bp_in_fusion_nt_ex - 100):(bp_in_fusion_nt_ex + 100)] result_list[header_dict["context_sequence"]] = result_list[header_dict["context_sequence"]][max(0, bp_in_fusion_nt_ex - context_seq_len):(bp_in_fusion_nt_ex + context_seq_len)] result_list[header_dict["context_sequence_wt1"]] = result_list[header_dict["context_sequence_wt1"]][max(0, bp_in_wt1_nt_ex - context_seq_len):(bp_in_wt1_nt_ex + context_seq_len)] @@ -590,6 +582,7 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le # urla: needs revision and 
simplification, but is working...
         neo_pep_nuc_length = len(result_list[header_dict["neo_peptide_sequence"]]) * 3
         neo_pep_until_bp_nuc = int(round(result_list[header_dict["neo_peptide_sequence_bp"]] * 3, 0))
+        exon_pos = 1
         if result_list[header_dict["bp1_strand"]] == "+":
             summed_len = 0
             for exon_pos, exon_length in enumerate([y-x for x, y in result_list[header_dict["ft1_exon_pos"]]][::-1], 1):
@@ -620,8 +613,8 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le
                     break
             lookup_exons2 = result_list[header_dict["ft2_exon_pos"]][-exon_pos:]
 
-        result_list[header_dict["exon_starts"]] = "{}*{}".format("|".join([str(x) for x,y in lookup_exons1]), "|".join([str(x) for x,y in lookup_exons2]))
-        result_list[header_dict["exon_ends"]] = "{}*{}".format("|".join([str(y) for x,y in lookup_exons1]), "|".join([str(y) for x,y in lookup_exons2]))
+        result_list[header_dict["exon_starts"]] = "{}*{}".format("|".join([str(x) for x, y in lookup_exons1]), "|".join([str(x) for x, y in lookup_exons2]))
+        result_list[header_dict["exon_ends"]] = "{}*{}".format("|".join([str(y) for x, y in lookup_exons1]), "|".join([str(y) for x, y in lookup_exons2]))
 
         # experimental
         if not len(result_list[header_dict["wt1_cds_transcripts"]]) % 3 == 0:
@@ -634,7 +627,7 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le
         # write everything to a couple of files
         # 1) context.csv.debug and context.csv : for easy debugging, we will write these two files.
-        #    The first contains all informations, the second is filtered and 
+        #    The first contains all information, the second is filtered and
         #    contains only downstream required columns
         # 2) context_seqs.csv.fasta, context_seqs.csv_transcripts.fasta and context_seqs.csv_peptide.fasta (the last two only for debugging):
         #    These files contain the fasta sequences of the ft/wt1/wt2 context sequences,
@@ -665,10 +658,10 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le
                 # context data for star genome generation
                 count_context_seqs += 3
                 summed_context_seqs_len += sum(map(len, [
-                        result_line[header_dict["context_sequence"]],
-                        result_line[header_dict["context_sequence_wt1"]],
-                        result_line[header_dict["context_sequence_wt2"]]
-                        ]))
+                    result_line[header_dict["context_sequence"]],
+                    result_line[header_dict["context_sequence_wt1"]],
+                    result_line[header_dict["context_sequence_wt2"]]
+                ]))
                 # sequences for requantification
                 SeqIO.write(SeqRecord(result_line[header_dict["context_sequence"]], id="{}_{}_{}_ft".format(result_line[header_dict["FTID"]], result_line[header_dict["context_sequence_id"]], result_line[header_dict["context_sequence_bp"]]), name="", description=""), cfasta, "fasta")
                 SeqIO.write(SeqRecord(result_line[header_dict["context_sequence_wt1"]], id="{}_{}_{}_wt1".format(result_line[header_dict["FTID"]], result_line[header_dict["context_sequence_id"]], result_line[header_dict["context_sequence_wt1_bp"]]), name="", description=""), cfasta, "fasta")
@@ -680,7 +673,7 @@ def run(self, context_seqs_file, cis_near_distance, genome_fasta, context_seq_le
 
 def main():
     """Parse command line arguments and start script"""
-    parser = argparse.ArgumentParser(description="Generate mapping stats for fusion detection")
+    parser = ArgumentParser(description="Generate mapping stats for fusion detection")
     parser.add_argument('--detected_fusions', dest='detected_fusions', help='detected_fusions.csv file from easyfuse', required=True)
     parser.add_argument('--annotation_db', dest='annotation_db', help='gff3 derived annotation
database from gffutils', required=True) parser.add_argument('--out_csv', dest='out_csv', help='context_seqs.csv output for easyfuse', required=True) diff --git a/fusionreadfilter.py b/fusionreadfilter.py index 2d3cf39..8ae5ef4 100755 --- a/fusionreadfilter.py +++ b/fusionreadfilter.py @@ -20,7 +20,6 @@ @version: 20181126 """ -from __future__ import print_function import sys import time from argparse import ArgumentParser diff --git a/fusiontoolparser.py b/fusiontoolparser.py index f517767..f290d2e 100755 --- a/fusiontoolparser.py +++ b/fusiontoolparser.py @@ -10,7 +10,6 @@ @version: 20181126 """ -from __future__ import print_function import sys import os import re @@ -33,13 +32,13 @@ def __init__(self, scratch_path, fusion_output_path, sample_id, tool_num_cutoff, self.tool_num_cutoff = int(tool_num_cutoff) # urla: if we want to be more generic and allow different annotations, identification of the chr names # (eg "chr1" vs "1" and "chrM" vs "MT") should be performed in advance - self.chr_list = [ + self.chr_list = ( "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT" - ] + ) self.tools = fusiontool_list.split(",") self.logger = Logger(sample_log) diff --git a/getRequantReads.py b/getRequantReads.py index 896006b..7c5a76f 100755 --- a/getRequantReads.py +++ b/getRequantReads.py @@ -20,7 +20,6 @@ @version: 20181126 """ -from __future__ import print_function import sys import time from argparse import ArgumentParser diff --git a/join_data.py b/join_data.py index 7aabbca..b37c4e7 100755 --- a/join_data.py +++ b/join_data.py @@ -9,11 +9,11 @@ @version: 20190118 """ -from __future__ import print_function, division import os import sys import pandas as pd -import misc.queue as Queueing +import misc.queueing as Queueing +import config as cfg # pylint: disable=line-too-long # yes they are partially, but I do not consider this to be relevant here @@ -28,7 +28,6 @@ def __init__(self, input_dir, id1, id2, output, model_predictions): self.model_predictions = model_predictions pd.options.mode.chained_assignment = None self.input_read_count = 0 - self.easyfuse_path = os.path.dirname(os.path.realpath(__file__)) self.blacklist = [] def append_tool_cnts_to_context_file(self, context_data, detected_fusions, fusion_tool_list): @@ -101,11 +100,11 @@ def create_joined_table(self, sample_id, fusion_tools, requant_mode): return context_data.fillna(0), redundant_header - def run(self, config): + def run(self): """run the data concatenation""" - fusion_tools = config.get("general", "fusiontools").split(",") - requant_mode = config.get("general", "requant_mode").split(",") - self.load_blacklist(config.get("easyfuse_helper", "blacklist")) + fusion_tools = cfg.fusiontools + requant_mode = cfg.requant_mode + self.load_blacklist(os.path.join(cfg.module_dir, "blacklist.txt")) # urla - note: with icam_run set to True, two results from technical replicates are expected as input print("Looking for required files...") # check whether output already exists @@ -122,10 +121,10 @@ def run(self, config): for table in ["1", "2"]: summary_file = "{}_fusRank_{}.csv".format(self.output, table) if self.model_predictions and self.check_files(summary_file, True): - model_path = config.get("otherFiles", "easyfuse_model") - model_threshold = config.get("general", "model_pred_threshold") + model_path = cfg.other_files["easyfuse_model"] + model_threshold = cfg.model_pred_threshold # append prediction scores based on pre-calculated model - cmd_model = "{0} 
--fusion_summary {1} --model_file {2} --prediction_threshold {3} --output {4}".format(os.path.join(self.easyfuse_path, "R", "R_model_prediction.R"), summary_file, model_path, model_threshold, "{}.pModelPred.csv".format(summary_file[:-4])) + cmd_model = "{0} --fusion_summary {1} --model_file {2} --prediction_threshold {3} --output {4}".format(os.path.join(cfg.module_dir, "R", "R_model_prediction.R"), summary_file, model_path, model_threshold, "{}.pModelPred.csv".format(summary_file[:-4])) Queueing.submit("", cmd_model.split(" "), "", "", "", "", "", "", "", "", "", "none") # re-read the table with prediction for filter counting # urla - note: there is probably a more elegant way using getattr/setattr but I'm not at all familiar with its pros/cons @@ -211,9 +210,9 @@ def count_records(self, pd_data_frame, is_merged, name): ftid_dict[ftid] = ftid_dict[ftid][0] # create sets of fusion gene names according to different fiters - fusion_gene_set_list[1] = set([ftid_dict[x] for x in df_rep[df_rep["type"] != "cis_near"]["FTID"]]) # exclude cis near events as they are likely read through - fusion_gene_set_list[2] = set([ftid_dict[x] for x in df_rep[df_rep["exon_boundary"] == "both"]["FTID"]]) # allow only fusions where the breakpoints are on exon boundaries in BOTH fusion partners - fusion_gene_set_list[3] = set([ftid_dict[x] for x in df_rep[df_rep["frame"] != "no_frame"]["FTID"]]) # exclude no frame fusions + fusion_gene_set_list[1] = set([ftid_dict[x] for x in df_rep[df_rep["type"].astype(str) != "cis_near"]["FTID"]]) # exclude cis near events as they are likely read through + fusion_gene_set_list[2] = set([ftid_dict[x] for x in df_rep[df_rep["exon_boundary"].astype(str) == "both"]["FTID"]]) # allow only fusions where the breakpoints are on exon boundaries in BOTH fusion partners + fusion_gene_set_list[3] = set([ftid_dict[x] for x in df_rep[df_rep["frame"].astype(str) != "no_frame"]["FTID"]]) # exclude no frame fusions # urla - note: I'm not 100% why str.isalpha is working everywhere I tested, but not on our dev servers... This way, it works, although I thought the pandas str method would this already... 
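+        # (Editor's note, not part of the original patch) .astype(str) makes the
+        # .str accessor safe on columns that pandas may have coerced to numeric
+        # (missing values were filled with 0 in create_joined_table);
+        # .str.isalpha() then keeps only non-empty, purely alphabetic peptide
+        # sequences, e.g.:
+        #     pd.Series(["MKV", "", 0]).astype(str).str.isalpha()
+        #     -> [True, False, False]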
fusion_gene_set_list[4] = set([ftid_dict[x] for x in df_rep[df_rep["neo_peptide_sequence"].astype(str).str.isalpha()]["FTID"]]) # the neo peptide sequence must be an alphabetic sequence of at least length 1 if is_merged: diff --git a/misc/config.py b/misc/config.py deleted file mode 100755 index 2d9d38c..0000000 --- a/misc/config.py +++ /dev/null @@ -1,112 +0,0 @@ -#/usr/bin/env python - -""" -Reading/accessing "config.ini" file - -@author: Tron (PASO), BNT (URLA) -@version: 20190118 -""" - -from __future__ import print_function -import sys -import os.path -from distutils.spawn import find_executable - -class Config(object): - """Class to access all required parameter information about a run""" - def __init__(self, infile): - """inits the object and reads a file with state info""" - self._file = infile - self._categories, self._d, self._key_lists = self._read_config_file() - - def _read_config_file(self): - """read key-value pairs from the state file self._file""" - record_dict = {} - key_lists = {} - categories = [] - category = "NONE" - with open(self._file) as cfg: - for line in cfg: - line = line.strip("\n") - line = line.strip() - if not (line.startswith("#") or line == ""): - if line.startswith("[") and line.endswith("]"): - category = line.strip("[").strip("]").strip() - categories.append(category) - record_dict[category] = {} - key_lists[category] = [] - else: - words = line.split("=") - words = [x.strip() for x in words] - if len(words) < 2: - continue - temp_d = record_dict.get(category, {}) - temp_d[words[0]] = words[1] - record_dict[category] = temp_d - temp_l = key_lists.get(category, []) - temp_l.append(words[0]) - key_lists[category] = temp_l - return categories, record_dict, key_lists - - def get(self, category, key): - """get an dictionary entry named "key" from self._d""" - return self._d[category][key] - - def get_path(self): - """return the fullpath string of this config file""" - return self._file - - def get_categories(self): - """return a list of catgories""" - return self._categories - - def get_keys(self, category): - """return a list of keys belonging to a defined category""" - return self._key_lists[category] - - def run_self_test(self): - """Test if set commands work and provided file/dir path exist""" - print("Starting self test of the config file...") - count_errors = 0 - for category in self.get_categories(): - # ['general', 'resources', 'commands', 'references', 'indices', 'otherFiles', 'easyfuse_helper', 'liftover'] - if category == "general": - print("Checking general settings...") - if len(self.get(category, "tools")) == 0: - print("Error 99: No tools selected for running!") - count_errors += 1 - elif "Fetchdata" in self.get(category, "tools") and len(self.get(category, "fd_tools")) == 0: - print("Error 99: No tools selected for fetchdata run!") - count_errors += 1 - elif category == "resources": - for ressource in self.get_keys(category): - try: - mem, cpu = self.get(category, ressource).split(",") - int(mem) - int(cpu) - except ValueError: - print("Error 99: Incorrect ressource definition for {}. 
Please provide comma separated integers for CPU and MEM limits!".format(ressource)) - count_errors += 1 - elif category == "commands": - print("Checking provided commands...") - for command in self.get_keys(category): - try: - print("Found \"{0}\" at \"{1}\".".format(command, os.path.realpath(find_executable(self.get(category, command))))) - except: - print("Error 99: The command assigned to \"{}\" could not be found!".format(command)) - count_errors += 1 - else: - print("Checking provided path in \"{}\"".format(category)) - for path in self.get_keys(category): - if "bowtie" in path: - if not os.path.isfile("{}.rev.2.ebwt".format(self.get(category, path))): - print("Error 99: Bowtie indices not found. Please make sure that {} is the correct prefix for the index!".format(self.get(category, path))) - count_errors += 1 - elif not os.path.isfile(self.get(category, path)) and not os.path.isdir(self.get(category, path)): - print("Error 99: File or directory \"{}\" does not exist. Please check definition of {}!".format(self.get(category, path), path)) - count_errors += 1 - if count_errors > 0: - print("Error 99: Identified {} problems with the config file => program execution aborted!".format(count_errors)) - sys.exit(99) - else: - print("All tests finished successfully!") diff --git a/misc/io_methods.py b/misc/io_methods.py index 7802678..2614352 100755 --- a/misc/io_methods.py +++ b/misc/io_methods.py @@ -10,7 +10,6 @@ @version: 20181126 """ -from __future__ import print_function import os import sys import stat diff --git a/misc/liftover.py b/misc/liftover.py index f1e89eb..499c077 100755 --- a/misc/liftover.py +++ b/misc/liftover.py @@ -4,27 +4,28 @@ Changes the coordinates from one genome annotation to another (liftover) with crossmap Coordinates are changed in DetectedFusions.csv; all intermediate files are preserved. -@author: BNT (URLA) +@author: TRON (PASO), BNT (URLA) @version: 20190131 """ -from __future__ import print_function from argparse import ArgumentParser import sys import os.path import pandas as pd -from shutil import copy2 +from shutil import copyfile import queue as Queueing -from config import Config from logger import Logger +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +import config as cfg + class FusionLiftover(object): """Select alignments belonging to putative fusions from an s/bam file""" - def __init__(self, in_fus_detect, config, logger): + def __init__(self, in_fus_detect, logger): self.in_fus_detect = in_fus_detect - self.cfg = config self.logger = Logger(logger) - copy2(in_fus_detect, "{}.bak".format(in_fus_detect)) + copyfile(in_fus_detect, "{}.bak".format(in_fus_detect)) self.chr_list = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", @@ -35,12 +36,12 @@ def __init__(self, in_fus_detect, config, logger): def liftcoords(self): """Parse ensembl transcriptome fasta file and modify header""" - crossmap_chain = self.cfg.get('liftover', 'crossmap_chain') + crossmap_chain = cfg.liftover["crossmap_chain"] # check whether annotations and references are set, existing and matching ref_genome_org = os.path.basename(crossmap_chain).replace(".", "_").split("_")[0] ref_genome_dest = os.path.basename(crossmap_chain).replace(".", "_").split("_")[2] - if not ref_genome_org.lower() == self.cfg.get('general', 'ref_genome_build').lower(): + if not ref_genome_org.lower() == cfg.ref_genome_build.lower(): self.logger.error("Error 99: Mismatch between set genome version and chain file. 
Please verify that the correct chain file was supplied!") print("Error 99: Mismatch between set genome version and chain file. Please verify that the correct chain file was supplied!") sys.exit(99) @@ -61,8 +62,8 @@ def liftcoords(self): fusout.write("{0}\t{1}\t{2}\t{3}\t2\t{4}\n".format(chrom, pos, (int(pos) + 1), strand, in_fus_detect_pddf.loc[i, "FGID"])) # run crossmap to perform liftover - cmd_crossmap = "{0} bed {1} {2} {3}".format(self.cfg.get('commands', 'crossmap_cmd'), crossmap_chain, tmp_org_bed, tmp_dest_bed) - module_file = self.cfg.get('commands', 'module_file') + cmd_crossmap = "{0} bed {1} {2} {3}".format(cfg.commands["crossmap"], crossmap_chain, tmp_org_bed, tmp_dest_bed) + module_file = os.path.join(cfg.module_dir, "build_env.sh") Queueing.submit("", cmd_crossmap.split(" "), "", "", "", "", "", "", "", "", module_file, "none") # check whether some coords were unmapped and print which those are (i.e. which fusion will be lost) if os.stat(tmp_dest_bed_unmap).st_size == 0: @@ -117,11 +118,10 @@ def main(): """Parse command line arguments and start script""" parser = ArgumentParser(description="Generate mapping stats for fusion detection") parser.add_argument('-i', '--input', dest='input', help='Detected fusions file', required=True) - parser.add_argument('-c', '--config', dest='config', help='Main config file', required=True) parser.add_argument('-l', '--logger', dest='logger', help='Logging of processing steps', default="") args = parser.parse_args() - flo = FusionLiftover(args.input, Config(args.config), args.logger) + flo = FusionLiftover(args.input, args.logger) flo.liftcoords() if __name__ == '__main__': diff --git a/misc/qc_parser.py b/misc/qc_parser.py index 426c091..12186f3 100755 --- a/misc/qc_parser.py +++ b/misc/qc_parser.py @@ -7,7 +7,7 @@ class Parser(object): def __init__(self): pass - def parse_total_sequences(self,infile): + def parse_total_sequences(self, infile): total_sequences = 0 counter = 0 with open(infile) as inf: @@ -18,30 +18,7 @@ def parse_total_sequences(self,infile): return total_sequences return total_sequences - def parse_overrepresented(self,infile,outfile): - seq_map = {} - with open(infile) as inf: - check = False - for line in inf: - if check and line.startswith(">>END_MODULE"): - check = False - if check and not line.startswith(">>END_MODULE"): -# print line - elements = line.rstrip().split("\t") - sequence = elements[0] - occurences = elements[1] - perc = elements[2] - if sequence != "#Sequence": - seq_map[sequence] = "" - if line.startswith(">>Overrepresented sequences"): - check = True - outf = open(outfile,'a') - for key in seq_map: - outf.write(key + "\n") - outf.close() - return seq_map - - def parse_quality(self,infile,outfile): + def parse_quality(self, infile, outfile): seq_map = {} read_length = 0 with open(infile) as inf: @@ -62,14 +39,21 @@ def parse_quality(self,infile,outfile): percentile_90 = float(elements[6]) if read_length < int(base): read_length = int(base) - seq_map[int(base)] = [mean,median,lower_quartile,upper_quartile,percentile_10,percentile_90] + seq_map[int(base)] = [ + mean, + median, + lower_quartile, + upper_quartile, + percentile_10, + percentile_90 + ] if line.startswith(">>Per base sequence quality"): check = True counter = 0 stop = False badass_bases = [] - for key in sorted(seq_map.keys(),reverse=True): + for key in sorted(seq_map.keys(), reverse=True): if seq_map[key][2] > 20: stop = True if not stop: @@ -78,51 +62,28 @@ if seq_map[key][2] <
20: badass_bases.append(key) - remaining = read_length-counter + remaining = read_length - counter actual = 0 -# print "Suggested Trim cutoff: " + str(counter) + " bp" -# print "Remaining Read length: " + str(remaining) + " bp" - if (read_length-counter) < (0.75*read_length): + if (read_length - counter) < (0.75*read_length): actual = int(round(0.25*read_length)) -# print "Actual Trim length: " + str(actual) else: actual = counter -# print "Actual Trim length: " + str(actual) -# print "Badass bases: " + str(badass_bases) total_sequences = self.parse_total_sequences(infile) - outf = open(outfile,'a') - - outf.write(infile + "," + str(read_length) + "," + str(counter) + "," + str(remaining) + "," + str(actual) + "," + str(len(badass_bases)) + "," + str(total_sequences) + "\n") -# print infile + "," + str(counter) + "," + str(remaining) + "," + str(actual) + "," + str(len(badass_bases)) - outf.close() + with open(outfile, 'a') as outf: + outf.write("{},{},{},{},{},{},{}\n".format(infile, read_length, counter, remaining, actual, len(badass_bases), total_sequences)) return seq_map - - def write_fasta(self,outfile,seq_map): - outf = open(outfile,"w") - for i,key in enumerate(seq_map,start=1): - header = ">seq" + str(i) - sequence = key - outf.write(header+"\n") - outf.write(sequence+"\n") def main(): - parser = ArgumentParser(description='handles garbage in racoon-style') - parser.add_argument('-i','--input',dest='input',nargs='+',help='Specify input file(s) (fastqc_data.txt in fastqc folder(s))') - parser.add_argument('-o','--overrepresented',dest='overrepresented',help='Specify output overrepresented sequences file') - parser.add_argument('-q','--qc-table',dest='qc_table',help='Specify output QC table') + parser = ArgumentParser(description='Parses FastQC quality values and outputs suggested trimming ratios') + parser.add_argument('-i', '--input', dest='input', nargs='+', help='Specify input file(s) (fastqc_data.txt in fastqc folder(s))') + parser.add_argument('-o', '--qc-table', dest='qc_table', help='Specify output QC table') args = parser.parse_args() -# infile = sys.argv[1] -# outfile = sys.argv[2] - open(args.qc_table,"w").write("filename,read_length,suggested_trim_length,remaining_read_length,actual_trim_length,badass_bases,total_sequences\n") + + open(args.qc_table, "w").write("filename,read_length,suggested_trim_length,remaining_read_length,actual_trim_length,badass_bases,total_sequences\n") p = Parser() for file in args.input: - seq_map = p.parse_overrepresented(file,args.overrepresented) -# p.write_fasta(outfile,seq_map) -# total_sequences = p.parse_total_sequences(file) -# print total_sequences - qmap = p.parse_quality(file,args.qc_table) -# print qmap + qmap = p.parse_quality(file, args.qc_table) if __name__ == '__main__': diff --git a/misc/queue.py b/misc/queueing.py similarity index 95% rename from misc/queue.py rename to misc/queueing.py index 52e1388..9d35838 100755 --- a/misc/queue.py +++ b/misc/queueing.py @@ -10,7 +10,6 @@ @version: 20180118 """ -from __future__ import print_function import os import sys import subprocess @@ -149,18 +148,17 @@ def _submit_slurm(job_name, cmd, cores, mem_usage, output_results_folder, depend "#SBATCH --account {}\n".format(userid), "#SBATCH --kill-on-invalid-dep=yes\n", "#SBATCH --cpus-per-task={}\n".format(cores), - "#SBATCH --mem={}\n".format(int(mem_usage)*1000), + "#SBATCH --mem={}G\n".format(mem_usage), "#SBATCH --time={}\n".format(timelimit), depend, "#SBATCH -D {}\n".format(output_results_folder), "#SBATCH 
--error={}\n".format(error_file), "#SBATCH --output={}\n".format(output_file), -# "#SBATCH --requeue\n", mail_type, mail_user, "set -eo pipefail -o nounset\n", - ". {}\n".format(module_file), - "srun {}\n".format(cmd) + ". {}\n".format(module_file) if module_file else "\n", + "srun echo \"$(date) Slurm job started.\" > {0}.runtime && {1} && echo \"$(date) Slurm job finished.\" >> {0}.runtime\n".format(job_name, cmd) ]) # and run it try: @@ -190,7 +188,7 @@ def main(): mem_usage = args.memory output_results_folder = args.output dependencies = "" - submit(job_name, cmd, cores, mem_usage, output_results_folder, dependencies, args.partitions, args.userid, args.timelimit) + submit(job_name, cmd, cores, mem_usage, output_results_folder, dependencies, args.partitions, args.userid, args.timelimit, "", "") if __name__ == '__main__': main() diff --git a/misc/version.py b/misc/version.py index fb32fcf..290e754 100755 --- a/misc/version.py +++ b/misc/version.py @@ -3,7 +3,7 @@ """ Just the version -@author: BNT (URLA) +@author: TRON (PASO), BNT (URLA) @version: 20181126 """ -__version__ = "easyfuse v1.0" +__version__ = "easyfuse v1.3.1" diff --git a/misc/versioncontrol.py b/misc/versioncontrol.py index 6e0f546..232cc8a 100755 --- a/misc/versioncontrol.py +++ b/misc/versioncontrol.py @@ -28,10 +28,12 @@ def load_py_dict(self, with_pip_not_conda): output = "" try: if with_pip_not_conda: - output = subprocess.Popen(["pip", "freeze"], stdout = subprocess.PIPE, stderr = subprocess.PIPE) + output = subprocess.Popen(["pip", "freeze"], stdout = subprocess.PIPE, stderr = subprocess.PIPE, encoding = "utf-8") else: - output = subprocess.Popen(["conda", "list"], stdout = subprocess.PIPE, stderr = subprocess.PIPE) + output = subprocess.Popen(["conda", "list"], stdout = subprocess.PIPE, stderr = subprocess.PIPE, encoding = "utf-8") (stdoutdata, stderrdata) = output.communicate() +# print(stderrdata) +# print(stdoutdata) if with_pip_not_conda and stderrdata and not stdoutdata: print(stderrdata) @@ -47,9 +49,13 @@ def load_py_dict(self, with_pip_not_conda): for py_line in stdoutdata.split("\n"): if py_line and not py_line.startswith("#"): if with_pip_not_conda: - self.py_dict[py_line.split("==")[0]] = py_line.split("==")[1] + keyval = py_line.split("==") + if len(keyval) == 2: + self.py_dict[keyval[0]] = keyval[1] else: - self.py_dict[py_line.split()[0]] = py_line.split()[1] + keyval = py_line.split() + if len(keyval) == 2: + self.py_dict[keyval[0]] = keyval[1] def get_and_print_tool_versions(self): """For each tool in the dep list, try to identify the installed version and compare against the tested""" @@ -82,7 +88,7 @@ def get_version_with_version_string(self, cmd, grp_str): """Return version string from program call on shell""" output = "" try: - output = subprocess.Popen(cmd.split(" "), stdout = subprocess.PIPE, stderr = subprocess.PIPE) + output = subprocess.Popen(cmd.split(" "), stdout = subprocess.PIPE, stderr = subprocess.PIPE, encoding = "utf-8") (stdoutdata, stderrdata) = output.communicate() if stdoutdata and not stderrdata: output = stdoutdata diff --git a/processing.py b/processing.py index 7d5f109..810bfc3 100755 --- a/processing.py +++ b/processing.py @@ -10,63 +10,56 @@ @author: Tron (PASO), BNT (URLA) @version: 20190118 """ -from __future__ import print_function + import os import os.path import sys import time import argparse -from shutil import copy2 -import misc.queue as Queueing -from misc.config import Config +from shutil import copy +import misc.queueing as Queueing +#from misc.config import 
Config from misc.samples import SamplesDB from misc.logger import Logger import misc.io_methods as IOMethods from misc.versioncontrol import VersCont -import misc.version as eav +#import misc.version as eav +import config as cfg # pylint: disable=line-too-long # yes they are partially, but I do not consider this to be relevant here class Processing(object): """Run, monitor and schedule fastq processing for fusion gene prediction""" - def __init__(self, cmd, config, input_path, working_dir, partitions, userid, overwrite): + def __init__(self, cmd, input_path, working_dir): """Parameter initiation and work folder creation. Start of progress logging.""" self.working_dir = working_dir.rstrip("/") self.logger = Logger(os.path.join(self.working_dir, "easyfuse_processing.log")) IOMethods.create_folder(self.working_dir, self.logger) - copy2(config, working_dir) - self.cfg = Config(os.path.join(self.working_dir, IOMethods.path_leaf(config))) + copy(os.path.join(cfg.module_dir, "config.py"), working_dir) self.logger.info("Starting easyfuse: CMD - {}".format(cmd)) self.input_path = input_path self.samples = SamplesDB(os.path.join(self.working_dir, "samples.db")) - self.partitions = partitions - self.userid = userid - self.overwrite = overwrite - self.easyfuse_path = os.path.dirname(os.path.realpath(__file__)) +# self.overwrite = overwrite # The run method simply greps and organises fastq input files. # Fastq pairs (single end input is currently not supported) are then sent to "execute_pipeline" def run(self, tool_num_cutoff): """General parameter setting, identification of fastq files and initiation of processing""" + self.logger.info("Pipeline Version: {}".format(cfg.__version__)) # Checking dependencies - VersCont(self.cfg.get('easyfuse_helper', 'dependencies')).get_and_print_tool_versions() - self.cfg.run_self_test() - is_stranded = False # urla: I leave it in although it is currently not required - however, future predictions may improve from stranded libs - is_rna_seq = True # urla: can this ever become false?! + #VersCont(os.path.join(cfg.module_dir, "dependency_versions.txt")).get_and_print_tool_versions() + #self.cfg.run_self_test() # urla: organism is currently not used for anything, however, this might change; is mouse processing relevant at some point?
- ref_genome = self.cfg.get('general', 'ref_genome_build') - ref_trans = self.cfg.get('general', 'ref_trans_version') -# if ref_genome in ['hg19', 'hg38']: -# organism = "human" -# else: -# organism = "mouse" - self.logger.info("Stranded: {0}, RNA-Seq: {1}, Reference Genome: {2}, Reference Transcriptome: {3}".format(is_stranded, is_rna_seq, ref_genome, ref_trans)) - if self.overwrite: - self.logger.info("#############################################################################") - self.logger.info("") - self.logger.info("Overwrite flag is set => all previously existing results may be overwritten!") - self.logger.info("") - self.logger.info("#############################################################################") + ref_genome = cfg.ref_genome_build + ref_trans = cfg.ref_trans_version + + self.logger.info("Reference Genome: {0}, Reference Transcriptome: {1}".format(ref_genome, ref_trans)) + # if self.overwrite: + # self.logger.info("#############################################################################") + # self.logger.info("") + # self.logger.info("Overwrite flag is set => all previously existing results may be overwritten!") + # self.logger.info("") + # self.logger.info("#############################################################################") sample_list = [] @@ -81,51 +74,57 @@ def run(self, tool_num_cutoff): self.execute_pipeline(left[i], right[i], sample_id[i], ref_genome, ref_trans, tool_num_cutoff) # summarize all data if selected - if "Summary" in self.cfg.get('general', 'tools'): + if "Summary" in cfg.tools: #dependency = [Queueing.get_jobs_by_name("Fetchdata-{}".format(sample)) for sample in sample_list] # urla - note: would be happy to get the dependencies with a stacked LC, but is atm too complicated for me ^^ dependency = [] for sample in sample_list: dependency.extend(Queueing.get_jobs_by_name("Fetchdata-{}".format(sample))) modelling_string = "" - if self.cfg.get("otherFiles", "easyfuse_model"): + if cfg.other_files["easyfuse_model"]: modelling_string = " --model_predictions" - cmd_summarize = "python {0} --input {1} --config {2}{3}".format(os.path.join(self.easyfuse_path, "summarize_data.py"), self.working_dir, self.cfg.get_path(), modelling_string) + cmd_summarize = "python {0} --input {1}{2}".format(os.path.join(cfg.module_dir, "summarize_data.py"), self.working_dir, modelling_string) self.logger.debug("Submitting slurm job: CMD - {0}; PATH - {1}; DEPS - {2}".format(cmd_summarize, self.working_dir, dependency)) - cpu, mem = self.cfg.get("resources", "summary").split(",") - self.submit_job("-".join(["Summary", str(int(round(time.time())))]), cmd_summarize, cpu, mem, self.working_dir, dependency, self.cfg.get('general', 'receiver')) + cpu = cfg.resources["summary"]["cpu"] + mem = cfg.resources["summary"]["mem"] + self.submit_job("-".join([cfg.pipeline_name, "Summary", str(int(round(time.time())))]), cmd_summarize, cpu, mem, self.working_dir, dependency, cfg.receiver) # Per sample, define input parameters and execution commands, create a folder tree and submit runs to slurm def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff): """Create sample specific subfolder structure and run tools on fastq files""" self.samples.add_sample(sample_id, "NA", fq1, fq2) - # Genome/Gene references to use - genome_sizes_path = self.cfg.get("references", "{0}_genome_sizes_{1}".format(ref_trans, ref_genome)) - genome_chrs_path = self.cfg.get("references", "{0}_genome_fastadir_{1}".format(ref_trans, ref_genome)) - genes_fasta_path =
self.cfg.get("references", "{0}_genes_fasta_{1}".format(ref_trans, ref_genome)) - genes_gtf_path = self.cfg.get("references", "{0}_genes_gtf_{1}".format(ref_trans, ref_genome)) + refs = cfg.references + + # Genome/Gene references to use + genome_sizes_path = refs["genome_sizes"] + genome_chrs_path = refs["genome_fastadir"] + genes_fasta_path = refs["genes_fasta"] + genes_gtf_path = refs["genes_gtf"] # Path' to specific indices - bowtie_index_path = self.cfg.get("indices", "{0}_bowtie_{1}".format(ref_trans, ref_genome)) - star_index_path = self.cfg.get("indices", "{0}_star_{1}_sjdb{2}".format(ref_trans, ref_genome, 49)) - kallisto_index_path = self.cfg.get("indices", "{0}_kallisto_{1}".format(ref_trans, ref_genome)) - pizzly_cache_path = "{}.pizzlyCache.txt".format(genes_gtf_path) - starfusion_index_path = self.cfg.get("indices", "{0}_starfusion_{1}".format(ref_trans, ref_genome)) - infusion_cfg_path = self.cfg.get("otherFiles", "{0}_infusion_cfg_{1}".format(ref_trans, ref_genome)) - starchip_param_path = self.cfg.get("otherFiles", "{0}_starchip_param_{1}".format(ref_trans, ref_genome)) + indices = cfg.indices + other_files = cfg.other_files + + bowtie_index_path = indices["bowtie"] + star_index_path = indices["star"] +# kallisto_index_path = indices["kallisto"] +# pizzly_cache_path = "{}.pizzlyCache.txt".format(genes_gtf_path) + starfusion_index_path = indices["starfusion"] + infusion_cfg_path = other_files["infusion_cfg"] +# starchip_param_path = other_files["starchip_param"] # Output results folder creation - currently included: # 1) Gene/Isoform expression: kallisto, star # 2) Fusion prediction: mapsplice, pizzly, fusioncatcher, star-fusion, starchip, infusion - output_results_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id), "scratch") + output_results_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id)) qc_path = os.path.join(output_results_path, "qc") skewer_path = os.path.join(qc_path, "skewer") qc_table_path = os.path.join(qc_path, "qc_table.txt") overrepresented_path = os.path.join(qc_path, "overrepresented.fa") filtered_reads_path = os.path.join(output_results_path, "filtered_reads") expression_path = os.path.join(output_results_path, "expression") - kallisto_path = os.path.join(expression_path, "kallisto") +# kallisto_path = os.path.join(expression_path, "kallisto") star_path = os.path.join(expression_path, "star") fusion_path = os.path.join(output_results_path, "fusion") mapsplice_path = os.path.join(fusion_path, "mapsplice") @@ -138,9 +137,21 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_ fetchdata_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id), "fetchdata") for folder in [ - output_results_path, qc_path, skewer_path, filtered_reads_path, - expression_path, kallisto_path, star_path, - fusion_path, mapsplice_path, pizzly_path, fusioncatcher_path, starfusion_path, starchip_path, infusion_path, soapfuse_path, + output_results_path, + qc_path, + skewer_path, + filtered_reads_path, + expression_path, +# kallisto_path, + star_path, + fusion_path, + mapsplice_path, +# pizzly_path, + fusioncatcher_path, + starfusion_path, +# starchip_path, + infusion_path, + soapfuse_path, fetchdata_path ]: IOMethods.create_folder(folder, self.logger) @@ -148,18 +159,15 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_ # get a list of tools from the samples.db file that have been run previously on this sample state_tools = self.samples.get_tool_list_from_state(sample_id) # get 
a list of tools from the config file which shall be run on this sample - tools = [] - if "," in self.cfg.get('general', 'tools'): - tools.extend(self.cfg.get('general', 'tools').split(",")) - else: - tools.extend([self.cfg.get('general', 'tools')]) - + tools = cfg.tools + cmds = cfg.commands + module_dir = cfg.module_dir # Define cmd strings for each program # urla: mapsplice requires gunzip'd read files and process substitutions don't seem to work in slurm scripts... # process substitutions somehow do not work from this script either - c/p'ing the command line to the terminal, however, works w/o issues?! - cmd_fastqc = "{} --nogroup --extract -t 6 -o {} {} {}".format(self.cfg.get('commands', 'fastqc_cmd'), qc_path, fq1, fq2) - cmd_qc_parser = "{} -i {}/*/fastqc_data.txt -q {} -o {}".format(self.cfg.get('commands', 'qc_parser_cmd'), qc_path, qc_table_path, overrepresented_path) - cmd_skewer = "{} -q {} -m {} -i {} {} -o {} -b {}".format(self.cfg.get('commands', 'skewer_wrapper_cmd'), qc_table_path, 0.75, fq1, fq2, skewer_path, self.cfg.get('commands', 'skewer_cmd')) + cmd_fastqc = "{} --nogroup --extract -t 6 -o {} {} {}".format(cmds["fastqc"], qc_path, fq1, fq2) + cmd_qc_parser = "{} -i {}/*/fastqc_data.txt -o {}".format(os.path.join(module_dir, "misc", "qc_parser.py"), qc_path, qc_table_path) + cmd_skewer = "{} -q {} -i {} {} -o {}".format(os.path.join(module_dir, "tool_wrapper", "skewer_wrapper.py"), qc_table_path, fq1, fq2, skewer_path) fq0 = "" if "QC" in tools: @@ -170,8 +178,8 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_ qc_table_path = "None" # (0) Readfilter - cmd_star_filter = "{0} --genomeDir {1} --outFileNamePrefix {2}_ --readFilesCommand zcat --readFilesIn {3} {4} --outFilterMultimapNmax 100 --outSAMmultNmax 1 --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {5} --alignIntronMax {5} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM Unsorted --chimOutType Junctions WithinBAM --outSAMunmapped Within KeepPairs --runThreadN waiting_for_cpu_number".format(self.cfg.get('commands', 'star_cmd'), star_index_path, os.path.join(filtered_reads_path, sample_id), fq1, fq2, self.cfg.get('general', 'max_dist_proper_pair')) - cmd_read_filter = "{0} --input {1}_Aligned.out.bam --output {1}_Aligned.out.filtered.bam".format(os.path.join(self.easyfuse_path, "fusionreadfilter.py"), os.path.join(filtered_reads_path, sample_id)) + cmd_star_filter = "{0} --genomeDir {1} --outFileNamePrefix {2}_ --readFilesCommand zcat --readFilesIn {3} {4} --outFilterMultimapNmax 100 --outSAMmultNmax 1 --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {5} --alignIntronMax {5} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM Unsorted --chimOutType Junctions WithinBAM --outSAMunmapped Within KeepPairs --runThreadN waiting_for_cpu_number".format(cmds["star"], star_index_path, os.path.join(filtered_reads_path, sample_id), fq1, fq2, cfg.max_dist_proper_pair) + cmd_read_filter = "{0} --input {1}_Aligned.out.bam --output {1}_Aligned.out.filtered.bam".format(os.path.join(module_dir, "fusionreadfilter.py"), os.path.join(filtered_reads_path, sample_id)) # re-define fastq's if filtering is on (default) fq0 = "" if "Readfilter" in tools: @@ -179,84 +187,84 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_ fq1 =
os.path.join(filtered_reads_path, os.path.basename(fq1).replace(".fastq.gz", "_filtered.fastq.gz")) fq2 = os.path.join(filtered_reads_path, os.path.basename(fq2).replace(".fastq.gz", "_filtered.fastq.gz")) - cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads waiting_for_cpu_number {4}_Aligned.out.filtered.bam".format(self.cfg.get('commands', 'samtools_cmd'), fq0, fq1, fq2, os.path.join(filtered_reads_path, sample_id)) + cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads waiting_for_cpu_number {4}_Aligned.out.filtered.bam".format(cmds["samtools"], fq0, fq1, fq2, os.path.join(filtered_reads_path, sample_id)) # (1) Kallisto expression quantification (required for pizzly) - cmd_kallisto = "{0} quant --threads waiting_for_cpu_number --genomebam --gtf {1} --chromosomes {2} --index {3} --fusion --output-dir waiting_for_output_string {4} {5}".format(self.cfg.get('commands', 'kallisto_cmd'), genes_gtf_path, genome_sizes_path, kallisto_index_path, fq1, fq2) +# cmd_kallisto = "{0} quant --threads waiting_for_cpu_number --genomebam --gtf {1} --chromosomes {2} --index {3} --fusion --output-dir waiting_for_output_string {4} {5}".format(cmds["kallisto"], genes_gtf_path, genome_sizes_path, kallisto_index_path, fq1, fq2) # (2) Star expression quantification (required for starfusion and starchip) - cmd_star = "{0} --genomeDir {1} --outFileNamePrefix waiting_for_output_string --runThreadN waiting_for_cpu_number --runMode alignReads --readFilesIn {2} {3} --readFilesCommand zcat --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {4} --alignIntronMax {4} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM SortedByCoordinate --chimOutType Junctions SeparateSAMold --chimOutJunctionFormat 1".format(self.cfg.get('commands', 'star_cmd'), star_index_path, fq1, fq2, self.cfg.get('general', 'max_dist_proper_pair')) + cmd_star = "{0} --genomeDir {1} --outFileNamePrefix waiting_for_output_string --runThreadN waiting_for_cpu_number --runMode alignReads --readFilesIn {2} {3} --readFilesCommand zcat --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {4} --alignIntronMax {4} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM SortedByCoordinate --chimOutType Junctions SeparateSAMold --chimOutJunctionFormat 1".format(cmds["star"], star_index_path, fq1, fq2, cfg.max_dist_proper_pair) # (3) Mapsplice # urla: the "keep" parameter requires gunzip >= 1.6 cmd_extr_fastq1 = "gunzip {0} --keep".format(fq1) cmd_extr_fastq2 = "gunzip {0} --keep".format(fq2) # Added python interpreter to circumvent external hardcoded shell script - cmd_mapsplice = "python {0} --chromosome-dir {1} -x {2} -1 {3} -2 {4} --threads waiting_for_cpu_number --output {5} --qual-scale phred33 --bam --seglen 20 --min-map-len 40 --gene-gtf {6} --fusion".format(self.cfg.get('commands', 'mapsplice_cmd'), genome_chrs_path, bowtie_index_path, fq1[:-3], fq2[:-3], mapsplice_path, genes_gtf_path) + cmd_mapsplice = "python {0} --chromosome-dir {1} -x {2} -1 {3} -2 {4} --threads waiting_for_cpu_number --output {5} --qual-scale phred33 --bam --seglen 20 --min-map-len 40 --gene-gtf {6} --fusion".format(cmds["mapsplice"], genome_chrs_path, bowtie_index_path, fq1[:-3], fq2[:-3], mapsplice_path, genes_gtf_path) # (4) Fusioncatcher - cmd_fusioncatcher = "{0} --input {1} --output {2} -p
waiting_for_cpu_number".format(self.cfg.get('commands', 'fusioncatcher_cmd'), ",".join([fq1, fq2]), fusioncatcher_path) + cmd_fusioncatcher = "{0} --input {1} --output {2} -p waiting_for_cpu_number".format(cmds["fusioncatcher"], ",".join([fq1, fq2]), fusioncatcher_path) # star-fusion and star-chip can be run upon a previous star run (this MUST NOT be the star_filter run, but the star_expression run) # (5) - cmd_starfusion = "{0} --chimeric_junction {1} --genome_lib_dir {2} --CPU waiting_for_cpu_number --output_dir {3}".format(self.cfg.get('commands', 'starfusion_cmd'), "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starfusion_index_path, starfusion_path) + cmd_starfusion = "{0} --chimeric_junction {1} --genome_lib_dir {2} --CPU waiting_for_cpu_number --output_dir {3}".format(cmds["starfusion"], "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starfusion_index_path, starfusion_path) # (7) - cmd_starchip = "{0} {1} {2} {3}".format(self.cfg.get("commands", "starchip_cmd"), os.path.join(starchip_path, "starchip"), "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starchip_param_path) +# cmd_starchip = "{0} {1} {2} {3}".format(cmds["starchip"], os.path.join(starchip_path, "starchip"), "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starchip_param_path) # (6) Pizzly - cmd_pizzly = "{0} -k 29 --gtf {1} --cache {2} --fasta {3} --output {4} {5}".format(self.cfg.get('commands', 'pizzly_cmd'), genes_gtf_path, pizzly_cache_path, genes_fasta_path, os.path.join(pizzly_path, "kallizzy"), os.path.join(kallisto_path, "fusion.txt")) - cmd_pizzly2 = "{0} {1} {2}".format(self.cfg.get('commands', 'pizzly_cmd2'), "{}.json".format(os.path.join(pizzly_path, "kallizzy")), "{}.json.txt".format(os.path.join(pizzly_path, "kallizzy"))) +# cmd_pizzly = "{0} -k 29 --gtf {1} --cache {2} --fasta {3} --output {4} {5}".format(cmds["pizzly"], genes_gtf_path, pizzly_cache_path, genes_fasta_path, os.path.join(pizzly_path, "kallizzy"), os.path.join(kallisto_path, "fusion.txt")) +# cmd_pizzly2 = "{0} {1} {2}".format(cmds["pizzly_cmd2"], "{}.json".format(os.path.join(pizzly_path, "kallizzy")), "{}.json.txt".format(os.path.join(pizzly_path, "kallizzy"))) # (8) Infusion - cmd_infusion = "{0} -1 {1} -2 {2} --skip-finished --min-unique-alignment-rate 0 --min-unique-split-reads 0 --allow-non-coding --out-dir {3} {4}".format(self.cfg.get("commands", "infusion_cmd"), fq1, fq2, infusion_path, infusion_cfg_path) + cmd_infusion = "{0} -1 {1} -2 {2} --skip-finished --min-unique-alignment-rate 0 --min-unique-split-reads 0 --allow-non-coding --out-dir {3} {4}".format(cmds["infusion"], fq1, fq2, infusion_path, infusion_cfg_path) # (x) Soapfuse - cmd_soapfuse = "{0} -c {1} -q {2} -i {3} -o {4} -g {5}".format(self.cfg.get('commands', 'soapfuse_wrapper_cmd'), self.cfg.get_path(), qc_table_path, " ".join([fq1, fq2]), soapfuse_path, ref_genome) + cmd_soapfuse = "{0} -q {1} -i {2} -o {3}".format(os.path.join(module_dir, "tool_wrapper", "soapfuse_wrapper.py"), qc_table_path, " ".join([fq1, fq2]), soapfuse_path) # (9) Data collection - cmd_fetchdata = "{0} -i {1} -o {2} -s {3} -c {4} --fq1 {5} --fq2 {6} --fusion_support {7}".format(os.path.join(self.easyfuse_path, "fetchdata.py"), output_results_path, fetchdata_path, sample_id, self.cfg.get_path(), fq1, fq2, tool_num_cutoff) + cmd_fetchdata = "{0} -i {1} -o {2} -s {3} --fq1 {4} --fq2 {5} --fusion_support {6}".format(os.path.join(module_dir, "fetchdata.py"), output_results_path, fetchdata_path, sample_id, fq1, 
fq2, tool_num_cutoff) # (10) De novo assembly of fusion transcripts # urla: This is currently still under active development and has not been tested thoroughly - cmd_denovoassembly = "{0} -i waiting_for_gene_list_input -c {1} -b {2}_Aligned.out.bam -g {3} -t {4} -o waiting_for_assembly_out_dir".format(os.path.join(self.easyfuse_path, "denovoassembly.py"), self.cfg.get_path(), os.path.join(filtered_reads_path, sample_id), ref_genome, ref_trans) +# cmd_denovoassembly = "{0} -i waiting_for_gene_list_input -b {1}_Aligned.out.bam -g {2} -t {3} -o waiting_for_assembly_out_dir".format(os.path.join(module_dir, "denovoassembly.py"), os.path.join(filtered_reads_path, sample_id), ref_genome, ref_trans) # (X) Sample monitoring - cmd_samples = "{0} --db_path={1} --sample_id={2} --action=append_state --tool=".format(os.path.join(self.easyfuse_path, "misc", "samples.py"), self.samples.db_path, sample_id) + cmd_samples = "{0} --db_path={1} --sample_id={2} --action=append_state --tool=".format(os.path.join(module_dir, "misc", "samples.py"), self.samples.db_path, sample_id) # set final lists of executable tools and path exe_tools = [ "QC", #0 "Readfilter", #1 - "Kallisto", #2 +# "Kallisto", #2 "Star", #3 "Mapsplice", #4 "Fusioncatcher", #5 "Starfusion", #6 - "Pizzly", #7 - "Starchip", #8 +# "Pizzly", #7 +# "Starchip", #8 "Infusion", #9 "Soapfuse", #10 - "Fetchdata", #11 - "Assembly" #12 + "Fetchdata" #11 +# "Assembly" #12 ] exe_cmds = [ " && ".join([cmd_fastqc, cmd_qc_parser, cmd_skewer]), #0 " && ".join([cmd_star_filter, cmd_read_filter, cmd_bam_to_fastq]), #1 - cmd_kallisto, #2 +# cmd_kallisto, #2 cmd_star, #3 " && ".join([cmd_extr_fastq1, cmd_extr_fastq2, cmd_mapsplice]), #4 cmd_fusioncatcher, #5 cmd_starfusion, #6 - " && ".join([cmd_pizzly, cmd_pizzly2]), #7 - cmd_starchip, #8 +# " && ".join([cmd_pizzly, cmd_pizzly2]), #7 +# cmd_starchip, #8 cmd_infusion, #9 cmd_soapfuse, #10 - cmd_fetchdata, #11 - cmd_denovoassembly #12 + cmd_fetchdata #11 +# cmd_denovoassembly #12 ] exe_path = [ qc_path, #0 filtered_reads_path, #1 - kallisto_path, #2 +# kallisto_path, #2 star_path, #3 mapsplice_path, #4 fusioncatcher_path, #5 starfusion_path, #6 - pizzly_path, #7 - starchip_path, #8 +# pizzly_path, #7 +# starchip_path, #8 infusion_path, #9 soapfuse_path, #10 - fetchdata_path, #11 - "" #12 + fetchdata_path #11 +# "" #12 ] # create and submit slurm job if the tool is requested and hasn't been run before @@ -269,11 +277,11 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_ if tool in state_tools: # urla: the primary idea behind this flag is to allow multiple fetchdata executions during processing # nevertheless, re-processing of the same data with a newer version of a tool will also be straightforward (but overwriting previous results, of course) - if self.overwrite: - self.logger.info("Executing {0} although it looks like a previous run finished successfully. Results in {1} may be overwritten".format(tool, exe_path[i])) - else: - self.logger.info("Skipping {0} as it looks like a previous run finished successfully. Results should be in {1}".format(tool, exe_path[i])) - continue +# if self.overwrite: +# self.logger.info("Executing {0} although it looks like a previous run finished successfully. Results in {1} may be overwritten".format(tool, exe_path[i])) +# else: + self.logger.info("Skipping {0} as it looks like a previous run finished successfully. 
Results should be in {1}".format(tool, exe_path[i])) + continue else: if tool == "Readfilter" and "Readfilter" not in tools: self.logger.error( @@ -302,8 +310,9 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_ # prepare slurm jobs: get ressources, create uid, set output path and check dependencies self.logger.debug("Submitting {} run to slurm".format(tool)) - cpu, mem = self.cfg.get("resources", tool.lower()).split(",") - uid = "-".join([tool, sample_id]) + cpu = cfg.resources[tool.lower()]["cpu"] + mem = cfg.resources[tool.lower()]["mem"] + uid = "-".join([cfg.pipeline_name, tool, sample_id]) if tool == "Star": exe_cmds[i] = exe_cmds[i].replace("waiting_for_output_string", "{}_".format(os.path.join(exe_path[i], sample_id))).replace("waiting_for_cpu_number", str(cpu)) else: @@ -334,13 +343,13 @@ def submit_job(self, uid, cmd, cores, mem_usage, output_results_folder, dependen if not already_running: # urla: for compatibility reasons (and to be independent of shell commands), concatenated commands are splitted again, # dependencies within the splitted groups updated and everything submitted sequentially to the queueing system - module_file = self.cfg.get('commands', 'module_file') - que_sys = self.cfg.get('general', 'queueing_system') + module_file = os.path.join(cfg.module_dir, "build_env.sh") + que_sys = cfg.queueing_system for i, cmd_split in enumerate(cmd.split(" && ")): if not que_sys in ["slurm", "pbs"]: cmd_split = cmd_split.split(" ") dependencies.extend(Queueing.get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1))) - Queueing.submit("{0}_CMD{1}".format(uid, i), cmd_split, cores, mem_usage, output_results_folder, dependencies, self.partitions, self.userid, self.cfg.get('general', 'time_limit'), mail, module_file, que_sys) + Queueing.submit("{0}_CMD{1}".format(uid, i), cmd_split, cores, mem_usage, output_results_folder, dependencies, cfg.partition, cfg.user, cfg.time_limit, mail, module_file, que_sys) time.sleep(0.5) else: self.logger.error("A job with this application/sample combination is currently running. 
Skipping {} in order to avoid unintended data loss.".format(uid)) @@ -351,33 +360,27 @@ def main(): # required arguments parser.add_argument('-i', '--input', dest='input', nargs='+', help='Specify full path of the fastq folder to process.', required=True) parser.add_argument('-o', '--output-folder', dest='output_folder', help='Specify full path of the folder to save the results into.', required=True) - parser.add_argument('-c', '--config', dest='config', help='Specify full path of config file.', required=True) # optional arguments - parser.add_argument('-u', '--userid', dest='userid', help='User identifier used for scheduling and timing') - parser.add_argument('-p', '--partitions', dest='partitions', help='Comma-separated list of partitions to use for queueing.', default='allNodes') parser.add_argument('--tool_support', dest='tool_support', help='The number of tools which are required to support a fusion event.', default="1") parser.add_argument('--version', dest='version', help='Get version info') # hidden (not visible in the help display) arguments - parser.add_argument('--overwrite', dest='overwrite', help=argparse.SUPPRESS, default=False, action='store_true') +# parser.add_argument('--overwrite', dest='overwrite', help=argparse.SUPPRESS, default=False, action='store_true') args = parser.parse_args() # if version is requested, print it and exit if args.version: - print(eav.__version__) + print(cfg.__version__) +# print(eav.__version__) sys.exit(0) script_call = "python {} {}".format(os.path.realpath(__file__), " ".join(sys.argv[1:])) # create a local copy of the config file in the working dir folder in order to be able to re-run the script - proc = Processing(script_call, args.config, args.input, args.output_folder, args.partitions, args.userid, args.overwrite) - - - outf = open(os.path.join(args.output_folder, "process.sh"), "w") - outf.write("#!/bin/sh\n\n") - outf.write(script_call) - outf.close() - + proc = Processing(script_call, args.input, args.output_folder) proc.run(args.tool_support) + with open(os.path.join(args.output_folder, "process.sh"), "w") as outf: + outf.write("#!/bin/sh\n\n{}".format(script_call)) + if __name__ == '__main__': main() diff --git a/ref_data/gtf2tsl.py b/ref_data/gtf2tsl.py index e0cac49..8f0d66e 100755 --- a/ref_data/gtf2tsl.py +++ b/ref_data/gtf2tsl.py @@ -44,8 +44,8 @@ def get_data(self): print("{}_{}_{}_{}".format(transcript_id, trans_biotype, gene_biotype, tsl)) # check again, that what we have is what we expect - if not transcript_id[0:4] == "ENST": - print("This is not valid enseml transcript id: {}. Is this a valid ensembl gtf line: {}?".format(transcript_id, line)) + if not (transcript_id[0:3] == "ENS" or transcript_id[0:4] == "MGP_"): + print("This does not look like a valid ensembl id: {}.
Is this a valid ensembl gtf line: {}?".format(transcript_id, line)) sys.exit(99) if not transcript_id in tsl_dict: tsl_dict[transcript_id] = [trans_biotype, gene_biotype, tsl] diff --git a/summarize_data.py b/summarize_data.py index d2b3e80..c625226 100755 --- a/summarize_data.py +++ b/summarize_data.py @@ -5,7 +5,6 @@ @version: 20190118 """ -from __future__ import print_function import os import os.path from datetime import datetime @@ -18,22 +17,21 @@ from matplotlib.backends.backend_pdf import PdfPages import pandas as pd import seaborn as sns -from misc.config import Config from join_data import DataJoining from misc.samples import SamplesDB import misc.io_methods as IOMethods +import config as cfg -class IcamSummary(object): +class FusionSummary(object): """Collect stats of the run and write them to file""" - def __init__(self, input_path, config_path): + def __init__(self, input_path): self.input_path = input_path self.samples = SamplesDB(os.path.join(input_path, "samples.db")) - self.cfg = Config(config_path) # self.figure_list = [] def run(self, model_predictions): """Execute individual methods""" - fusion_tools = self.cfg.get("general", "fusiontools").split(",") + fusion_tools = cfg.fusiontools fusion_data_summary_path = os.path.join(self.input_path, "FusionSummary") IOMethods.create_folder(fusion_data_summary_path) @@ -85,24 +83,26 @@ def run(self, model_predictions): with open(data_out_path_fltr, "w") as fltr_out: fltr_out.write("SampleID\tSampleDate\tFusionToolCnt\t" + "\t".join(colnames_filtering) + "\n") for sample in sid_list: + # try: if "Fetchdata" in self.samples.get_tool_list_from_state(sample): count_processed += 1 print("Processing sample {0} (dataset {1}/{2})".format(sample, count_processed, len(sid_list))) start_time = time.time() - fusion_data_summary = DataJoining(self.input_path, sample, "", os.path.join(fusion_data_summary_path, sample), model_predictions).run(self.cfg) - + fusion_data_summary = DataJoining(self.input_path, sample, "", os.path.join(fusion_data_summary_path, sample), model_predictions).run() fusion_frequency_all = self.add_to_fus_dict(fusion_data_summary[1], fusion_frequency_all) filtering_data_1[sample] = fusion_data_summary[0] _, fig1 = self.plot_boxswarm({sample:fusion_data_summary[0]}, colnames_filtering, ["Sample", "Filter", "Data"], "Filter counts of distinct fusion genes for sample {}".format(sample)) pdf.savefig(fig1) - + fltr_out.write(sample + "\t" + str(sample_date_dict[sample]) + "\t" + str(sample_toolCnt_dict[sample]) + "\t" + "\t".join(map(str, fusion_data_summary[0])) + "\n") time_taken = time.time() - start_time average_time = (average_time * (count_processed-1) + time_taken) / count_processed estimated_end = average_time * (count_valid_sample - count_processed) - print("done. Processing time: {0:.2f}s; Average processing time: {1:.2f}s; Estimated end of processing in {2:.2f}s)".format(time_taken, average_time, estimated_end)) + print("done. Processing time: {0:.2f}s; Average processing time: {1:.2f}s; Estimated end of processing in {2:.2f}s\n".format(time_taken, average_time, estimated_end)) plt.close("all") + #except: + # print("Sample {} could not be processed!
Please look into detail".format(sample)) pddfall, fig = self.plot_barchart(fusion_frequency_all, "Fusion gene recurrency in single samples", 1) pdf.savefig(fig) @@ -113,7 +113,7 @@ def run(self, model_predictions): fltr_out.write("Rep1's MinMaxMed\tNA\tNA\t" + "\t".join(map(str, mmm1)) + "\n") # close all open plots - plt.close('all') + plt.close("all") def plot_barchart(self, fusion_frequency_dict, chart_title, label_cutoff): """ Plot the detected fusion gene frequencies in a bar blot """ @@ -192,10 +192,9 @@ def main(): parser = argparse.ArgumentParser(description='Post processing of an easyfuse run - currently, collecting runtimes only :)') parser.add_argument('-i', '--input', dest='input', help='Specify the easyfuse root dir of the run you want to process.', required=True) - parser.add_argument('-c', '--config', dest='config', help='Specify config file.', required=True) parser.add_argument('--model_predictions', default=False, action='store_true', help='Score predictions based on pre-train model') args = parser.parse_args() - stats = IcamSummary(args.input, args.config) + stats = FusionSummary(args.input) stats.run(args.model_predictions) if __name__ == '__main__': diff --git a/tool_wrapper/skewer_wrapper.py b/tool_wrapper/skewer_wrapper.py index a069ee8..d0171ff 100755 --- a/tool_wrapper/skewer_wrapper.py +++ b/tool_wrapper/skewer_wrapper.py @@ -6,20 +6,19 @@ from shutil import copyfile from argparse import ArgumentParser -sys.path.append(os.path.join(os.path.dirname( __file__ ), '..')) +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +import config as cfg -from misc.config import Config def main(): - parser = ArgumentParser(description='Wrapper around skewer.') - parser.add_argument('-q', '--qc-table', dest='qc_table', help='Specify input QC table', required=True) - parser.add_argument('-i', '--input', dest='input', nargs='+', help='Specify input FASTQ files', required=True) - parser.add_argument('-o', '--output', dest='output', help='Specify output folder', default='.') - parser.add_argument('-b', '--skewer-bin', dest='skewer_bin', help='Specify path to skewer binary', default="skewer") - parser.add_argument('-m', '--min-read-length-perc', dest='min_read_length_perc', type=float, help='Specify minimum read length percentage', default=0.75) + parser = ArgumentParser(description="Wrapper around skewer.") + parser.add_argument("-q", "--qc-table", dest="qc_table", help="Specify input QC table", required=True) + parser.add_argument("-i", "--input", dest="input", nargs="+", help="Specify input FASTQ files", required=True) + parser.add_argument("-o", "--output", dest="output", help="Specify output folder", default=".") args = parser.parse_args() - min_rl_perc = args.min_read_length_perc + min_rl_perc = cfg.min_read_len_perc remaining_len = 1000 read_len = 0 with open(args.qc_table) as inf: @@ -32,7 +31,7 @@ def main(): remaining_len = remaining if remaining_len != read_len and remaining_len >= (min_rl_perc * read_len): - cmd = "{} -l {} -q 28 -m pe -t 6 -k 5 -z -o {} {}".format(args.skewer_bin, remaining_len, os.path.join(args.output, "out_file"), " ".join(args.input)) + cmd = "{} -l {} -q 28 -m pe -t 6 -k 5 -z -o {} {}".format(cfg.commands["skewer"], remaining_len, os.path.join(args.output, "out_file"), " ".join(args.input)) proc = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) (stdoutdata, stderrdata) = proc.communicate() @@ -44,7 +43,7 @@ def main(): elif remaining_len < min_rl_perc * read_len: sys.stderr.write("Trim length too long. 
Discarding fastqs ({}).".format(args.input)) - open(os.path.join(args.output, "to_be_discarded"), 'a').close() + open(os.path.join(args.output, "to_be_discarded"), "a").close() sys.exit(1) elif remaining_len == read_len: if len(args.input) == 2: @@ -70,5 +69,5 @@ def main(): # os.symlink(args.input[0], out) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tool_wrapper/soapfuse_wrapper.py b/tool_wrapper/soapfuse_wrapper.py index c718931..b414fcb 100755 --- a/tool_wrapper/soapfuse_wrapper.py +++ b/tool_wrapper/soapfuse_wrapper.py @@ -7,27 +7,20 @@ from argparse import ArgumentParser -sys.path.append(os.path.join(os.path.dirname( __file__ ), '..')) +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from misc.config import Config +import config as cfg def main(): - parser = ArgumentParser(description='Wrapper around SOAPfuse.') - parser.add_argument('-q','--qc-table', dest='qc_table', help='Specify input QC table') - parser.add_argument('-i','--input', dest='input', nargs='+', help='Specify input FASTQ files', required=True) - parser.add_argument('-o','--output', dest='output', help='Specify output folder', default='.') - parser.add_argument('-g','--genome', dest='genome', help='Specify reference genome', default='hg38') - parser.add_argument('-c','--config', dest='config', help='Specify config file') + parser = ArgumentParser(description="Wrapper around SOAPfuse.") + parser.add_argument("-q","--qc-table", dest="qc_table", help="Specify input QC table") + parser.add_argument("-i","--input", dest="input", nargs="+", help="Specify input FASTQ files", required=True) + parser.add_argument("-o","--output", dest="output", help="Specify output folder", default=".") args = parser.parse_args() - if args.config: - cfg = Config(args.config) - else: - cfg = Config(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config.ini')) - - soapfuse_cfg = cfg.get('otherFiles', 'ensembl_soapfuse_cfg_{}'.format(args.genome)) + soapfuse_cfg = cfg.other_files["soapfuse_cfg"] - cols = args.input[0].rsplit("/",3) + cols = args.input[0].rsplit("/", 3) # print(cols) remaining_len = 1000 read_len = 0 @@ -60,7 +53,7 @@ def main(): os.symlink(file, os.path.join(cols[0], cols[1], cols[2], "S_{}.fastq.gz".format((i+1)))) except: print("Symlink already exists!") - cmd = "{} -c {} -fd {} -l {} -o {}".format(cfg.get('commands', 'soapfuse_cmd'), soapfuse_cfg, cols[0], cfg_file, args.output) + cmd = "{} -c {} -fd {} -l {} -o {}".format(cfg.commands["soapfuse"], soapfuse_cfg, cols[0], cfg_file, args.output) # print cmd proc = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) (stdoutdata, stderrdata) = proc.communicate() @@ -76,5 +69,5 @@ def main(): -if __name__ == '__main__': +if __name__ == "__main__": main()