manticore

#!/usr/bin/env python3

# version
VERSION = '''

MANTICORE
v 1.0.1
--------------------------------------------------------
Matteo Schiavinato, Alexandrina Bodrug
BOKU - University of Natural Resources and Life Sciences
Department of Biotechnology
Institute of Computational Biology
Vienna (AT)
--------------------------------------------------------
git@github.com:MatteoSchiavinato/manticore.git

'''

# modules
import pandas as pd
import dask
import dask.dataframe as dd
import pysam
import argparse as ap
import time
from time import asctime as at
import sys
import os
from shutil import copyfile
import os.path
from operator import itemgetter
from itertools import combinations
import subprocess
from math import floor, ceil
from Bio import SeqIO
import resource

start_time = time.time()


### help ###
if len(sys.argv) == 1:
	sys.argv.append("-h")

if sys.argv[1] == "-h":
	sys.exit('''{0}


### This is a quick help. For complete help use [--help], or browse the program manual ###


Usage:
manticore \\
--species-name STRING \\
--reads-type PE \\
--reads PATH1,PATH2 PATH3,PATH4 \\
        (parentA)   (parentB)
--names STRING STRING \\
        (A)    (B)
--reference PATH \\
--output-dir PATH \\
[ ... other options ... ]


[INPUT OPTIONS]
  --species-name	Label for output files and plot titles			[required]
  --reads		FASTQ read files, see --help for how to structure	[required]
  --reads-type		Type of the input reads (PE|SE) 			[PE]
  --names		Names of read files, see --help for how to structure	[required]
  --reference		Hybrid genome in FASTA format				[required]
  --output-dir		Output directory					[required]
  --region-beds		Regions to use in analysis, see --help for details	[-]
  --region-names	Names to use for each file in --region-beds		[-]

[COVERAGE ANALYSIS]
  --window-size		Size of the intermixing windows (in bp)			[500000]
  --n-breaks		Number of window breaks for Jaccard distance		[10]
  --min-feat-length	Min. window positions contained in a --region-bed	[1000]
  --min-frac-pos	Min. fraction of positions covered			[0.1]
  --min-cov-pos		Min. number of window positions with coverage		[1000]
  --min-cov		Min. coverage to retain position			[1]

[TABLES & PLOTTING]
  --exclude		Sequence names to exclude from the analysis		[-]
  --max-jacc-uniq       Max. Jaccard distance for uniquely assigned window      [0.5]
  --max-plot-cov	Upper limit for coverage plot				[100]

[MISCELLANEOUS]
  --max-mem		Gigabytes of RAM allowed				[4G]
  --threads		Number of parallel threads				[4]
  --cleanup		Delete heavy intermediate files at the end		[off]
  --filter-reference    Don't process sequences shorter than --window-size      [off]
  --isize-read-num      Read pairs to use in insert size distribution           [10000]
  --isize-dist-width    Width of the allowed insert size range                  [150]
  --hisat2-path		Path to hisat2 executable				[hisat2]
  --hisat2-map-pars	Mapping parameters passed to hisat2			[--score-min L,0.0,-0.6
			(consult hisat2 help)					--no-softclip
										--no-spliced-alignment]
  --samtools-filters    Arguments to pass to samtools view                      [-F 0x4 -F 0x0100]
  --rscript-path	Path to Rscript executable				[Rscript]
  --version		Print version of Manticore and exit			[off]
  --restart		Restart whole pipeline from the beginning		[off]

'''.format(VERSION))

if sys.argv[1] in ["-h", "--help", "-help", "getopt", "usage"]:
	sys.exit('''

  {0}


Usage:
manticore \\
--species-name STRING \\
--reads PATH1,PATH2 PATH3,PATH4 \\
        (parentA)   (parentB)
--names STRING STRING \\
        (A)    (B)
--reference PATH \\
--output-dir PATH \\
[ ... other options ... ]


[INPUT OPTIONS]

  --species-name	Name of the hybrid species studied, to be used for plotting
			(No whitespaces)
			[required]

  --reads		SPACE-separated list of read files (separate files of read pairs by comma)
			Example: --reads G1.read_1.fq,G1.read_2.fq G2.read_1.fq,G2.read_2.fq ...
			[required]

  --reads-type	Specify either 'PE' for paired-end reads or 'SE' for single-end reads.
  				Note that this affects the [--reads] argument too, since in case of
				single-end reads you will have to specify only one file per parent
				[PE]

  --names		SPACE-separated list of names corresponding to read files
			Example: --names Subgenome_A Subgenome_B Subgenome_C ...
			[required]

  --reference		FASTA file where to assess subgenomic intermixing
			[required]

  --output-dir		All files will be generated within this directory
			[required]

  --region-beds		SPACE-separated list of BED files with regions that have to be considered
			(A separate coverage analysis will be generated for each one)
			[-]

  --region-names	SPACE-separated list of names to associate to BED files which should
			define the type of data they contain
			(Example: \'CDS\', \'nonrep\', ...)
			(These names are only used as labels)
			[-]


[COVERAGE ANALYSIS]

  --window-size		Size of the windows on which to study intermixing
			(Hint: the larger, the more intermixing is observed, but the more data is lost)
			[500000]

  --n-breaks		Number of window breaks to compute Jaccard distance from.
			[--window-size] divided by [--n-breaks] has to return an integer
			(Hint: the smaller, the more likely intermixing is observed)
			[10]

  --min-feat-length	Minimum length of annotated features within windows of size --window-size.
			This applies SEPARATELY to all annotations provided with --region-beds.
			[1000]

  --min-frac-pos	Minimum fraction of positions that have to be covered within the --min-feat-length
			of a window.
			[0.1]

  --min-cov-pos		Minimum number of positions that have to be covered within a window.
			[1000]

  --min-cov		Minimum position coverage to consider a position in the analysis
			[1]


[TABLES & PLOTTING]

  --exclude		Sequence names to exclude from the analysis. The result tables will be generated for
			all sequences, and then they will be filtered removing lines (i.e. windows) corresponding
			to these sequences (Hint: remove sequences that are not clearly assigned to a subgenome).
			[off]

  --max-jacc-uniq	Genome intervals are assigned to one parent specifically (i.e. low intermixing regions)
			based on the Jaccard distance produced in the interval. Intervals producing a
			Jaccard distance > [--max-jacc-uniq] are not considered low-intermixing.
			(Hint: This value ranges from 0 to 1)
			[0.5]

  --max-plot-cov	The final plots will be limited to this maximum coverage
			[100]


[MISCELLANEOUS]

  --max-mem		Maximum memory that can be used by the program (use only \'#G\')
			(N threads will get \'--max-mem / N\' memory each)
			[4G]

  --threads		Number of parallel threads
			[4]

  --cleanup		Heavy intermediate files are deleted when the program has finished
			[off]

  --filter-reference	The reference file is filtered, keeping only sequences longer than the
			specified --window-size
			[off]

  --isize-read-num	Number of read pairs from which to estimate the insert size distribution
			[10000]

  --isize-dist-width	Width of the allowed insert size range when mapping
			(-I <x-N> -X <x+N> where x=avg.TLEN from mapping of --isize-read-num reads)
			[150]

  --hisat2-path		Path to the \"HISAT2\" executable (only specify if not present in the `$PATH`)
			[hisat2]

  --hisat2-map-pars	Mapping parameters to be passed to HISAT2
			[-k 5 --score-min L,0.0,-0.6 --mp 6,2 --rdg 5,3 --rfg 5,3 --no-softclip --no-spliced-alignment]

  --samtools-filters	Arguments to pass to samtools view
			[-F 0x0100 -F 0x4]

  --rscript-path	Path to the \"Rscript\" executable (only specify if not present in the `$PATH`)
			[Rscript]

  --version		Print version of the program and exit
			[off]

  --restart		Restart whole pipeline from the beginning, ignoring *.done files
			(each directory contains a *.done file that signals to skip the step)
			[off]


'''.format(VERSION))


# Command-line interface. The result of the parsing is stored in the
# module-level `args` namespace (dashes in option names become underscores,
# e.g. --species-name -> args.species_name).
# The help switches are intercepted by the sys.exit() calls above, so the
# hand-written help texts are shown instead of the argparse-generated one.
p = ap.ArgumentParser()
# mandatory
p.add_argument("--species-name", type=str, required=True)
# --reads / --names take any number of space-separated values
p.add_argument("--reads", nargs="*", type=str, required=True)
p.add_argument("--names", nargs="*", type=str, required=True)
p.add_argument("--reference", type=str, required=True)
p.add_argument("--output-dir", type=str, required=True)
# optional
p.add_argument("--reads-type", choices=["SE", "PE"], default="PE", type=str)
# plain boolean switches, False unless given on the command line
p.add_argument("--version", action="store_true", default=False)
p.add_argument("--restart", action="store_true", default=False)
p.add_argument("--filter-reference", action="store_true", default=False)
# kept as a string (e.g. "4G"); it is not converted to a number here
p.add_argument("--max-mem", type=str, default="4G")
p.add_argument("--threads", type=int, default=4)
p.add_argument("--cleanup", action="store_true", default=False)
# mapping
# the whole hisat2 parameter set is one string
p.add_argument("--hisat2-map-pars", default="-k 5 --score-min L,0.0,-0.6 --mp 6,2 --rdg 5,3 --rfg 5,3 --no-softclip --no-spliced-alignment")
p.add_argument("--isize-read-num", default=10000, type=int)
p.add_argument("--isize-dist-width", default=150, type=int)
p.add_argument("--samtools-filters", default="-F 0x0100 -F 0x4", type=str)
# analysis
# no default is declared for the two --region-* options: they are None when omitted
p.add_argument("--region-beds", nargs="*", type=str)
p.add_argument("--region-names", nargs="*", type=str)
p.add_argument("--window-size", default=500000, type=int)
p.add_argument("--n-breaks", default=10, type=int)
p.add_argument("--min-feat-length", default=1000, type=int)
p.add_argument("--min-frac-pos", default=0.1, type=float)
p.add_argument("--min-cov-pos", default=1000, type=int)
p.add_argument("--min-cov", type=int, default=1)
p.add_argument("--exclude", nargs="*", default=[])
p.add_argument("--max-jacc-uniq", type=float, default=0.5)
p.add_argument("--max-plot-cov", type=int, default=100)
# paths
# executables are looked up by these names/paths when they are launched
p.add_argument("--hisat2-path", type=str, default="hisat2")
p.add_argument("--rscript-path", type=str, default="Rscript")
args = p.parse_args()


### functions ###

def memory_limit(user_limit):
	"""Restrict the address space (RLIMIT_AS) of the running process.

	The hard limit becomes the smaller of the current hard limit and
	`user_limit`. The soft limit is half of the value returned by
	get_memory() multiplied by 1024, capped at the new hard limit.

	Args:
		user_limit: requested maximum; it is handed to setrlimit(), so it is
			a number of bytes. Anything accepted by float() is valid.

	Raises:
		ValueError, OSError: propagated from resource.setrlimit().
	"""
	soft, hard = resource.getrlimit(resource.RLIMIT_AS)
	requested = int(float(user_limit))

	# RLIM_INFINITY is a sentinel (commonly -1): comparing it numerically
	# would always select it and silently discard the user request
	if hard == resource.RLIM_INFINITY:
		hard_limit = requested
	else:
		hard_limit = min(int(hard), requested)

	# setrlimit() needs integers and rejects a soft limit above the hard one
	soft_limit = min(get_memory() * 1024 // 2, hard_limit)
	resource.setrlimit(resource.RLIMIT_AS, (soft_limit, hard_limit))


def get_memory():
	"""Add up the MemFree, Buffers and Cached entries of /proc/meminfo.

	Returns:
		The sum of the three values as an integer, in the unit used by the
		file itself.
	"""
	wanted = ('MemFree:', 'Buffers:', 'Cached:')
	total = 0
	with open('/proc/meminfo', 'r') as meminfo:
		for row in meminfo:
			fields = row.split()
			# first column is the label, second the amount
			if str(fields[0]) in wanted:
				total += int(fields[1])
	return total


def run_cmd(cmd, stdout_file, stderr_file):
	"""Run a command without a shell, sending its output to two files.

	Args:
		cmd: the command as a list of arguments.
		stdout_file: path that receives the standard output (overwritten).
		stderr_file: path that receives the standard error (overwritten).

	Returns:
		The exit status of the command.

	Raises:
		OSError: if the log files cannot be opened or the executable cannot
			be started.
	"""
	# the context manager closes both logs even when the command cannot be
	# started and subprocess.call() raises
	with open(stdout_file, "w") as out_handle, open(stderr_file, "w") as err_handle:
		return subprocess.call(cmd, stdout=out_handle, stderr=err_handle)


def run_pipeline(cmd, stdout_file, stderr_file):
	"""Run a command line through the shell, sending its output to two files.

	Args:
		cmd: the full command line as a single string; it is interpreted by
			the shell, so pipes and redirections are allowed and arguments
			are split on whitespace.
		stdout_file: path that receives the standard output (overwritten).
		stderr_file: path that receives the standard error (overwritten).

	Returns:
		The exit status reported by the shell.

	Raises:
		OSError: if the log files cannot be opened.
	"""
	# the context manager closes both logs even if subprocess.call() raises
	with open(stdout_file, "w") as out_handle, open(stderr_file, "w") as err_handle:
		return subprocess.call(cmd, stdout=out_handle, stderr=err_handle, shell=True)


def link_file(source, destination):
	"""Create a symbolic link with the external `ln -s` command.

	Args:
		source: path the link points to.
		destination: path of the link to create.

	Returns:
		The exit status of `ln`.
	"""
	ln_command = ["ln", "-s", source, destination]
	return subprocess.call(ln_command)


def unlink_file(file_path):
	"""Remove a file or link with the external `unlink` command.

	Args:
		file_path: path to remove.

	Returns:
		The exit status of `unlink`.
	"""
	unlink_command = ["unlink", file_path]
	return subprocess.call(unlink_command)


def filter_reference(outdir, reference, window_size):
	"""Copy to <outdir>/genome.fa the sequences at least `window_size` long.

	Only the record id is kept in the FASTA header and each sequence is
	written on a single line. A summary of the retained and discarded
	base pairs is written to <outdir>/genome.fa.stderr.

	Args:
		outdir: directory receiving genome.fa and genome.fa.stderr.
		reference: path of the input FASTA file.
		window_size: minimum sequence length to retain (converted with int()).

	Returns:
		True.
	"""
	min_length = int(window_size)
	retained = 0
	discarded = 0

	# both files are closed (and the output flushed) before returning, so the
	# filtered genome is complete when the next step reads it
	with open(reference, "r") as INPUT, \
	open("{0}/genome.fa".format(outdir), "w") as OUTPUT:
		for record in SeqIO.parse(INPUT, "fasta"):
			if len(record.seq) >= min_length:
				OUTPUT.write(">" + str(record.id) + "\n" + str(record.seq) + "\n")
				retained += len(record.seq)
			else:
				discarded += len(record.seq)

	total = retained + discarded
	if total > 0:
		retained_frac = float(retained) / float(total) * 100
		discarded_frac = float(discarded) / float(total) * 100
	else:
		# reference without any sequence: nothing to divide by
		retained_frac = 0.0
		discarded_frac = 0.0

	with open("{0}/genome.fa.stderr".format(outdir), "w") as ERROR:
		ERROR.write("Retained: {0} bp ({1}%)\nDiscarded: {2} bp ({3}%)\n".format(	retained,
												retained_frac,
												discarded,
												discarded_frac ) )

	return True


def build_hisat2_indexes(outdir, threads, hisat2_path, cwd):
	"""Build the hisat2 index of <outdir>/genome.fa.

	The builder executable is `<hisat2_path>-build`; the index base name is
	the path of the FASTA file itself. Output of the builder goes to
	<outdir>/build_hisat2_indexes.stdout and .stderr.

	Args:
		outdir: directory containing genome.fa, also used for the logs.
		threads: number of threads passed with -p.
		hisat2_path: path or name of the hisat2 executable.
		cwd: not used by this function.

	Returns:
		True if the builder exited with status 0, False otherwise.
	"""
	genome = "{0}/genome.fa".format(outdir)

	# the command is run without a shell: option and value must be two
	# separate list items, a single "-p N" item would reach the program
	# as one argument with an embedded space
	cmd = [	"{0}-build".format(hisat2_path),
		"-p", str(threads),
		genome,
		genome ]

	code = run_cmd(	cmd,
			"{0}/build_hisat2_indexes.stdout".format(outdir),
			"{0}/build_hisat2_indexes.stderr".format(outdir) )

	return code == 0


def generate_reads_dictionary(reads_arg, names_arg):
	"""Associate each name with the list of its read files.

	Args:
		reads_arg: list of strings, each holding comma-separated file paths.
		names_arg: list of names, matched to `reads_arg` by position.

	Returns:
		A dictionary {name: [file, ...]}.

	Raises:
		IndexError: if `names_arg` is shorter than `reads_arg`.
	"""
	files_by_name = {}
	for position, entry in enumerate(reads_arg):
		label = names_arg[position]
		files_by_name[label] = list(entry.split(","))

	return files_by_name


def generate_beds_dictionary(beds_arg, names_arg, cwd):
	"""Associate each region name with the path of its BED file.

	Paths that do not start with "/" are prefixed with `cwd`.

	Args:
		beds_arg: list of BED file paths.
		names_arg: list of region names, matched to `beds_arg` by position.
		cwd: directory prepended to relative paths.

	Returns:
		A dictionary {region name: BED path}.

	Raises:
		IndexError: if `names_arg` is shorter than `beds_arg`.
	"""
	paths_by_region = {}
	for position, bed_path in enumerate(beds_arg):
		if not bed_path.startswith("/"):
			bed_path = str(cwd) + "/" + bed_path
		paths_by_region[names_arg[position]] = bed_path

	return paths_by_region


def link_read_files(Reads, outdir):
	"""Link the read files of every sample into `outdir`.

	A sample with one file is linked as <name>.fastq, a sample with two
	files as <name>.1.fastq and <name>.2.fastq. Any other number of files
	terminates the program with an error message.

	Args:
		Reads: dictionary {name: [file, ...]}.
		outdir: directory receiving the links.

	Returns:
		True.
	"""
	for name, read_files in Reads.items():
		how_many = len(read_files)

		if how_many == 1:
			link_file(read_files[0], "{0}/{1}.fastq".format(outdir, name))
			continue

		if how_many == 2:
			link_file(read_files[0], "{0}/{1}.1.fastq".format(outdir, name))
			link_file(read_files[1], "{0}/{1}.2.fastq".format(outdir, name))
			continue

		sys.exit("ERROR: more than two files specified for {0}:\n{1}\n\n".format(name,
											read_files))
	return True


def estimate_insert_size(hisat2_index, Reads, mapping_pars, hisat2_path, R_path, \
			isize_reads, dist_width, outdir, threads, script_dir):
	"""Estimate the allowed insert size range of every paired-end sample.

	For each sample the first `isize_reads` read pairs are mapped with
	hisat2 (-I 0 -X 2000), the template lengths (TLEN, 9th SAM column)
	between 1 and 1000 are collected, written to
	<outdir>/<name>.isize_est.sam.tlen and plotted with
	<script_dir>/scripts/plot-insert-size.Rscript. The range
	[mean - dist_width, mean + dist_width] (lower bound not below 0) is
	appended to <outdir>/isize_ranges.table as "name<TAB>low<TAB>high".

	Args:
		hisat2_index: base name of the hisat2 index.
		Reads: dictionary {name: [read_1, read_2]}.
		mapping_pars: string of extra hisat2 parameters.
		hisat2_path: path or name of the hisat2 executable.
		R_path: path or name of the Rscript executable.
		isize_reads: number of read pairs to map (hisat2 --upto).
		dist_width: half width of the allowed insert size range.
		outdir: directory receiving all the files produced here.
		threads: number of hisat2 threads.
		script_dir: installation directory containing scripts/.

	Returns:
		True if every sample was processed, False as soon as hisat2 or
		Rscript fail or a sample yields no usable template length.
	"""
	# start from empty output files
	open("{0}/isize_ranges.table".format(outdir), "w").close()
	open("{0}/peak_insert_size.stdout".format(outdir), "w").close()
	open("{0}/peak_insert_size.stderr".format(outdir), "w").close()

	if os.path.exists("{0}.1.ht2l".format(hisat2_index)):
		index_type = "--large-index"
	else:
		index_type = ""

	# defined before the loop: with an empty Reads dictionary the loop body
	# never runs and the status would otherwise be unbound
	code = 0

	for name in Reads:

		# map reads
		read_1 = Reads[name][0]
		read_2 = Reads[name][1]
		sam_file = "{0}/{1}.isize_est.sam".format(outdir, name)
		tlen_file = "{0}.tlen".format(sam_file)

		cmd = [	"{0}".format(hisat2_path),
			index_type,
			"-p {0}".format(threads),
			"--upto {0}".format(isize_reads),
			mapping_pars,
			"-I", str(0),
			"-X", str(2000),
			"-x {0}".format(hisat2_index),
			"-1 {0}".format(read_1),
			"-2 {0}".format(read_2),
			"-S {0}".format(sam_file) ]

		# mapping_pars holds several options in one string: the command is
		# joined and given to the shell, which splits it again
		code = run_pipeline(	" ".join(cmd),
				"{0}/peak_insert_size.stdout".format(outdir),
				"{0}/peak_insert_size.stderr".format(outdir) )

		if code != 0:
			break

		# extract the template lengths; SAM header lines start with "@"
		Values = []
		with open(sam_file, "r") as INPUT:
			for line in INPUT:
				if line.startswith("@"):
					continue
				lst = line.rstrip("\n\r\b").split("\t")
				if len(lst) < 9:
					continue
				tlen = float(int(lst[8]))
				# a pair is reported twice, once with a positive and
				# once with a negative TLEN: only the positive is kept
				if 0 < tlen <= 1000:
					Values.append(tlen)

		with open(tlen_file, "w") as OUTPUT:
			for x in Values:
				OUTPUT.write(str(x) + "\n")

		if len(Values) == 0:
			# no mean can be computed from an empty distribution
			sys.stderr.write("ERROR: no template length between 1 and 1000 found for {0}\n".format(name))
			code = 1
			break

		cmd = [ "{0}".format(R_path),
			"{0}/scripts/plot-insert-size.Rscript".format(script_dir),
			tlen_file ]

		code = run_cmd(	cmd,
				"{0}.stdout".format(tlen_file),
				"{0}.stderr".format(tlen_file) )

		if code != 0:
			break

		isize_avg = float(sum(Values)) / float(len(Values))
		isize_low = str(max(floor(float(isize_avg) - float(dist_width)), 0))
		isize_high = str(ceil(float(isize_avg) + float(dist_width)))
		with open("{0}/isize_ranges.table".format(outdir), "a") as OUTPUT:
			OUTPUT.write("\t".join([name, isize_low, isize_high]) + "\n")

	return code == 0


def check_if_reads_were_mapped(outdir, name):
	"""Tell whether the mapping of a sample looks complete.

	The mapping is considered done when both <outdir>/<name>.sam and
	<outdir>/<name>.sam.stderr exist and the last line of the latter
	contains "overall alignment rate".

	Args:
		outdir: directory of the mapping files.
		name: sample name.

	Returns:
		True or False.
	"""
	sam_path = "{0}/{1}.sam".format(outdir, name)
	log_path = "{0}/{1}.sam.stderr".format(outdir, name)

	if not (os.path.exists(sam_path) and os.path.exists(log_path)):
		return False

	# walk through the log keeping only its final line; an empty log
	# leaves the empty string
	final_line = ""
	with open(log_path, "r") as log:
		for final_line in log:
			pass

	return "overall alignment rate" in final_line.rstrip("\n\b\r")


def map_reads( 	hisat2_index, Reads, reads_type, mapping_pars, \
		hisat2_path, isize_file, outdir, threads):
	"""Map the reads of every sample with hisat2 into <outdir>/<name>.sam.

	Samples whose mapping is already complete (see
	check_if_reads_were_mapped) are skipped, for paired-end as well as for
	single-end reads. The output of hisat2 is stored in
	<outdir>/<name>.sam.stdout and <outdir>/<name>.sam.stderr.

	Args:
		hisat2_index: base name of the hisat2 index.
		Reads: dictionary {name: [read file, ...]}.
		reads_type: "PE" (two files per sample) or "SE" (one file).
		mapping_pars: string of extra hisat2 parameters.
		hisat2_path: path or name of the hisat2 executable.
		isize_file: table "name<TAB>min<TAB>max" with the insert size
			range of each sample; only read for paired-end reads.
		outdir: directory receiving the SAM files and the logs.
		threads: number of hisat2 threads.

	Returns:
		True if no mapping failed (including when nothing had to be
		mapped), False if hisat2 returned a non-zero status or
		`reads_type` is not recognised.
	"""
	if reads_type not in ("PE", "SE"):
		sys.stderr.write("ERROR: unknown reads type: {0}\n".format(reads_type))
		return False

	if os.path.exists("{0}.1.ht2l".format(hisat2_index)):
		index_type = "--large-index"
	else:
		index_type = ""

	Insert_sizes = {}
	if reads_type == "PE":
		# read insert sizes
		with open(isize_file, "r") as INPUT:
			Lsts = [ line.rstrip("\n\b\r").split("\t") for line in INPUT ]
		Insert_sizes = { x[0]:(str(x[1]), str(x[2])) for x in Lsts }

	# defined before the loop: when every sample is skipped (or there is
	# none) no command runs and the status would otherwise be unbound
	code = 0

	for name in Reads:

		if check_if_reads_were_mapped(outdir, name):
			continue

		if reads_type == "PE":
			input_args = [	"--minins {0} --maxins {1}".format(	Insert_sizes[name][0],
										Insert_sizes[name][1] ),
					"-x {0}".format(hisat2_index),
					"-1 {0}".format(Reads[name][0]),
					"-2 {0}".format(Reads[name][1]) ]
		else:
			input_args = [	"-x {0}".format(hisat2_index),
					"-U {0}".format(Reads[name][0]) ]

		cmd = [	"{0}".format(hisat2_path),
			index_type,
			"-p {0}".format(threads),
			mapping_pars ]
		cmd += input_args
		cmd.append("-S {0}/{1}.sam".format(outdir, name))

		# mapping_pars holds several options in one string: the command is
		# joined and given to the shell, which splits it again
		code = run_pipeline(	" ".join(cmd),
				"{0}/{1}.sam.stdout".format(outdir, name),
				"{0}/{1}.sam.stderr".format(outdir, name) )

		if code != 0:
			break

	return code == 0


def filter_reads(samtools_filters, Reads, outdir, threads, thread_mem):
	"""Filter and sort the mapped reads of every sample.

	<outdir>/<name>.sam is filtered with `samtools view` into
	<outdir>/<name>.f.bam, which is sorted with `samtools sort` into
	<outdir>/<name>.fs.bam (both through pysam). Each step leaves an
	empty marker file (<name>.filtered.ok, <name>.sorted.ok) and is
	skipped when its marker already exists.

	Args:
		samtools_filters: string of options for `samtools view`.
		Reads: dictionary whose keys are the sample names.
		outdir: directory containing the SAM files, receiving the BAM files.
		threads: value of the -@ option of both commands.
		thread_mem: value of the -m option of `samtools sort`.

	Returns:
		True. A failing pysam call is not caught here and propagates its
		exception to the caller.
	"""
	for name in Reads:

		sam_file = "{0}/{1}.sam".format(outdir, name)
		filtered_bam = "{0}/{1}.f.bam".format(outdir, name)
		sorted_bam = "{0}/{1}.fs.bam".format(outdir, name)
		filtered_marker = "{0}/{1}.filtered.ok".format(outdir, name)
		sorted_marker = "{0}/{1}.sorted.ok".format(outdir, name)

		if not os.path.exists(filtered_marker):

			# filter
			# the arguments are kept as list items, so that only the
			# user-supplied filter string is split on whitespace
			cmd = [ "-h", "-b" ]
			cmd += samtools_filters.split()
			cmd += [ "-@", str(threads), "-o", filtered_bam, sam_file ]
			errfile = "{0}.stderr".format(filtered_bam)
			pysam.view(*cmd, catch_stdout=False, save_stderr=errfile)
			# the marker is only reached when the call above did not raise
			open(filtered_marker, "w").close()

		if not os.path.exists(sorted_marker):

			# sort
			cmd = [	"-@", str(threads),
				"-m", str(thread_mem),
				"-O", "bam",
				"-T", "{0}/{1}".format(outdir, name),
				"-o", sorted_bam,
				filtered_bam ]
			errfile = "{0}.stderr".format(sorted_bam)
			pysam.sort(*cmd, catch_stdout=False, save_stderr=errfile)
			open(sorted_marker, "w").close()

	return True


def extract_coverage(Beds, Reads, outdir, mapdir):
	"""Compute the per-position depth of every sample with `samtools depth`.

	With BED files one table per sample and region is produced
	(<outdir>/<name>.<region>.depth, restricted with -b). Without BED
	files a single table per sample is produced, using the region label
	"whole". The input of each run is <mapdir>/<name>.fs.bam.

	Args:
		Beds: dictionary {region name: BED path}, possibly empty.
		Reads: dictionary whose keys are the sample names.
		outdir: directory receiving the depth tables.
		mapdir: directory containing the sorted BAM files.

	Returns:
		True if every run succeeded, False at the first one that raised.
	"""
	if len(Beds) > 0:
		Regions = [ (region, [ "-b", "{0}".format(Beds[region]) ]) for region in Beds ]
	else:
		Regions = [ ("whole", []) ]

	for name in Reads:
		bam_file = "{0}/{1}.fs.bam".format(mapdir, name)

		for region, region_args in Regions:
			outfile = "{0}/{1}.{2}.depth".format(outdir, name, region)
			errfile = "{0}/{1}.{2}.depth.stderr".format(outdir, name, region)

			# make sure both files exist and are empty before the run
			open(outfile, "w").close()
			open(errfile, "w").close()

			cmd = region_args + [ bam_file ]

			try:
				pysam.depth(*cmd, save_stdout=outfile, save_stderr=errfile)
			except Exception:
				# any failure of the run is reported through the
				# status; KeyboardInterrupt and SystemExit are not
				# intercepted
				return False

	return True


def filter_coverage_file(cov_dir, Names, Beds, min_cov):
	"""Produce the <name>.<region>.<min_cov>x.depth tables.

	For a minimum coverage above 1 the lines of
	<cov_dir>/<name>.<region>.depth whose third column is at least
	`min_cov` are copied to the new table. Otherwise the raw table is
	simply renamed.

	Args:
		cov_dir: directory containing the depth tables.
		Names: sample names.
		Beds: dictionary whose keys are the region names.
		min_cov: minimum coverage (converted with int()); it is inserted
			as given in the name of the output table.

	Returns:
		True if every table was produced, False at the first table that
		could not be read, written, renamed or parsed.
	"""
	threshold = int(min_cov)

	for name in Names:
		for region in Beds:
			raw_cov_file = "{0}/{1}.{2}.depth".format(cov_dir, name, region)
			dest_cov_file = "{0}/{1}.{2}.{3}x.depth".format(cov_dir, name, region, min_cov)

			try:
				if threshold > 1:
					with open(raw_cov_file, "r") as INPUT, open(dest_cov_file, "w") as OUTPUT:
						for line in INPUT:
							lst = line.rstrip("\b\r\n").split("\t")
							if int(lst[2]) >= threshold:
								OUTPUT.write(line)
				else:
					# nothing to discard: reuse the raw table
					os.rename(raw_cov_file, dest_cov_file)

			except (OSError, ValueError, IndexError):
				# missing/unwritable file, non-numeric coverage or
				# line with less than three columns
				return False

	return True


def get_fasta_lengths(fasta):
	"""List the sequences of a FASTA file with their lengths.

	Args:
		fasta: path of the FASTA file.

	Returns:
		A list of (sequence id, length) tuples sorted by sequence id.
	"""
	pairs = []
	with open(fasta, "r") as handle:
		for record in SeqIO.parse(handle, "fasta"):
			pairs.append((str(record.id), len(str(record.seq))))

	# in-place sort on the identifier only
	pairs.sort(key=itemgetter(0))
	return pairs


def get_genome_file(reference, outdir):
	"""Write the sequence lengths of a FASTA file to <outdir>/genome.fa.lengths.

	The table has one "id<TAB>length" line per sequence, in the order
	returned by get_fasta_lengths().

	Args:
		reference: path of the FASTA file.
		outdir: directory receiving the table.

	Returns:
		True.
	"""
	# lengths are collected before the table is created
	rows = [ str(entry[0]) + "\t" + str(entry[1]) + "\n" for entry in get_fasta_lengths(reference) ]

	with open("{0}/genome.fa.lengths".format(outdir), "w") as table:
		for row in rows:
			table.write(row)

	return True


def make_executable(path):
    """Add execute permission for whoever already has read permission.

    Args:
        path: file whose mode is changed.
    """
    current = os.stat(path).st_mode
    # the read bits (r--r--r--) shifted right by two positions land on the
    # corresponding execute bits (--x--x--x)
    execute_bits = (current & 0o444) >> 2
    os.chmod(path, current | execute_bits)


def run_coverage_analysis(	script_dir, species_name, cov_dir, outdir_absolute, Names, bed_files,
				bed_names, genome_file, window_size, threads, min_frac_pos, min_cov_pos,
				min_feat_length, min_cov, n_breaks ):
	"""Run <script_dir>/src/analyze-windows.py with the current interpreter.

	The standard output of the script is stored in
	<outdir_absolute>/<species_name>.windows.table, its standard error in
	the same path with the ".stderr" suffix.

	Args:
		script_dir: installation directory containing src/.
		species_name: value of --species-name, also used in the file names.
		cov_dir: value of --cov-dir.
		outdir_absolute: value of --output-dir, also receives the two logs.
		Names: list of values for --cov-names.
		bed_files: list of values for --beds.
		bed_names: list of values for --beds-names.
		genome_file: value of --scaf-lengths.
		window_size: value of --window-size.
		threads: value of --threads.
		min_frac_pos: value of --min-frac-pos.
		min_cov_pos: value of --min-cov-pos.
		min_feat_length: value of --min-length.
		min_cov: value of --min-coverage.
		n_breaks: value of --n-breaks.

	Returns:
		True if the script exited with status 0, False otherwise.
	"""
	def multiple(values):
		# one argument per value; an empty list still yields the single
		# empty argument that joining it with spaces used to produce
		values = [ str(value) for value in values ]
		return values if len(values) > 0 else [ "" ]

	# every value is its own list item: arguments containing whitespace
	# (e.g. paths) reach the script unsplit
	cmd = [	"{0}".format(sys.executable),
		"{0}/src/analyze-windows.py".format(script_dir),
		"--species-name", str(species_name),
		"--cov-dir", str(cov_dir),
		"--output-dir", str(outdir_absolute) ]
	cmd += [ "--cov-names" ] + multiple(Names)
	cmd += [ "--beds" ] + multiple(bed_files)
	cmd += [ "--beds-names" ] + multiple(bed_names)
	cmd += [	"--scaf-lengths", str(genome_file),
			"--window-size", str(window_size),
			"--threads", str(threads),
			"--min-frac-pos", str(min_frac_pos),
			"--min-cov-pos", str(min_cov_pos),
			"--min-length", str(min_feat_length),
			"--min-coverage", str(min_cov),
			"--n-breaks", str(n_breaks) ]

	code = run_cmd(	cmd,
			"{0}/{1}.windows.table".format(outdir_absolute, species_name),
			"{0}/{1}.windows.table.stderr".format(outdir_absolute, species_name) )

	return code == 0


def filter_tables(species_name, tables_dir, Regions, Names, Excluded_sequences):
	"""Remove the excluded sequences from the result tables.

	Every input table is read from `tables_dir`, the rows whose
	"Sequence" column is in `Excluded_sequences` are dropped and the
	result is written next to it with the "RES." prefix. The tables are
	<species_name>.combined.results.txt and, for each region and name,
	<species_name>.<name>.<region>.txt.

	Args:
		species_name: prefix of the table names.
		tables_dir: directory containing the tables.
		Regions: region names.
		Names: sample names.
		Excluded_sequences: sequence names to remove.

	Returns:
		True if every table was filtered, False at the first table that
		could not be read, parsed or written (the reason is printed on
		the standard error).
	"""
	Tables = [ (	"{0}/{1}.combined.results.txt".format(tables_dir, species_name),
			"{0}/RES.{1}.combined.results.txt".format(tables_dir, species_name) ) ]

	for region in Regions:
		for name in Names:
			Tables.append( (	"{0}/{1}.{2}.{3}.txt".format(tables_dir, species_name, name, region),
						"{0}/RES.{1}.{2}.{3}.txt".format(tables_dir, species_name, name, region) ) )

	try:
		for infile, outfile in Tables:
			x = pd.read_csv(infile, sep="\t")
			x = x[~x["Sequence"].isin(Excluded_sequences)]
			x.to_csv(outfile, sep="\t", index=False)

	except (OSError, KeyError, ValueError) as error:
		# missing/unwritable file, missing "Sequence" column or table that
		# pandas cannot parse (its parsing errors derive from ValueError)
		sys.stderr.write("ERROR: could not filter the result tables: {0}\n".format(error))
		return False

	return True


def generate_metrics_file(species_name, output_dir, tables_dir, Regions, Names, max_jacc_uniq, output_file, window_size):
	"""Summarise the filtered result tables into a metrics file.

	The tables read are <tables_dir>/RES.<species_name>.combined.results.txt
	and, for each region and name,
	<tables_dir>/RES.<species_name>.<name>.<region>.txt. Windows are
	identified by (Sequence, W_start, W_end) and counted once per metric.
	A negative Jaccard value marks a window where no distance was computed.

	Lines written to `output_file` (tab-separated: region, name or "-",
	metric, value), per region:
		Total, Union, Intersection, Uncovered: sums over the windows of the
			region with Jaccard >= 0 (Total = Union + Uncovered)
		NA_positions: rows of the region with Jaccard < 0, times window_size
	and per region and name:
		Total: rows of the table, times window_size
		Unique_(J<=max_jacc_uniq): sum of "Unique" over the windows with
			0 <= Jaccard <= max_jacc_uniq

	Args:
		species_name: prefix of the table names.
		output_dir: not used by this function.
		tables_dir: directory containing the RES.* tables.
		Regions: region names.
		Names: sample names.
		max_jacc_uniq: largest Jaccard value of a uniquely assigned window.
		output_file: path of the metrics file to write.
		window_size: window size in positions.

	Returns:
		True.
	"""
	Window_columns = ["Sequence", "W_start", "W_end"]
	unique_metric = "Unique_(J<={0})".format(max_jacc_uniq)

	def window_sum(table, column):
		# sum of a column counting each distinct window/value once
		return table[Window_columns + [column]].drop_duplicates()[column].sum()

	Metrics = { region:{} for region in Regions }

	# the combined table is the same for all regions: read it once
	if len(Metrics) > 0:
		x = pd.read_table("{0}/RES.{1}.combined.results.txt".format(tables_dir, species_name))

	for region in Regions:

		in_region = x[x["Feature"] == region]
		valid = in_region[in_region["Jaccard"] >= 0.0]

		# union, intersection, uncovered
		union = window_sum(valid, "Union")
		intersection = window_sum(valid, "Intersection")
		uncovered = window_sum(valid, "Uncovered")

		# NA positions
		# windows where nothing could be inferred from the jaccard calculation
		# this means uncovered or too poorly covered
		na_positions = int(in_region[in_region["Jaccard"] < 0.0].shape[0]) * window_size

		Metrics[region]["Total"] = union + uncovered
		Metrics[region]["Union"] = union
		Metrics[region]["Intersection"] = intersection
		Metrics[region]["Uncovered"] = uncovered
		Metrics[region]["NA_positions"] = na_positions

		# subgenomes
		for name in Names:
			infile = "{0}/RES.{1}.{2}.{3}.txt".format(tables_dir, species_name, name, region)
			y = pd.read_table(infile)

			# the unique value is computed only for those windows that have a
			# Jaccard index sufficiently low (specified via command line)
			mask = (y["Jaccard"] >= 0.0) & (y["Jaccard"] <= max_jacc_uniq)

			Metrics[region][name] = {	"Total": y.shape[0] * window_size,
							unique_metric: window_sum(y[mask], "Unique") }

	with open(output_file, "w") as OUTPUT:
		for region in Metrics:
			for metric in ["Total", "Union", "Intersection", "Uncovered", "NA_positions"]:
				OUTPUT.write("\t".join([region, "-", metric, str(Metrics[region][metric])]) + "\n")

			for name in Names:
				for metric in ["Total", unique_metric]:
					OUTPUT.write("\t".join([region, name, metric, str(Metrics[region][name][metric])]) + "\n")

	return True


def get_genome_size(genome_file):
	"""Sum the second column of a tab-separated lengths table.

	Args:
		genome_file: path of a table with one "id<TAB>length" line per
			sequence.

	Returns:
		The total length as an integer.
	"""
	total = 0
	with open(genome_file, "r") as table:
		for row in table:
			columns = row.rstrip("\b\r\n").split("\t")
			total += int(columns[1])

	return int(total)


def get_relative_cov_frac(matrix, window_size, genome_file, tables_dir, region):
	"""Compute which portions of the genome are represented in the windows.

	The genome size is read from "<genome_file>.lengths". From the rows of
	`matrix` belonging to `region` three sizes are derived, each capped at
	the genome size:
		analysed_size: distinct windows times window_size
		non_NA_analysed_size: distinct windows with Jaccard >= 0 times
			window_size
		informative_positions: sum of "Real_length" over the distinct
			windows with Jaccard >= 0
	Absolute values and percentages of the genome size are written to
	<tables_dir>/RES.<region>.relative_lengths.table, one
	"label<TAB>value" line each.

	Args:
		matrix: path of the tab-separated results table.
		window_size: window size in positions.
		genome_file: path whose ".lengths" companion holds the sequence
			lengths.
		tables_dir: directory receiving the output table.
		region: value of the "Feature" column to select.

	Returns:
		A tuple (status, path of the output table); status is False when
		anything went wrong, in which case the reason is printed on the
		standard error and the table may be missing.
	"""
	relative_lengths_file = "{0}/RES.{1}.relative_lengths.table".format(tables_dir, region)

	try:
		# get genome length
		# the path comes from the argument, not from a variable that
		# happens to exist at module level
		genome_size = get_genome_size("{0}.lengths".format(genome_file))

		# read lines excluding header
		df = pd.read_csv(matrix, sep="\t")
		df = df[df["Feature"]==region]
		# rows with a valid jaccard index, that is, those where J is >= 0
		valid = df[df["Jaccard"]>=0]

		# compute values
		# the total window length can be larger than the genome size because
		# the windows at the end of a scaffold may exceed the scaffold length
		tot_windows = df.loc[:,["Sequence", "W_start"]].drop_duplicates().shape[0]
		analysed_size = min(genome_size, int(tot_windows) * int(window_size))

		non_NA_windows = valid.loc[:,["Sequence", "W_start"]].drop_duplicates().shape[0]
		non_NA_analysed_size = min(genome_size, int(non_NA_windows) * int(window_size))

		# total positions represented in the windows with a valid Jaccard
		real_cov_positions = valid.loc[:,["Sequence", "W_start", "Real_length"]].drop_duplicates()["Real_length"].sum()
		informative_positions = min(genome_size, real_cov_positions)

		# compute relative covered fractions
		rel_analysed_size = float(analysed_size) / float(genome_size) * 100
		rel_non_NA_analysed_size = float(non_NA_analysed_size) / float(genome_size) * 100
		rel_informative_positions = float(informative_positions) / float(genome_size) * 100

		# write to file
		Rows = [	("genome_size", str(genome_size)),
				("rel_genome_size", str(float(100))),
				("analysed_size", str(analysed_size)),
				("rel_analysed_size", str(rel_analysed_size)),
				("non_NA_analysed_size", str(non_NA_analysed_size)),
				("rel_non_NA_analysed_size", str(rel_non_NA_analysed_size)),
				("informative_positions", str(informative_positions)),
				("rel_informative_positions", str(rel_informative_positions)) ]

		with open(relative_lengths_file, "w") as OUTPUT:
			OUTPUT.write("\n".join([ "\t".join(row) for row in Rows ]) + "\n")

		return (True, relative_lengths_file)

	except Exception as error:
		# the caller only receives a status: leave a trace of the reason
		sys.stderr.write("ERROR: could not compute the relative lengths of {0}: {1}\n".format(region, error))
		return (False, relative_lengths_file)


def run_coverage_plot(outdir, matrix, R_path, region, name, species_name, maxcov):
	"""Write and run the R script plotting the mean window coverage.

	The script is saved as
	<outdir>/scripts/<species_name>.<name>.<region>.cov.Rplot and run with
	`R_path`; its output goes to <outdir>/logs/<script name>.log.txt and
	<outdir>/logs/<script name>.stderr. The script reads `matrix`, keeps the
	rows with Jaccard >= 0 and saves the plot in <outdir>/svg and
	<outdir>/png.

	Args:
		outdir: plotting directory (scripts/, logs/, svg/ and png/ inside it).
		matrix: path of the table read by the R script.
		R_path: path or name of the Rscript executable.
		region: region label, used in title and file names.
		name: sample label, used in title and file names.
		species_name: species label, used in the file names.
		maxcov: upper limit of the coverage axis.

	Returns:
		The exit status of the R script.
	"""
	script_name = "{0}.{1}.{2}.cov.Rplot".format(species_name, name, region)
	script_path = "{0}/scripts/{1}".format(outdir, script_name)

	Script_lines = [
	'suppressWarnings(library(ggplot2))',
	'',
	'x <- read.table("{0}", header=T)'.format(matrix),
	'x$Frac_pos <- as.numeric(as.character(x$Frac_pos))',
	'x$Mean_cov <- as.numeric(as.character(x$Mean_cov))',
	'x <- x[x$Jaccard >= 0 , ]',
	'',
	'P1 <- ggplot() + ',
	'theme(plot.title=element_text(family="sans", face="bold", colour="grey20", size=13),',
	'axis.title.x=element_text(family="Helvetica", face="plain", colour="grey20", size=10),',
	'axis.title.y=element_text(family="Helvetica", face="plain", colour="grey20", size=10),',
	'axis.text.x=element_text(family="sans", face="plain", colour="grey20", size=11),',
	'axis.text.y=element_text(family="sans", face="italic", colour="grey20", size=11),',
	'legend.text=element_text(family="sans", face="italic", colour="black", size=9),',
	'legend.title=element_text(family="sans", face="plain", colour="black", size=9),',
	'panel.background=element_rect(fill="grey90", colour="grey90"),',
	'aspect.ratio=0.35) + ',
	'ggtitle("Coverage, {0}, {1}") +'.format(name, region),
	'xlab("") + ',
	'ylab("Mean coverage [x]") +',
	'geom_violin(data=x, mapping=aes(x=1, y=Mean_cov)) + ',
	'geom_jitter(data=x, mapping=aes(x=3, y=Mean_cov, col=Frac_pos)) + ',
	'scale_x_discrete(breaks=c(1,3), labels=c("","")) +',
	'scale_y_continuous(limits=c(0, {0})) + '.format(maxcov),
	'scale_color_gradient2(breaks=seq(0,100,20), limits=c(0,100), low="green3", high="darkorchid4", midpoint=50, name="% covered") +',
	'coord_flip()',
	'',
	'basename <- paste("{0}", "cov", "{1}", "{2}", sep=".")'.format(species_name, name, region),
	'',
	'svg(paste("{0}", "svg", paste(basename, "svg", sep="."), sep="/"))'.format(outdir),
	'plot(P1)',
	'dev.off()',
	'',
	'png(paste("{0}", "png", paste(basename, "png", sep="."), sep="/"), height=400, width=1000, pointsize=11, units="px", type="cairo", res=200)'.format(outdir),
	'suppressWarnings(plot(P1))',
	'dev.off()' ]

	# one script line per list item
	with open(script_path, "w") as script:
		script.writelines(text + "\n" for text in Script_lines)

	return run_cmd(	[ R_path, script_path ],
			"{0}/logs/{1}.log.txt".format(outdir, script_name),
			"{0}/logs/{1}.stderr".format(outdir, script_name) )


def run_fraction_plot(outdir, matrix, R_path, region,
			species_name, maxcov, hisat2_index, window_size, tables_dir):
	"""Write and run the R script drawing the "represented genome portions" bar plot.

	get_relative_cov_frac() is called first to obtain the table of relative
	lengths; the program exits if it reports failure. An R/ggplot2 script is
	then written to <outdir>/scripts/<species_name>.<region>.frac.Rplot and
	executed with R_path through run_cmd(). The script itself saves one SVG
	and one PNG under <outdir>/svg and <outdir>/png.

	Parameters:
	outdir		plot directory; its scripts/, logs/, svg/ and png/
			subfolders are used (they must already exist)
	matrix		table forwarded to get_relative_cov_frac()
	R_path		executable used to run the generated script
	region		region label, used in the title and in file names
	species_name	species label, used in the title and in file names
	maxcov		accepted for signature compatibility, not used here
	hisat2_index	forwarded to get_relative_cov_frac()
	window_size	forwarded to get_relative_cov_frac()
	tables_dir	forwarded to get_relative_cov_frac()

	Returns the value returned by run_cmd() for the R invocation.
	"""

	(status, relative_lengths_file) = get_relative_cov_frac(matrix, window_size, hisat2_index,
								tables_dir, region)

	if status == False:
		sys.exit("ERROR: plotting was unsuccessful\n\n")

	# R source, one list item per script line
	R_lines = [
	'suppressWarnings(library(ggplot2))',
	'',
	'Portion_labels=c(	"rel_genome_size"="Genome size",',
	'			"rel_analysed_size"="Covered by reads",',
	'			"rel_non_NA_analysed_size"="Featured in windows",',
	'			"rel_informative_positions"="Informative positions" )',
	'',
	'Portion_colors=c(	"rel_genome_size"="grey60",',
	'			"rel_analysed_size"="grey30",',
	'			"rel_non_NA_analysed_size"="grey30",',
	'			"rel_informative_positions"="red4" )',
	'',
	'y <- read.table("{0}")'.format(relative_lengths_file),
	'y <- y[c(2,4,6,8),]',
	'y$V1 <- factor(y$V1, levels=as.character(y$V1))',
	'y <- y[order(y$V1, decreasing=T),]',
	'colnames(y) <- c("Labels", "Values")',
	'y$Labels <- factor(y$Labels, levels=as.character(y$Labels))',
	'y$Values <- as.numeric(as.character(y$Values))',
	'',
	'P1 <- ggplot() + ',
	'theme(plot.title=element_text(family="sans", face="bold", colour="grey20", size=13),',
	'axis.title.x=element_text(family="Helvetica", face="plain", colour="grey20", size=10),',
	'axis.title.y=element_text(family="Helvetica", face="plain", colour="grey20", size=10),',
	'axis.text.x=element_text(family="sans", face="plain", colour="grey20", size=11),',
	'axis.text.y=element_text(family="sans", face="italic", colour="grey20", size=11),',
	'legend.text=element_text(family="sans", face="italic", colour="black", size=9),',
	'legend.title=element_text(family="sans", face="plain", colour="black", size=9),',
	'panel.background=element_rect(fill="grey90", colour="grey90"),',
	'aspect.ratio=0.25) + ',
	'ggtitle("{0}, {1}") +'.format(species_name, region),
	'xlab("") + ',
	'ylab("Represented genome portions [%]") +',
	'geom_bar(data=y, mapping=aes(x=Labels, y=Values, fill=Labels), width=0.45,',
	'	  stat="identity", position=position_dodge(), show.legend = FALSE) +',
	'scale_x_discrete(labels=Portion_labels) +',
	'scale_y_continuous(limits=c(0,100)) +',
	'scale_fill_manual(values=Portion_colors) + ',
	'coord_flip()',
	'',
	'basename <- paste("{0}", "frac", "{1}", sep=".")'.format(species_name, region),
	'',
	'svg(paste("{0}", "svg", paste(basename, "svg", sep="."), sep="/"))'.format(outdir),
	'plot(P1)',
	'dev.off()',
	'',
	'png(paste("{0}", "png", paste(basename, "png", sep="."), sep="/"), height=300, width=1000, pointsize=11, units="px", type="cairo", res=220)'.format(outdir),
	'suppressWarnings(plot(P1))',
	'dev.off()' ]

	Rplot = "{0}.{1}.frac.Rplot".format(species_name, region)
	script_path = "{0}/scripts/{1}".format(outdir, Rplot)

	# the context manager closes the script even if a write fails
	with open(script_path, "w") as OUTPUT:
		for line in R_lines:
			OUTPUT.write(line + "\n")

	cmd = [R_path, script_path]

	code = run_cmd(	cmd,
			"{0}/logs/{1}.log.txt".format(outdir, Rplot),
			"{0}/logs/{1}.stderr".format(outdir, Rplot) )

	return code


def run_jaccard_plot(outdir, matrix, R_path, region, species_name):
	"""Write and run the R script drawing the Jaccard index histogram.

	The generated R/ggplot2 script reads *matrix* (a table with a header
	and a "Jaccard" column), draws a histogram of that column expressed in
	percent of rows, and adds a vertical line at the mean of the values
	that are >= 0. The x-axis tick at -0.1 is labelled "NA".
	The script is written to <outdir>/scripts/<species_name>.<region>.jac.Rplot
	and executed with R_path through run_cmd(); it saves one SVG and one
	PNG under <outdir>/svg and <outdir>/png.

	Parameters:
	outdir		plot directory; its scripts/, logs/, svg/ and png/
			subfolders are used (they must already exist)
	matrix		path of the table read by the R script
	R_path		executable used to run the generated script
	region		region label, used in the title and in file names
	species_name	species label, used in the title and in file names

	Returns the value returned by run_cmd() for the R invocation.
	"""

	# R source, one list item per script line
	R_lines = [
	'suppressWarnings(library(ggplot2))',
	'',
	'x <- read.table("{0}", header=T)'.format(matrix),
	'',
	'P1 <- ggplot(data=x, mapping=aes(x=Jaccard))+ ',
	'theme(plot.title=element_text(family="sans", face="bold", colour="grey20", size=13),',
	'axis.title.x=element_text(family="Helvetica", face="plain", colour="grey20", size=10),',
	'axis.title.y=element_text(family="Helvetica", face="plain", colour="grey20", size=10),',
	'axis.text.x=element_text(family="sans", face="plain", colour="grey20", size=11),',
	'axis.text.y=element_text(family="sans", face="italic", colour="grey20", size=11),',
	'legend.text=element_text(family="sans", face="italic", colour="black", size=9),',
	'legend.title=element_text(family="sans", face="plain", colour="black", size=9),',
	'panel.background=element_rect(fill="grey90", colour="grey90"),',
	'aspect.ratio=0.15) + ',
	'ggtitle("{0}, {1}") +'.format(species_name, region),
	'xlab("Jaccard index [0,1]") +',
	'ylab("%") +',
	'geom_histogram(mapping=aes(y= ..count.. / sum(..count..) * 100), binwidth=0.1, color="grey50", fill="#FFEBCC") +',
	'geom_vline(xintercept=mean(as.numeric(as.character(x[x$Jaccard >= 0 , ]$Jaccard))), color="#562A72", size=0.5) +',
	'scale_x_continuous(limits=c(-0.16,1.06), breaks=seq(-0.1,1,0.1),',
	'        labels=c("NA", seq(0,1,0.1))) +',
	'scale_y_continuous()',
	'',
	'basename <- paste("{0}", "jac", "{1}", sep=".")'.format(species_name, region),
	'',
	'svg(paste("{0}", "svg", paste(basename, "svg", sep="."), sep="/"))'.format(outdir),
	'plot(P1)',
	'dev.off()',
	'',
	'png(paste("{0}", "png", paste(basename, "png", sep="."), sep="/"), height=300, width=1000, pointsize=11, units="px", type="cairo", res=200)'.format(outdir),
	'suppressWarnings(plot(P1))',
	'dev.off()' ]

	Rplot = "{0}.{1}.jac.Rplot".format(species_name, region)
	script_path = "{0}/scripts/{1}".format(outdir, Rplot)

	# the context manager closes the script even if a write fails
	with open(script_path, "w") as OUTPUT:
		for line in R_lines:
			OUTPUT.write(line + "\n")

	cmd = [R_path, script_path]

	code = run_cmd(	cmd,
			"{0}/logs/{1}.log.txt".format(outdir, Rplot),
			"{0}/logs/{1}.stderr".format(outdir, Rplot) )

	return code


def generate_plots(outdir, R_path, tables_dir, Names, Beds, species_name, maxcov, window_size):
	"""Run every plotting routine for each region and report overall success.

	For each key of Beds, the fraction plot and the Jaccard plot are built
	from the table of the first entry of Names, then one coverage plot is
	built per entry of Names. Tables are looked up as
	<tables_dir>/RES.<species_name>.<name>.<region>.txt.

	NOTE: hisat2_index is not a parameter; it is read from the module-level
	variable of that name, which must exist when this function is called.

	Returns True when at least one plot was run and every collected exit
	code equals 0, False otherwise (including when Beds is empty).
	"""

	table_pattern = "{0}/RES.{1}.{2}.{3}.txt"
	exit_codes = []

	for region in Beds:
		# region-wide plots are drawn from the first sample's table
		first_table = table_pattern.format(tables_dir, species_name, Names[0], region)
		exit_codes.append(run_fraction_plot(outdir, first_table, R_path, region,
							species_name, maxcov, hisat2_index,
							window_size, tables_dir))
		exit_codes.append(run_jaccard_plot(outdir, first_table, R_path, region, species_name))

		# one coverage plot per sample
		for sample in Names:
			sample_table = table_pattern.format(tables_dir, species_name, sample, region)
			exit_codes.append(run_coverage_plot(outdir, sample_table, R_path, region, sample,
								species_name, maxcov))

	# success means: something ran, and 0 is the only exit code observed
	return set(exit_codes) == {0}


def remove(file_path):
	"""Delete file_path if it exists.

	Returns True when the path existed and os.remove() was called on it,
	False when there was nothing at that path.
	"""
	if not os.path.exists(file_path):
		return False

	os.remove(file_path)
	return True


def cleanup_intermediate_files(mapping_dir, Names, Beds, cov_dir, tables_dir, species_name):
	"""Delete the per-sample intermediate files of the pipeline.

	For every entry of Names this removes <mapping_dir>/<name>.sam and
	<mapping_dir>/<name>.f.bam, plus <cov_dir>/<name>.<region>.depth for
	every key of Beds. Missing files are silently ignored by remove().
	tables_dir and species_name are accepted but not used.

	Always returns True.
	"""

	for sample in Names:
		leftovers = [
			"{0}/{1}.sam".format(mapping_dir, sample),
			"{0}/{1}.f.bam".format(mapping_dir, sample),
		]
		leftovers.extend("{0}/{1}.{2}.depth".format(cov_dir, sample, region) for region in Beds)

		for path in leftovers:
			remove(path)

	return True


### conditions ###

# Unless only the version was requested, the four core inputs must all be set.
_core_inputs = (args.species_name, args.reference, args.names, args.reads)
if any(value is None for value in _core_inputs) and (args.version == False):
	sys.exit("ERROR: one of --species-name, --reference, --names, --reads is missing\n\n")


# Region BED files and their names only make sense as a pair:
# refuse the run when exactly one of the two was provided.
if bool(args.region_beds) != bool(args.region_names):
	sys.exit("ERROR: --region-beds and --region-names have to be used together")


### main script ###

# Private helpers for the bookkeeping repeated by every pipeline step.

def _ensure_dir(path):
	"""Create the directory *path* (one level only) unless it already exists."""
	if not os.path.exists(path):
		os.mkdir(path)


def _absolute_path(path, cwd):
	"""Return *path* as is when it starts with "/", otherwise prefixed with *cwd*."""
	if path[0:1] == "/":
		return path
	return str(cwd) + "/" + str(path)


def _step_pending(donefile, restart):
	"""Tell whether a step has to run: its marker file is missing or *restart* is set."""
	return (not os.path.exists(donefile)) or (restart == True)


def _mark_done(donefile):
	"""Create the empty marker file recording that a step completed."""
	open(donefile, "w").close()


if __name__ == "__main__":

	# The pipeline runs as a sequence of steps. Each step works in a numbered
	# subfolder of --output-dir and leaves a "<step>.done" marker on success;
	# a step is skipped when its marker exists, unless args.restart is set.
	#
	# NOTE: this code must stay at module level (not inside a function):
	# generate_plots() reads hisat2_index as a global variable.

	### version ###
	sys.stderr.write(VERSION)
	if args.version:
		sys.exit(0)

	### begin ###
	command = " ".join(sys.argv)
	sys.stderr.write("\n\n{0}\n\n".format(command))

	sys.stderr.write("\n### BEGIN: {0} ###\n\n".format(at()))
	cwd = os.getcwd()
	script_dir = sys.path[0]
	# counter numbers the output subfolders (1_reference, 2_reads, ...)
	counter = 0

	### memory limit ###
	# args.max_mem is expected as a number of gigabytes, optionally suffixed with "G"
	maxmem = int(str(args.max_mem).strip("G"))*1e9
	memory_limit(maxmem)
	sys.stderr.write("[{0}] Set memory limit to: {1}G\n".format(at(), int(maxmem / 1e9)))

	try:

		### create output directory ###
		_ensure_dir(args.output_dir)

		### generate reference ###
		counter += 1
		outdir = "{0}/{1}_reference".format(args.output_dir, counter)
		donefile = "{0}/create_reference.done".format(outdir)
		_ensure_dir(outdir)

		if args.filter_reference:

			### filter reference ###
			if _step_pending(donefile, args.restart):
				status = filter_reference(	outdir,
								args.reference,
								args.window_size )

				if status == True:
					sys.stderr.write("[{0}] Sequences shorter than {1} removed from reference\n".format(at(), args.window_size))
					_mark_done(donefile)
				else:
					sys.exit("[{0}] ERROR: could not filter reference\n\n".format(at()))
			else:
				sys.stderr.write("[{0}] reference already filtered (skipping step)\n".format(at()))

		else:
			### linking the reference ###
			source_file = _absolute_path(args.reference, cwd)
			destination_file = "{0}/genome.fa".format(outdir)

			if _step_pending(donefile, args.restart):

				already_linked = os.path.exists(destination_file)

				if already_linked and not args.restart:
					sys.stderr.write("[{0}] reference already linked (skipping step)\n".format(at()))

				else:
					# on restart an existing link is replaced; previously this
					# branch could never be reached
					if already_linked:
						unlink_file(destination_file)

					code = link_file(source_file, destination_file)

					# check the result before recording the step as done,
					# so that a failed link is retried on the next run
					if code != 0:
						sys.exit("ERROR: The genome could not be linked properly\n\n")

					_mark_done(donefile)
			else:
				sys.stderr.write("[{0}] reference already linked (skipping step)\n".format(at()))

		reference_dir = outdir


		### build hisat2 indexes ###
		# no counter addition: indexes live next to the reference
		outdir = "{0}/{1}_reference".format(args.output_dir, counter)
		donefile = "{0}/build_hisat2_indexes.done".format(outdir)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):
			status = build_hisat2_indexes(	outdir,
							args.threads,
							args.hisat2_path,
							cwd	)
			if status == True:
				sys.stderr.write("[{0}] Indexes built for: {1}\n".format(at(), args.reference))
				_mark_done(donefile)
			else:
				sys.exit("[{0}] ERROR: could not generate index\n\n".format(at()))
		else:
			sys.stderr.write("[{0}] hisat2 indexes already generated (skipping step)\n".format(at()))

		# absolute path of the reference / index prefix used by later steps
		hisat2_index = _absolute_path("{0}/{1}_reference/genome.fa".format(args.output_dir, counter), cwd)


		### link read files ###
		counter += 1
		outdir = "{0}/{1}_reads".format(args.output_dir, counter)
		donefile = "{0}/link_read_files.done".format(outdir)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):
			Reads = generate_reads_dictionary(args.reads, args.names)
			status = link_read_files(	Reads,
							outdir		)

			if status == True:
				sys.stderr.write("[{0}] Read files linked properly in {1}\n".format(at(), outdir))
				_mark_done(donefile)
			else:
				# a bare sys.exit() would end silently with exit status 0
				sys.exit("ERROR: read files could not be linked\n")

		else:
			sys.stderr.write("[{0}] files already linked (skipping step)\n".format(at()))


		### estimate insert size ###
		if args.reads_type == "PE":
			# no counter cause it works within reads anyway
			outdir = "{0}/{1}_reads/isize_estimation".format(args.output_dir, counter)
			donefile = "{0}/isize_estimation.done".format(outdir)
			_ensure_dir(outdir)

			if _step_pending(donefile, args.restart):

				Reads = generate_reads_dictionary(args.reads, args.names)
				status = estimate_insert_size(	hisat2_index,
								Reads,
								args.hisat2_map_pars,
								args.hisat2_path,
								args.rscript_path,
								args.isize_read_num,
								args.isize_dist_width,
								outdir,
								args.threads,
								script_dir )

				if status == True:
					sys.stderr.write("[{0}] Insert size estimation complete\n".format(at()))
					_mark_done(donefile)
				else:
					sys.exit("ERROR: insert sizes could not be estimated\n")

			else:
				sys.stderr.write("[{0}] ins. size already estimated (skipping step)\n".format(at()))

			isize_file = "{0}/{1}_reads/isize_estimation/isize_ranges.table".format(args.output_dir, counter)

		elif args.reads_type == "SE":
			# in this case we don't need to estimate insert size, because there
			# is no insert
			# so we skip the step

			isize_file = None

		else:
			# without this branch isize_file would be undefined further down
			sys.exit("ERROR: --reads-type must be either PE or SE\n")


		### map reads ####
		counter += 1
		outdir = "{0}/{1}_mapping".format(args.output_dir, counter)
		donefile = "{0}/read_mapping.done".format(outdir)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			Reads = generate_reads_dictionary(args.reads, args.names)
			status = map_reads(	hisat2_index,
						Reads,
						args.reads_type,
						args.hisat2_map_pars,
						args.hisat2_path,
						isize_file,
						outdir,
						args.threads	)

			if status == True:
				sys.stderr.write("[{0}] Read mapping complete\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: reads could not be mapped\n")

		else:
			sys.stderr.write("[{0}] read mapping already performed (skipping step)\n".format(at()))

		mapping_dir = outdir


		### filter reads ###
		# no counter addition
		outdir = "{0}/{1}_mapping".format(args.output_dir, counter)
		donefile = "{0}/read_filtering.done".format(outdir)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			# samtools sort uses more memory than the one per thread
			# so I set it to 90% of the one specified by the user
			# to account for this
			maxmem = str(args.max_mem).strip("G")
			adj_maxmem = float(maxmem) * 0.90
			thread_mem_val = float(adj_maxmem) / float(args.threads)
			if thread_mem_val < float(1):
				# less than 1G per thread: fall back to a fixed 768M
				thread_mem = "768M"
			else:
				thread_mem = str(floor(thread_mem_val)) + "G"

			sys.stderr.write("[{0}] Sorting with {1} threads and {2} per thread\n".format(at(), args.threads, thread_mem))

			Reads = generate_reads_dictionary(args.reads, args.names)
			status = filter_reads(	args.samtools_filters,
						Reads,
						outdir,
						args.threads,
						thread_mem )

			if status == True:
				sys.stderr.write("[{0}] Mapping records filtering complete\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: mapping records could not be filtered\n")

		else:
			sys.stderr.write("[{0}] mapping records filtering already performed (skipping step)\n".format(at()))

		mapdir = outdir


		### parse region beds ###
		# no counter addition
		if (args.region_beds and args.region_names):
			Beds = generate_beds_dictionary(args.region_beds, args.region_names, cwd)
		else:
			Beds = {}

		if len(Beds) > 0:
			# keep a copy of every BED file inside the output directory
			_ensure_dir("{0}/files".format(args.output_dir))

			for region in Beds:
				dst = "{0}/files/{1}.bed".format(args.output_dir, region)
				copyfile(Beds[region], dst)

			sys.stderr.write("[{0}] BED files copied in {1}/files\n".format(at(), args.output_dir))


		### extract coverage ###
		counter += 1
		outdir = "{0}/{1}_coverage".format(args.output_dir, counter)
		# the marker path is derived before outdir is made absolute
		donefile = "{0}/extract_coverage.done".format(outdir)
		outdir = _absolute_path(outdir, cwd)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			Reads = generate_reads_dictionary(args.reads, args.names)
			status = extract_coverage(	Beds,
							Reads,
							outdir,
							mapdir )

			if status == True:
				sys.stderr.write("[{0}] Coverage extraction complete\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: coverage could not be extracted\n")

		else:
			sys.stderr.write("[{0}] cov. extraction already performed (skipping step)\n".format(at()))

		cov_dir = outdir


		### filter coverage file ###
		# no counter addition
		outdir = "{0}/{1}_coverage".format(args.output_dir, counter)
		donefile = "{0}/filter_coverage_file.done".format(outdir)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			status = filter_coverage_file(	cov_dir,
							args.names,
							Beds,
							args.min_cov	)

			if status == True:
				sys.stderr.write("[{0}] Coverage file filtering complete\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: coverage file could not be filtered\n")

		else:
			sys.stderr.write("[{0}] coverage file filtering already performed (skipping step)\n".format(at()))


		### get genome file ###
		# no counter
		outdir = reference_dir
		donefile = "{0}/get_genome_file.done".format(outdir)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			status = get_genome_file(	hisat2_index,
							outdir )

			if status == True:
				sys.stderr.write("[{0}] Genome file generated\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: couldn't generate genome file\n")

		else:
			sys.stderr.write("[{0}] Genome file already generated (skipping step)\n".format(at()))

		genome_file = _absolute_path("{0}/genome.fa.lengths".format(reference_dir), cwd)


		### run coverage analysis ###
		counter += 1
		outdir = "{0}/{1}_tables".format(args.output_dir, counter)
		donefile = "{0}/run_coverage_analysis.done".format(outdir)
		outdir_absolute = _absolute_path(outdir, cwd)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			status = run_coverage_analysis(	script_dir,
							args.species_name,
							cov_dir,
							outdir_absolute,
							args.names,
							args.region_beds,
							args.region_names,
							genome_file,
							args.window_size,
							args.threads,
							args.min_frac_pos,
							args.min_cov_pos,
							args.min_feat_length,
							args.min_cov,
							args.n_breaks )

			if status == True:
				sys.stderr.write("[{0}] Coverage analysis completed\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: couldn't analyse coverage by window\n")

		else:
			sys.stderr.write("[{0}] Coverage by window already analysed (skipping step)\n".format(at()))

		tables_dir = outdir_absolute


		### filter output tables ###
		# no counter addition
		outdir = "{0}/{1}_tables".format(args.output_dir, counter)
		donefile = "{0}/filter_output_tables.done".format(outdir)
		outdir_absolute = _absolute_path(outdir, cwd)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			status = filter_tables(	args.species_name,
						tables_dir,
						args.region_names,
						args.names,
						args.exclude)

			if status == True:
				sys.stderr.write("[{0}] Output tables filtered, removed sequences {1}\n".format(at(), str(args.exclude)))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: couldn't filter output tables\n")

		else:
			sys.stderr.write("[{0}] Output tables already filtered, removed sequences {1} (skipping step)\n".format(at(), str(args.exclude)))


		### create metrics file ###
		# no counter addition
		outdir = "{0}/{1}_tables".format(args.output_dir, counter)
		donefile = "{0}/generate_metrics_file.done".format(outdir)
		outdir_absolute = _absolute_path(outdir, cwd)
		_ensure_dir(outdir)

		if _step_pending(donefile, args.restart):

			status = generate_metrics_file(	args.species_name,
							args.output_dir,
							tables_dir,
							args.region_names,
							args.names,
							args.max_jacc_uniq,
							"{0}/RES.{1}.coverage_metrics.txt".format(outdir_absolute, args.species_name),
							args.window_size )
			if status == True:
				sys.stderr.write("[{0}] Coverage statistics computed\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: couldn't compute coverage statistics\n")

		else:
			sys.stderr.write("[{0}] Coverage statistics already computed (skipping step)\n".format(at()))


		### generate plots ###
		counter += 1
		outdir = "{0}/{1}_plots".format(args.output_dir, counter)
		donefile = "{0}/generate_plots.done".format(outdir)
		_ensure_dir(outdir)

		# folders the plotting functions write into
		Subdirs = ["png", "svg", "scripts", "logs"]
		for subdir in Subdirs:
			_ensure_dir("{0}/{1}".format(outdir, subdir))

		outdir_absolute = _absolute_path(outdir, cwd)

		if _step_pending(donefile, args.restart):

			status = generate_plots(outdir_absolute,
						args.rscript_path,
						tables_dir,
						args.names,
						Beds,
						args.species_name,
						args.max_plot_cov,
						args.window_size )

			if status == True:
				sys.stderr.write("[{0}] Plots were generated\n".format(at()))
				_mark_done(donefile)
			else:
				sys.exit("ERROR: couldn't generate plots\n")

		else:
			sys.stderr.write("[{0}] Plots were already generated (skipping step)\n".format(at()))


		### cleanup ###
		outdir = args.output_dir
		donefile = "{0}/cleanup.done".format(outdir)

		if args.cleanup == True:

			if _step_pending(donefile, args.restart):

				status = cleanup_intermediate_files(	mapping_dir,
									args.names,
									Beds,
									cov_dir,
									tables_dir,
									args.species_name	)

				if status == True:
					sys.stderr.write("[{0}] Intermediate files deleted\n".format(at()))
					_mark_done(donefile)
				else:
					sys.exit("ERROR: couldn't delete intermediate files\n")

			else:
				sys.stderr.write("[{0}] Intermediate files were already deleted (skipping step)\n".format(at()))

		else:
			# nothing was deleted, so do not claim the files are gone
			sys.stderr.write("[{0}] Cleanup not requested, intermediate files kept (skipping step)\n".format(at()))

	except MemoryError:
		sys.stderr.write('\n\nERROR: Memory Exceeded\n\n')
		sys.exit(1)


	### end ###
	sys.stderr.write("\n### END: {0} ###\n\n".format(at()))

	sys.stderr.write("--- TIME: %s seconds ---\n\n\n" % (time.time() - start_time))