diff --git a/NLGenomeSweeper b/NLGenomeSweeper
index fa49b20..c50c7b8 100755
--- a/NLGenomeSweeper
+++ b/NLGenomeSweeper
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import subprocess
 import sys
@@ -48,9 +48,12 @@ profiler_file='data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa'
 # Output file names
 annotated_candidates = 'Annotated_candidates.bed'
 unannotated_candidates = 'All_candidates.bed'
-annotated_gff= 'All_candidates.gff3'
-consensus_file='Species_specific_consensus.fa'
-filter_candidates='Filtered_candidates.bed'
+annotated_gff = 'All_candidates.gff3'
+consensus_file ='Species_specific_consensus.fa'
+filter_candidates ='Filtered_candidates.bed'
+
+# We'll look to merge domains across introns smaller than this
+max_intron_len = 1000
 
 # Append the contents of an input file to an output file
 def append_file(input_file, output_file):
@@ -62,7 +65,9 @@ def append_file(input_file, output_file):
 
 ##########################################################################
 #
-# identify_candidates_proteins
+# DEPRACATED - no protein search done
+#
+# identify_candidates_proteins 
 #
 # Arguments:
 # program_dir - directory of this program
@@ -85,7 +90,7 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir):
 	os.mkdir(outdir + search_folder + protein_search_folder)
 	subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir, protein, gff,
 		program_dir + nbarc_hmm,
-		outdir + search_folder + protein_search_folder, program_dir), shell=True)
+		outdir + search_folder + protein_search_folder, program_dir), shell=True, check=True)
 	shutil.copy(outdir + search_folder + protein_search_folder + annotated_candidates, 
 		outdir + annotated_candidates)
 	
@@ -95,11 +100,6 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir):
 		subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir,protein,gff,
 			hmm,outdir + search_folder + protein_custom_folder, program_dir), shell=True)
 		# Concatenate to the pfam search results
-		#annotated1 = pd.read_csv(outdir + search_folder + protein_custom_folder +annotated_candidates, 
-		#	sep='\t',header=None)
-		#annotated2 = pd.read_csv(outdir + '/Annotated_candidates.bed', sep='\t',header=None)
-		#annotated = annotated1.append(annotated2, ignore_index=True).drop_duplicates()
-		#annotated.to_csv(outdir + '/Annotated_candidates.bed', sep='\t', header =False, index = False )
 		append_file(outdir + search_folder + protein_custom_folder + annotated_candidates, 
 			outdir + annotated_candidates)
 
@@ -154,7 +154,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in
 #			outdir + search_folder + genome_search_folder + 'candiates_sites.bed')
 	# Run the genome based search
 	subprocess.run('{}/scripts/genome_search.sh {} {} {} {} {}'.format(program_dir,genome, 
-		outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True)
+		outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True, check=True)
 	#shutil.copy(outdir + search_folder + genome_search_folder + unannotated_candidates, 
 	#	outdir + unannotated_candidates)
 	shutil.copy(outdir + search_folder + genome_search_folder + consensus_file,
@@ -187,7 +187,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in
 ##########################################################################
 def run_interproscan(program_dir, outdir, t):
 	subprocess.run('{}/scripts/region_interproscan.sh {} {} {}'.format(program_dir, outdir + domain_folder, t, program_dir),
-		shell=True)
+		shell=True, check=True)
 	#shutil.copy(outdir + domain_folder + annotated_gff, 
 	#	outdir + annotated_gff)
 	#shutil.copy(outdir + domain_folder + unannotated_candidates,
@@ -214,12 +214,14 @@ def main(argv):
 		help="A genome sequence in fasta format to search.")
 	
 	input_group = parser.add_argument_group('Input arguments')
+# DEPRACATED - no protein search
 #	input_group.add_argument("-protein", metavar="<fasta file>",  type=str, default = None,
 #		help="Protein sequences in fasta format.")
 #	input_group.add_argument("-gff", metavar="<gff3 file>", type=str, default = None,
 #		help="gff3 annotation of the genome. Required if searching protein sequences.")
 	input_group.add_argument("-consensus", metavar="<fasta file>", type=str, default = None,
-		help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional).")
+		help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional)." + 
+			"It is not recommended to use this option since species specific consensuses will be created.")
 #	input_group.add_argument("-hmm", metavar="<file>",  type=str, default = None,
 #		help="Also search the protein file using a custom NB-ARC HMM.")
 	
@@ -279,7 +281,7 @@ def main(argv):
 		# Search protein file using hmmer
 ##		identify_candidates_proteins(program_dir,args.protein, args.gff, args.hmm, args.outdir )
 	# Search the genome using blast
-	identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, 1000)
+	identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, max_intron_len)
 	# Define ORFs and domains using interproscan
 	run_interproscan(program_dir, args.outdir + nlg_folder, args.t)
 
diff --git a/data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa b/data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa
old mode 100644
new mode 100755
diff --git a/data/Vitis_vinifera_NB-ARC_consensus.fa b/data/Vitis_vinifera_NB-ARC_consensus.fa
old mode 100644
new mode 100755
diff --git a/data/combined_consensus.fa b/data/combined_consensus.fa
old mode 100644
new mode 100755
index 7753c81..9039e8c
--- a/data/combined_consensus.fa
+++ b/data/combined_consensus.fa
@@ -4,52 +4,27 @@ TAKKEDLFKDILKELAKEADDTSVEKEESELKRKLRRLLLRKRFLLVLDDVWEEEDWEKI
 GVPLPAKENRLRVLLTTRDEEVANAVSAKSEKIEVELLEKDEAWELLENKVFEKELSEDP
 DLEEVAKEIVEKCKGLPLALKVIGGLLATKSTVKEWEKVAEQLNNKLAESRSELNEVLSI
 LSLSYESLPEALKRCFLYLSLFPEDVEIKVKRLVKLWSAEGFVESEDI
->vvinCNL_custom_NB-ARC-consensus.genomewide
+>vvinCNL-consensus
 LLLLLEEEEEQLSVISIVGMGGLGKTTLAKLVYNDEEVKKHFDLRAWVCVSEEFDVKELL
 KEILKSLKSEKLESMELEELQEKLKELLKEKRYLLVLDDVWNEDKWEELKEALKEGAKGS
 RIIVTTRNEEVASIMKTISPYELEKLSEEESWSLFKKKAFKEEEEEECPELEELGKEIVK
 KCKGLPLAIKALGGLLSSKKDEEEWSKVLDSELWELEEEESSILKILKLSYNDLPSHLKQ
 CFLYCSLFPKDYEIKKEELIQLWMAEGFVQ
->vvinNL_custom_NB-ARC-consensus.genomewide
+>vvinNL-consensus
 EEILEALKDDNINMIGVYGMGGVGKTTLVKEVAKKVKEKKLFDEVIIVTVSKKPDLRKIQ
 EEIADKLKLKLEKEESESERADKLRERLKKSKKILLILDDVWEKLDLEEVGIPEEDKEKG
 CKIVLTTRNRDVCSEMKTQKEIKLEVLSEEEAWKLFEKKVGDVLNSPNLKSIAKEVVKEC
 AGLPLAIVTVAKALKNKKDVEVWEDALQQLRRSALKNIKGMEEKVYSALKLSYNKLESEE
 LKSCFLFCALFPEDYEIEIEELIEYLIAEKLI
->vvinRPW8_custom_NB-ARC-consensus.genomewide
+>vvinRPW8-consensus
 KELKELLLKEEEKLLLLTALGGCGKTTLVKKLCKDDEIKGIFKDNILFVNVSKTPNLKVI
 VQKLFKYKKEEELEFQSDEDAINQLEQLLNKIGSNPILLILDDVWPGSESLLEKFKFDIP
 KYKILVTSRTAFPRFKFTYELKPLNDEDAMSLFRHSASLQDGSSYIPEEVIKKIVRGCGG
 LPLALKVIGGSLREQKAEVWLSKLEKWSEGESIFESDEELLDCLQKSLEFSDDKVILKEC
 FLDLGSFPEDQRIPVAALIDMWA
->vvinTNL_custom_NB-ARC-consensus.genomewide
+>vvinTNL-consensus
 ELESLLDIESNDVRLVGILGLGGIGKTTLAKAVYNKISKKFEASSFLENVREKSKKEGLV
 KLQEQLLSEILKEKELKVKNVSEGINVIKKRLRKKKVLLVLDDVDKLEQLEKLAGKKDWF
 GSGSRIIITTRDKHLLKTLEVDEIYEVKELNKDEALELFSLKAFKKEKPEEDYLELSKKV
 VKYAKGLPLALKVLGSDLFGRSIDEWKSALEKLKEIPEKEILEILKISYDGLEETEKEIF
 LDIACFFKG
-
->vvinCNL_custom_NB-ARC-consensus
-ELEELGKKLLANEGGEKLLVISIVGMGGLGKTTLAQKVYNDKHVKTHFDLRAWVCVSDEF
-DLKEILKKILESVSKSSESEDLEELQEKLKELLKEKRFLLVLDDVWNEDEKWEELKTLLP
-DGANGSKIIVTTRNKSVASVMKTSSIHELKLLSDEESWSLFTKKAFENGESSAPPELEEI
-GKEIVKKCGGLPLAIKALGGLLSSKKEEEEWEKVLNSEIWDLEEEEEILPALKLSYNDLP
-SHLKRCFLYCSLFPKDYEIKKEELILLWVAEGLVQE
->vvinTNL_custom_NB-ARC-consensus
-EELKSLLKLESNDVRMVGIYGIGGIGKTTIAKAIYNKISAQFEGVSFLENVREKSKKKGL
-LQLQKKLLKDILKEKKLKVSNVEEGLKVIKKRLSSKKVLIVLDDVDELEQLEALAGEKDW
-FGKGSRIIITTRDKELLEEHEVDELYEVKKLNKEEALELFSLYAFKQELPKEDYEELSKE
-IVEYAKGLPLALKVLGSLLFKKTIKEWESELEKLKKEPEKEIQNVLKISYDGLDETEKEI
-FLDIACFFKG
->vvinNL_custom_NB-ARC-consensus
-EKVMEALEDEKVRVIGIVGMGGVGKTTLVKQVYNDAKVKKLFDLVIWVTVSKEFDLEKIQ
-KEILEKLGLKDEKSESEEEKAEELKERLKEKKFLLVLDDVWEELDLEKLGIPLPDDKNGS
-KIVLTTRSEDVCKEMEAQKKIELELLSEEEAWELFKKKAGESVESHPELEELAEEVVKEC
-KGLPLAIVTLGRALKSKKTVEEWEKALEELKSSLSENIMEDKVLPVLKLSYDSLPSELKS
-CFLYCALFPEDYEIEKEELIELWIAEGLLQ
->vvinRPW8_custom_NB-ARC-consensus
-KELKKLLFKDDESRIVVSAPGGCGKTTLAKKLCKDQQVKEYFKDSILYVTVSKTANLIEI
-IKKLLKENAEEVKEFQNEEDAVNQLERKLKRKVESGRILLVLDDVWSGSESVLEKLKFKI
-SELKVLVTSRNEFPKFGSTYELELLEEEEAKELFRHSAIEEDGSEDSDLDEELVKEIVRR
-CKGLPLALEVVGRSLAGKPVEIWRSTLEKLSEGESIVESEDELLECLQSSLDALDDKDML
-KECFLDLGSFPEDKKIPVTALIDLWA
diff --git a/scripts/createProfile.sh b/scripts/createProfile.sh
index 65c8274..f6271a4 100755
--- a/scripts/createProfile.sh
+++ b/scripts/createProfile.sh
@@ -3,7 +3,23 @@
 #
 # Create custom consensus sequences based on domains desired.
 #
+# This program takes potential domains identified in the genome and compares
+# them to known high quality consnesus sequences of that domain. For each
+# potential domain the closest reference sequence if found and sequences are
+# grouped by their closest reference. The grouped sequences are aligned
+# and a protein consensus sequence is created for each group.
+# 
+# Inputs:
+# $1: Fasta file of sequences containing domain extracted from the genome
+# $2: output directory
+# $3: directory of this program
+# $4: prefix of output file
+# $5: Consensus sequences in fasta format, protein or nucleotide
+# $6: Format of the consensus sequences, 'prot' or 'nucl'
 #
+# Outputs:
+# $outputdir/$prefix.fa: Fasta file of species specific consensus sequences, proteins
+# 
 
 #hmm=$1
 sequences=$1
@@ -56,5 +72,5 @@ do
 	fi
 done
 cat $outputdir/consensus_clusters.*.profile.fa > $outputdir/consensus_clusters.fa
-remove_smalls $outputdir/consensus_clusters.fa $outputdir/species_specific_domains.fa 200
+remove_smalls $outputdir/consensus_clusters.fa $outputdir/$prefix.fa 200
 
diff --git a/scripts/find_genome_candidates.sh b/scripts/find_genome_candidates.sh
index d496f22..2d3a243 100755
--- a/scripts/find_genome_candidates.sh
+++ b/scripts/find_genome_candidates.sh
@@ -1,7 +1,28 @@
 #!/bin/bash
 
+
+#
+# Find candidate domains in a genome with blast.
 #
+# This is a subroutine for finding blast hits of a consensus domain in 
+# a genome sequence. The hits are merged across small gaps as well as
+# introns of a custom length. Hits must span at least 80% of the
+# original domain sequence.
+# 
+# Inputs:
+# $1: genome file in fasta format to search
+# $2: output directory
+# $3: number of cores to use
+# $4: Format of the consensus sequences, 'prot' or 'nucl'
+# $5: Consensus sequences in fasta format, protein or nucleotide
+# $6: The maximum size (bp) of introns
+# $7: directory of this program
 #
+# Outputs:
+# $outputdir/$outname.fa: Fasta sequences of candidate domain found in genome
+# 
+
+
 
 genome=$1
 outputdir=$2
@@ -24,7 +45,7 @@ elif [ $format == 'nucl' ]; then
                 -evalue 1e-4 -num_threads $cores
 fi
 
-# Merge candidates within 10 bp and then combine across introns (default <500bp)
+# Merge candidates within 10 bp and then combine across introns
 awk '{if ($9>$10){tmp=$9;$9=$10;$10=tmp} print $2,$9,$10}' $outputdir/genome_blast.txt |sed 's/ /\t/g'  \
         | bedtools sort | bedtools merge -d 10 | awk '{print $1,$2,$3,$3-$2}' |sed 's/ /\t/g' > $outputdir/genome_blast.merge.txt
 merge_exons $outputdir/genome_blast.merge.txt $outputdir/genome_blast.merge2.txt $intron
diff --git a/scripts/genome_search.sh b/scripts/genome_search.sh
index 01cece5..126971d 100755
--- a/scripts/genome_search.sh
+++ b/scripts/genome_search.sh
@@ -1,14 +1,26 @@
 #!/bin/bash
 
 #
-# Search a genome file for NBS-LRR genes using consensus domain sequence(s).
+# Search a genome file for NBS-LRR genes using consensus domain sequence(s) of the NB-ARC domain.
 #
+# A two pass method is used. The first pass uses blast to find candidate sequences in the
+# genome based on the reference consensus sequences. These sequences are used to build
+# species specific consensus sequences for each class of NBS-LRR genes. A second
+# pass is done to identify candidate sequences using blast with the species specific consensus
+# sequences. The output is the candidate sites of the NB-ARC domains and a fasta file of
+# the sites plus 10 kb of flanking sequence for use with interproscan.
+# 
+# Inputs:
 # $1: genome file in fasta format to search
 # $2: output directory
 # $3: directory of this program
 # $4: number of cores to use
 # $5: The maximum size (bp) of introns
 #
+# Outputs:
+# $outputdir/candidate_sites.bed Location of candidate NB-ARC domain sequences found
+# $outputdir/Candidate_sites.with_flanking.fa Sequences of candidate sites extended by 10 kb
+# 
 
 genome=$1
 outputdir=$2
@@ -18,8 +30,10 @@ intron=$5
 
 source $programdir/scripts/functions.sh
 
+# Log files
 sdout=$outputdir/NLGenomeSweeper.out
 errout=$outputdir/NLGenomeSweeper.err
+# Prefix for output files
 outname="All_candidates"
 
 function exit_error {
@@ -45,15 +59,15 @@ Any problems can be submitted via the GitHub page https://github.com/ntoda03/NLG
 " | tee -a $sdout
 
 
-## Check dependencies
+## Check dependencies, exit if not found
 echo "Checking software requirements..." | tee -a $sdout
 DEPENDENCIES=(samtools bedtools blastp blastx TransDecoder.LongOrfs TransDecoder.Predict muscle interproscan hmmbuild)
 dep_flag=1
 for dependency in "${DEPENDENCIES[@]}"
 do
 	if ! which $dependency &> /dev/null; then
-    		echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout
-		depflag=0
+		echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout
+		dep_flag=0
 	fi
 done
 if [ $dep_flag -eq 0 ]; then
@@ -84,7 +98,7 @@ fi
 echo "Creating species and class specific custom sequences..." | tee -a $sdout
 mkdir -p $outputdir/profiler
 $programdir/scripts/createProfile.sh $outputdir/first_pass/$outname.fa $outputdir/profiler $programdir \
-	species_specific $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout
+	species_specific_domains $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout
 if [ -s "$outputdir/profiler/species_specific_domains.fa" ]
 then
    echo -e "Custom profiles complete.\n" | tee -a $sdout
diff --git a/scripts/mergeBlastHits.R b/scripts/mergeBlastHits.R
new file mode 100755
index 0000000..75a6293
--- /dev/null
+++ b/scripts/mergeBlastHits.R
@@ -0,0 +1,38 @@
+
+options(stringsAsFactors = FALSE)
+
+m_opt = c(commandArgs()[3:length(commandArgs())])
+
+infile=m_opt[1]
+outfile=m_opt[2] 
+gap=as.numeric(m_opt[3])
+blasttype=m_opt[4]
+method=m_opt[5]
+
+mytab=read.table(infile,sep="\t")
+
+check=(nrow(mytab)-1)
+j=1
+while( j<=check ){
+#while( mytab$V2[j] != "000094F_pilon"){
+  if( mytab$V2[j]==mytab$V2[j+1] && (mytab$V1[j]==mytab$V1[j+1] || blasttype=='any') ){
+    if( (method=="overlap") && ( (abs(mytab$V9[j+1]-mytab$V9[j])<gap) && (abs(mytab$V10[j+1]-mytab$V10[j])<gap) ) ){
+      mytab$V10[j]=mytab$V10[j+1]
+      mytab=mytab[-(j+1),]
+    } else if( (method=="flanking") && ( (abs(mytab$V9[j+1]-mytab$V10[j])<gap) ) ){
+      mytab$V10[j]=mytab$V10[j+1]
+      mytab$V3[j]=((mytab$V3[j]*mytab$V4[j])+(mytab$V3[j+1]*mytab$V4[j+1]))/(mytab$V4[j]+mytab$V4[j+1])
+      mytab$V4[j]=mytab$V4[j]+mytab$V4[j+1]
+      mytab=mytab[-(j+1),]
+    } else if( (method=="anyoverlap") && (mytab$V9[j+1] <= mytab$V10[j]) ){
+      mytab$V10[j]=mytab$V10[j+1]
+      mytab=mytab[-(j+1),]
+    } else{
+      j=j+1}
+  }
+  else{
+    j=j+1}
+  check=(nrow(mytab)-1)
+}
+write.table(mytab,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE)
+
diff --git a/scripts/mergeExons.R b/scripts/mergeExons.R
new file mode 100755
index 0000000..3b0dc4c
--- /dev/null
+++ b/scripts/mergeExons.R
@@ -0,0 +1,25 @@
+
+options(stringsAsFactors = FALSE)
+
+m_opt = c(commandArgs()[3:length(commandArgs())])
+
+infile=m_opt[1]
+outfile=m_opt[2] 
+gap=as.numeric(m_opt[3])
+
+mytab=read.table(infile,sep="\t")
+
+check=(nrow(mytab)-1)
+j=1
+while( j<=check ){
+  if( (mytab$V1[j] == mytab$V1[j+1]) && (abs(mytab$V2[j+1] - mytab$V3[j]) <= gap) ){
+      mytab$V3[j] = mytab$V3[j+1]
+      mytab$V4[j] = mytab$V4[j] + mytab$V4[j+1]
+      mytab = mytab[-(j+1),]
+  }
+  else{
+    j=j+1}
+  check=(nrow(mytab)-1)
+}
+write.table(mytab,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE)
+
diff --git a/scripts/region_interproscan.sh b/scripts/region_interproscan.sh
index 280dca9..ab6215c 100755
--- a/scripts/region_interproscan.sh
+++ b/scripts/region_interproscan.sh
@@ -3,10 +3,17 @@
 #
 # Predict ORFs and domains of a fasta nucleotide sequence file.
 #
-# $1: output directory
-# $2: number of cores to use
+# Interproscan is run on sequences containing NBS-LRR domains and
+# flanking sequences.
+#
+# Inputs:
+# $1: Output directory
+# $2: Number of cores to use
 # $3: Directory of the program
 #
+# Outputs:
+# $outputdir/../All_candidates.gff3: Annotation file of domains for potential NBS-LRR genes
+#
 
 outputdir=$1
 cores=$2
@@ -25,80 +32,74 @@ function exit_error {
         exit 1
 }
 
+
 #trap 'rv=\$?; echo "Error \$rv" >> $sdout' ERR
 #trap 'rv=\$?; echo "Exit \$rv" >> $sdout'  EXIT
 
-seq_num=$(grep ">" $outputdir/Candidate_sites.with_flanking.fa |wc -l)
-seq_num=$(($seq_num-1))
-## Split the file because interproscan has problems with too many sequences
-split_fasta $outputdir/Candidate_sites.with_flanking.fa $outputdir/Candidate_sites.with_flanking.fa_chunk_0000
+function check_mem_usage {
+    local free_mem_MB=$(free --mega | awk 'NR == 2 {print $4}')
+    echo $free_mem_MB
+}
 
-flag=0
-run_cycle=0
-declare -a bg_processes=()
-file_num=$seq_num
 
-echo -e "\nRunning interproscan domain and ORF identification..."  | tee -a $sdout
-while [[ "$run_cycle" -lt 10 && "$flag" -eq 0 ]]
-do
-    ## interproscan is run in the background, number of cores limits number of instances running
-    for i in $(eval echo "{0..$seq_num}");
-    do
-        if [ ! -f $outputdir/Candidate_sites_interpro.$i.gff3 ]; then
-            if [ -f $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} ]; then
-                echo -e "Launching $i/$file_num" >> $sdout
-                #echo -ne "$i/$filenum"'\r' | tee -a $sdout
-                interproscan -appli PANTHER,Gene3D,Pfam,SMART,Coils -i $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} \
-                    -t n -o $outputdir/Candidate_sites_interpro.$i.gff3 -f GFF3 > $sdout.interproscan_$i 2> $errout.interproscan_$i &
-                sleep 20
-                running_job=$!
-                bg_processes+=($running_job)
-            fi
-        fi
-        # Handle running background processes
-        while [ "${#bg_processes[@]}" -eq "$cores" ]; do
-            echo "Reached max number of jobs. Waiting..." >> $sdout
-            index_run=0
-            for pc_id in "${bg_processes[@]}"; do
-                curr_status=$(ps aux | awk -v pc_id=$pc_id '$2==pc_id {print $11}')
-                curr_runtime=$(ps -o etime= -p $pc_id | tr '-' ':' | \
-                    awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}')
-                # Process is no longer running, remove from list
-                if [ -z "$curr_status" ]; then
-                    echo -e "Job complete" >> $sdout
-                    unset bg_processes[$index_run]
-                    bg_processes=( ${bg_processes[@]} )
-                    break
-                # Process has run for over an hour, interproscan is hanging, kill process
-                elif [ "$curr_runtime" -gt 3600 ]; then
-                    echo -e "Process with status $curr_status killed for hanging" >> $sdout
-                    unset bg_processes[$index_run]
-                    bg_processes=( ${bg_processes[@]} )
-                    break
-                fi
-                index_run=$((index_run+1))
-            done
-            if [ "${#bg_processes[@]}" -eq "$cores" ]; then
-                sleep 20
+function submit_interproscan_job {
+    local outputdir=$1
+    local i=$2
+
+    local running_job=0
+
+    # File hasn't been run yet
+    if [ ! -f $outputdir/Candidate_sites_interpro.$i.gff3 ]; then
+        # The input file exists
+        if [ -f $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} ]; then
+            # Only submit new jobs if there is at meast 3G of ram free
+            free_mem_MB=$(check_mem_usage)
+            if [ $free_mem_MB -lt 3000 ]; then
+                echo "High memory usage. Waiting..." >> $sdout
+                while [ $free_mem_MB -lt 3000 ]; do
+                    sleep 10
+                    free_mem_MB=$(check_mem_usage)
+                done
             fi
-        done
-        # Limit memory consumption
-        curr_memusage=$(free -m | awk 'NR==2 {printf "%s", $3/$2}')
-        if [ $(echo "$curr_memusage > 0.8" |bc -l) -ne 0 ]; then
-            echo "High memory usage. Waiting..." >> $sdout
-            while [ $(echo "curr_memusage > 0.8" |bc -l) -ne 0 ]; do
-                sleep 10
-                curr_memusage=$(free -m | awk 'NR==2 {printf "%s", $3/$2}')
-            done
+            echo -e "Launching $i/$file_num" >> $sdout
+            interproscan -appli PANTHER,Gene3D,Pfam,SMART,Coils -i $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} \
+                -t n -o $outputdir/Candidate_sites_interpro.$i.gff3 -f GFF3 > $sdout.interproscan_$i 2> $errout.interproscan_$i &
+            running_job=$!
+            sleep 20
         fi
-    done
-    echo "All jobs submitted. Waiting..." >> $sdout
-    wait
+    fi
+
+    echo $running_job
+}
+
+function handle_bg_jobs {
+    local pc_id=$1
+
+    local remove_process=0
+
+    curr_status=$(ps aux | awk -v pc_id=$pc_id '$2==pc_id {print $11}')
+    curr_runtime=$(ps -o etime= -p $pc_id | tr '-' ':' | \
+        awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}')
+
+    # Process is no longer running, remove from list
+    if [ -z "$curr_status" ]; then
+        echo -e "Job complete" >> $sdout
+        remove_process=1
+    elif [ "$curr_runtime" -gt 6400 ]; then
+        echo -e "Process with status $curr_status killed for hanging" >> $sdout
+        remove_process=1
+    fi
+
+    echo $remove_process
+}
+
+
+function verify_interproscan_run {
+    local outputdir=$1
+    local seq_num=$2
+
+    local flag=1
 
-    ## Check to make sure interproscan run on all files, this can be a problem sometimes
-    echo "Checking output files." >> $sdout
-    flag=1
-    run_cycle=$((run_cycle+1))
     for i in $(eval echo "{0..$seq_num}");
     do
         if [ ! -f $outputdir/Candidate_sites_interpro.$i.gff3 ]; then
@@ -113,80 +114,169 @@ do
     else
         echo -e "Some sequences did not execute. Retrying..." >> $sdout
     fi
-done
 
+    echo $flag
+}
+
+
+function run_interproscan {
+    local outputdir=$1
+    local seq_num=$2
+
+    flag=0
+    run_cycle=0
+    file_num=$seq_num
+    declare -a bg_processes=()
+
+    while [[ "$run_cycle" -lt 10 && "$flag" -eq 0 ]]
+    do
+        ## interproscan is run in the background, number of cores limits number of instances running
+        ## Although it has a multithread option, that does not take advantage of the cpus
+        for i in $(eval echo "{0..$seq_num}");
+        do
+            running_job=$(submit_interproscan_job $outputdir $i)
+            if [ $running_job -ne 0 ]; then
+                bg_processes+=($running_job)
+            fi
+            # Handle running background processes
+            while [ "${#bg_processes[@]}" -eq "$cores" ]; do
+                echo "Reached max number of jobs. Waiting..." >> $sdout
+                index_run=0
+                for pc_id in "${bg_processes[@]}"; do
+                    remove_process=$(handle_bg_jobs $pc_id)
+                    if [ $remove_process -eq 1 ]; then
+                        unset bg_processes[$index_run]
+                        bg_processes=( ${bg_processes[@]} )
+                    fi
+                    index_run=$((index_run+1))
+                done
+                if [ "${#bg_processes[@]}" -eq "$cores" ]; then
+                    sleep 20
+                fi
+            done
+        done
+        echo "All jobs submitted. Waiting..." >> $sdout
+        wait
+
+        ## Check to make sure interproscan run on all files, this can be a problem sometimes
+        echo "Checking output files." >> $sdout
+        flag=$(verify_interproscan_run $outputdir $seq_num)
+        run_cycle=$((run_cycle+1))
+    done
+}
+
+
+# Convert gff files and filter for presence of Leucine-rich_repeats
+function convert_interpro_names {
+    local outputdir=$1
+    local seq_num=$2
+
+    for i in $(eval echo "{0..$seq_num}"); do
+        # Combine the files, renaming domains of interest to their common names
+        cat $outputdir/Candidate_sites_interpro.$i.gff3 | sed 's/ /__/g' | \
+            awk 'NF>3 {
+                split($1,a,":");split(a[2],b,"-");$1=a[1];
+                if ($3=="ORF"){start=$4;stop=$5;strand=$7;$4=$4+b[1];$5=$5+b[1];} 
+                else if( $3=="protein_match" || $3=="polypeptide" ){ 
+                    if(strand=="+"){$4=$4*3+b[1]+start;$5=$5*3+b[1]+start;}
+                    else if(strand=="-"){$7=strand;tmp1=b[1]+stop-$5*3+1;tmp2=b[1]+stop-$4*3+1;$4=tmp1;$5=tmp2}
+                }
+                if($3!="nucleic_acid"){print $0} }' | sort -k1,1 -k 4,4 -k 5,5 |uniq |sed 's/ /\t/g' | \
+            sed 's/[^ \t]*Name=\|[^ \t]*ID=/Name=/g' | sed 's/;.*//g' | uniq  | \
+            sed 's/PF07725\|SM00367\|PF13855\|SM00369\|G3DSA:3.80.10.10\|Leucine-rich_repeats,_typical_\(most_populated\)_subfamily/LRR/g' | \
+            sed 's/PTHR23155[^ \t]*\|PTHR11017[^ \t]*/LRR_protein/g' | \
+            sed 's/SM00255\|PF01582\|TIR_domain\|TIR_domain_profile\.\|Toll_-_interleukin_1_-_resistance\|G3DSA:3.40.50.10140\|SSF52200|/TIR/g' | \
+            sed 's/PTHR36766\|PF05659/RPW8/g' | sed 's/PF00931/NB-ARC/g' | uniq | sort -k 1,1 -k 6,6 -k 7,7 | \
+            awk '{printf "%s\t%s\tDomain\t%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$4,$5,$6,$7,$8,$9}' > $outputdir/annotation.$i.gff3
+    done
+}
+
+
+function classify_interpro_structure {
+    local outputdir=$1
+
+    >$outputdir/to_delete.txt
+    >$outputdir/All_candidates.classified.bed
+
+    # Get a potential structure classification for candidates
+    i=0
+    while read chr start stop
+    do
+        # Check that candidates have an LRR
+        lrr_count=$(grep "Name=LRR$" $outputdir/annotation.$i.gff3 | wc -l)
+
+        # Try to determine what strand domain is on to look for flanking domains
+        strand=$(awk -v start=$((start-100)) -v stop=$((stop+100)) \
+            '(($5 >= start && $5 <= stop) || ($4 >= start && $4 <= stop)) && $9 == "Name=NB-ARC" {print $7}' $outputdir/annotation.$i.gff3 \
+            |uniq |head -n 1)
+        if [ -z "$strand" ]; then strand="+"; fi
+        awk '$2!="getorf" {print $0}' $outputdir/annotation.$i.gff3 |sort -k5,5 > $outputdir/processing.tmp
+        sed -i '/LRR_protein/d' $outputdir/processing.tmp
+        if [ $strand == "-" ]; then
+            awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/before.tmp
+            awk -v start=$start '$5<start {print $0}' $outputdir/processing.tmp > $outputdir/after.tmp
+            before=$(head -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g')
+            after=$(tail -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq)
+        else
+            awk -v start=$start '$5<start {print $0}' $outputdir/processing.tmp > $outputdir/before.tmp
+            awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/after.tmp
+            before=$(tail -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g')
+            after=$(head -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq)        
+        fi
+
+        # Start classification, all have N and add structure based on flanking interproscan domains
+        class="N"
+        if [ ! -z "$before" ]; then
+            if [ "$before" == "TIR" ]; then class="T$class"; fi
+            if [ "$before" == "Coil" ]; then class="C$class"; fi
+            if [ "$before" == "RPW8" ]; then class="R$class"; fi
+        fi
+        if [ ! -z "$after" ]; then
+            if [ "$after" == "LRR" ]; then class+="L"; fi
+        fi
+
+        # Candidates don't have any LRRs downstream, mark to remove
+        if [ "$lrr_count" -eq 0 ]; then
+            class+=";Filtered"
+            echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/to_delete.txt
+        fi
+        echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/All_candidates.classified.bed
+        i=$((i+1))
+    done < $outputdir/All_candidates.bed
+}
+
+
+
+
+
+free_mem_MB=$(check_mem_usage)
+if [ $free_mem_MB -lt 3000 ]; then
+    echo "Not enough memory to run interproscan. 3G mimimum is required."
+    exit 1
+fi
+
+
+## Split the file because interproscan has problems with too many sequences
+seq_num=$(grep ">" $outputdir/Candidate_sites.with_flanking.fa |wc -l)
+seq_num=$(($seq_num-1))
+split_fasta $outputdir/Candidate_sites.with_flanking.fa $outputdir/Candidate_sites.with_flanking.fa_chunk_0000
+
+echo -e "\nRunning interproscan domain and ORF identification..."  | tee -a $sdout
+run_interproscan $outputdir $seq_num
+
+# Interproscan can hang or crash randomly
 if [ "$flag" -eq 1 ]; then
     echo -e "\n\nDomain identification complete."  | tee -a $sdout
 else
     echo -e "There was a problem with interproscan. Check $errout for more information." | tee -a $sdout
-    exit
+    exit 1
 fi
 
-echo "Converting..." | tee -a $sdout
-# Convert gff files and filter for presence of LRRs
-for i in $(eval echo "{0..$seq_num}"); do
-    # Combine the files, renaming domains of interest to their common names
-    cat $outputdir/Candidate_sites_interpro.$i.gff3 | sed 's/ /__/g' | \
-        awk 'NF>3 {
-            split($1,a,":");split(a[2],b,"-");$1=a[1];
-            if ($3=="ORF"){start=$4;stop=$5;strand=$7;$4=$4+b[1];$5=$5+b[1];} 
-            else if( $3=="protein_match" || $3=="polypeptide" ){ 
-                if(strand=="+"){$4=$4*3+b[1]+start;$5=$5*3+b[1]+start;}
-                else if(strand=="-"){$7=strand;tmp1=b[1]+stop-$5*3+1;tmp2=b[1]+stop-$4*3+1;$4=tmp1;$5=tmp2}
-            }
-            if($3!="nucleic_acid"){print $0} }' | sort -k1,1 -k 4,4 -k 5,5 |uniq |sed 's/ /\t/g' |
-        sed 's/[^ \t]*Name=\|[^ \t]*ID=/Name=/g' | sed 's/;.*//g' | uniq  | \
-        sed 's/PF07725\|SM00367\|PF13855\|SM00369\|G3DSA:3.80.10.10\|Leucine-rich_repeats,_typical_\(most_populated\)_subfamily/LRR/g' | \
-        sed 's/PTHR23155[^ \t]*\|PTHR11017[^ \t]*/LRR_protein/g' | \
-        sed 's/SM00255\|PF01582\|TIR_domain\|TIR_domain_profile\.\|Toll_-_interleukin_1_-_resistance\|G3DSA:3.40.50.10140\|SSF52200|/TIR/g' | \
-        sed 's/PTHR36766\|PF05659/RPW8/g' | sed 's/PF00931/NB-ARC/g' | uniq | sort -k 1,1 -k 6,6 -k 7,7 | \
-        awk '{printf "%s\t%s\tDomain\t%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$4,$5,$6,$7,$8,$9}' > $outputdir/annotation.$i.gff3
-done
-
-echo "Classifying..." | tee -a $sdout
->$outputdir/to_delete.txt
->$outputdir/All_candidates.classified.bed
-# Get a potential structure classification for candidates
-i=0
-while read chr start stop
-do
-
-    # Check that candidates have an LRR
-    lrr_count=$(grep "Name=LRR$" $outputdir/annotation.$i.gff3 | wc -l)
-
-    strand=$(awk -v start=$((start-100)) -v stop=$((stop+100)) \
-        '(($5 >= start && $5 <= stop) || ($4 >= start && $4 <= stop)) && $9 == "Name=NB-ARC" {print $7}' $outputdir/annotation.$i.gff3 \
-        |uniq |head -n 1)
-    if [ -z "$strand" ]; then strand="+"; fi
-    awk '$2!="getorf" {print $0}' $outputdir/annotation.$i.gff3 |sort -k5,5 > $outputdir/processing.tmp
-    sed -i '/LRR_protein/d' $outputdir/processing.tmp
-    if [ $strand == "-" ]; then
-        awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/before.tmp
-        awk -v start=$start '$5<start {print $0}' $outputdir/processing.tmp > $outputdir/after.tmp
-        before=$(head -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g')
-        after=$(tail -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq)
-    else
-        awk -v start=$start '$5<start {print $0}' $outputdir/processing.tmp > $outputdir/before.tmp
-        awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/after.tmp
-        before=$(tail -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g')
-        after=$(head -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq)        
-    fi
-    class="N"
-    if [ ! -z "$before" ]; then
-        if [ "$before" == "TIR" ]; then class="T$class"; fi
-        if [ "$before" == "Coil" ]; then class="C$class"; fi
-        if [ "$before" == "RPW8" ]; then class="R$class"; fi
-    fi
-    if [ ! -z "$after" ]; then
-        if [ "$after" == "LRR" ]; then class+="L"; fi
-    fi
-    if [ "$lrr_count" -eq 0 ]; then
-        class+=";Filtered"
-        echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/to_delete.txt
-    fi
-    echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/All_candidates.classified.bed
-    i=$((i+1))
-done < $outputdir/All_candidates.bed
+echo "Converting names to human readable..." | tee -a $sdout
+convert_interpro_names $outputdir $seq_num
+
+echo "Classifying structure..." | tee -a $sdout
+classify_interpro_structure $outputdir 
 
 echo "Filtering for presence of LRRs..." | tee -a $sdout
 # Remove candidates with no LRR
@@ -196,13 +286,12 @@ grep "Filtered" $outputdir/All_candidates.classified.bed >  $outputdir/Filtered_
 num_found=$(wc -l < $outputdir/Final_candidates.bed)
 echo -e "\nCandidate filtering for LRRs. $num_found final candidates found after filtering." | tee -a $sdout
 
+echo "Outputing final candidates and annotations" | tee -a $sdout
 cp $outputdir/Final_candidates.bed $outputdir/../Final_candidates.bed
 cp $outputdir/Filtered_candidates.bed $outputdir/../Filtered_candidates.bed
 cp $outputdir/All_candidates.classified.bed $outputdir/../All_candidates.bed
-
 cat $outputdir/annotation.*.gff3 > $outputdir/All_candidates.gff3
 sed -i '1s/^/##gff-version 3\n/' $outputdir/All_candidates.gff3
-
 cp $outputdir/All_candidates.gff3 $outputdir/../All_candidates.gff3
 
 echo "
diff --git a/scripts/removesmalls.pl b/scripts/removesmalls.pl
new file mode 100755
index 0000000..1bc5f90
--- /dev/null
+++ b/scripts/removesmalls.pl
@@ -0,0 +1,35 @@
+## removesmalls.pl
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $ifile; my $ofile;
+my $cutoff=$ARGV[2];
+my $flag = 0;
+my $seq; my $name; my $read;
+
+open($ifile, $ARGV[0] );
+open($ofile, ">$ARGV[1]");
+
+while( $read = <$ifile> ){
+        chomp $read;
+	if( $read =~ />/){
+		if($flag == 0){
+			$flag = 1;
+			$name = $read;
+			$seq = "";
+		} elsif ( length($seq) > $cutoff ){
+			print $ofile $name."\n".$seq."\n";
+			$name = $read;
+			$seq = "";
+		} else{
+                        $name = $read;
+                        $seq = "";
+		}
+	} else{
+		$seq .= $read;
+	}
+}
+if ( length($seq) > $cutoff ){
+	print $ofile $name."\n".$seq."\n";}
+