diff --git a/NLGenomeSweeper b/NLGenomeSweeper index fa49b20..c50c7b8 100755 --- a/NLGenomeSweeper +++ b/NLGenomeSweeper @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import sys @@ -48,9 +48,12 @@ profiler_file='data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa' # Output file names annotated_candidates = 'Annotated_candidates.bed' unannotated_candidates = 'All_candidates.bed' -annotated_gff= 'All_candidates.gff3' -consensus_file='Species_specific_consensus.fa' -filter_candidates='Filtered_candidates.bed' +annotated_gff = 'All_candidates.gff3' +consensus_file ='Species_specific_consensus.fa' +filter_candidates ='Filtered_candidates.bed' + +# We'll look to merge domains across introns smaller than this +max_intron_len = 1000 # Append the contents of an input file to an output file def append_file(input_file, output_file): @@ -62,7 +65,9 @@ def append_file(input_file, output_file): ########################################################################## # -# identify_candidates_proteins +# DEPRACATED - no protein search done +# +# identify_candidates_proteins # # Arguments: # program_dir - directory of this program @@ -85,7 +90,7 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir): os.mkdir(outdir + search_folder + protein_search_folder) subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir, protein, gff, program_dir + nbarc_hmm, - outdir + search_folder + protein_search_folder, program_dir), shell=True) + outdir + search_folder + protein_search_folder, program_dir), shell=True, check=True) shutil.copy(outdir + search_folder + protein_search_folder + annotated_candidates, outdir + annotated_candidates) @@ -95,11 +100,6 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir): subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir,protein,gff, hmm,outdir + search_folder + protein_custom_folder, program_dir), shell=True) # Concatenate to the pfam search results - #annotated1 = pd.read_csv(outdir + search_folder + protein_custom_folder +annotated_candidates, - # sep='\t',header=None) - #annotated2 = pd.read_csv(outdir + '/Annotated_candidates.bed', sep='\t',header=None) - #annotated = annotated1.append(annotated2, ignore_index=True).drop_duplicates() - #annotated.to_csv(outdir + '/Annotated_candidates.bed', sep='\t', header =False, index = False ) append_file(outdir + search_folder + protein_custom_folder + annotated_candidates, outdir + annotated_candidates) @@ -154,7 +154,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in # outdir + search_folder + genome_search_folder + 'candiates_sites.bed') # Run the genome based search subprocess.run('{}/scripts/genome_search.sh {} {} {} {} {}'.format(program_dir,genome, - outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True) + outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True, check=True) #shutil.copy(outdir + search_folder + genome_search_folder + unannotated_candidates, # outdir + unannotated_candidates) shutil.copy(outdir + search_folder + genome_search_folder + consensus_file, @@ -187,7 +187,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in ########################################################################## def run_interproscan(program_dir, outdir, t): subprocess.run('{}/scripts/region_interproscan.sh {} {} {}'.format(program_dir, outdir + domain_folder, t, program_dir), - shell=True) + shell=True, check=True) #shutil.copy(outdir + domain_folder + annotated_gff, # outdir + annotated_gff) #shutil.copy(outdir + domain_folder + unannotated_candidates, @@ -214,12 +214,14 @@ def main(argv): help="A genome sequence in fasta format to search.") input_group = parser.add_argument_group('Input arguments') +# DEPRACATED - no protein search # input_group.add_argument("-protein", metavar="", type=str, default = None, # help="Protein sequences in fasta format.") # input_group.add_argument("-gff", metavar="", type=str, default = None, # help="gff3 annotation of the genome. Required if searching protein sequences.") input_group.add_argument("-consensus", metavar="", type=str, default = None, - help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional).") + help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional)." + + "It is not recommended to use this option since species specific consensuses will be created.") # input_group.add_argument("-hmm", metavar="", type=str, default = None, # help="Also search the protein file using a custom NB-ARC HMM.") @@ -279,7 +281,7 @@ def main(argv): # Search protein file using hmmer ## identify_candidates_proteins(program_dir,args.protein, args.gff, args.hmm, args.outdir ) # Search the genome using blast - identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, 1000) + identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, max_intron_len) # Define ORFs and domains using interproscan run_interproscan(program_dir, args.outdir + nlg_folder, args.t) diff --git a/data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa b/data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa old mode 100644 new mode 100755 diff --git a/data/Vitis_vinifera_NB-ARC_consensus.fa b/data/Vitis_vinifera_NB-ARC_consensus.fa old mode 100644 new mode 100755 diff --git a/data/combined_consensus.fa b/data/combined_consensus.fa old mode 100644 new mode 100755 index 7753c81..9039e8c --- a/data/combined_consensus.fa +++ b/data/combined_consensus.fa @@ -4,52 +4,27 @@ TAKKEDLFKDILKELAKEADDTSVEKEESELKRKLRRLLLRKRFLLVLDDVWEEEDWEKI GVPLPAKENRLRVLLTTRDEEVANAVSAKSEKIEVELLEKDEAWELLENKVFEKELSEDP DLEEVAKEIVEKCKGLPLALKVIGGLLATKSTVKEWEKVAEQLNNKLAESRSELNEVLSI LSLSYESLPEALKRCFLYLSLFPEDVEIKVKRLVKLWSAEGFVESEDI ->vvinCNL_custom_NB-ARC-consensus.genomewide +>vvinCNL-consensus LLLLLEEEEEQLSVISIVGMGGLGKTTLAKLVYNDEEVKKHFDLRAWVCVSEEFDVKELL KEILKSLKSEKLESMELEELQEKLKELLKEKRYLLVLDDVWNEDKWEELKEALKEGAKGS RIIVTTRNEEVASIMKTISPYELEKLSEEESWSLFKKKAFKEEEEEECPELEELGKEIVK KCKGLPLAIKALGGLLSSKKDEEEWSKVLDSELWELEEEESSILKILKLSYNDLPSHLKQ CFLYCSLFPKDYEIKKEELIQLWMAEGFVQ ->vvinNL_custom_NB-ARC-consensus.genomewide +>vvinNL-consensus EEILEALKDDNINMIGVYGMGGVGKTTLVKEVAKKVKEKKLFDEVIIVTVSKKPDLRKIQ EEIADKLKLKLEKEESESERADKLRERLKKSKKILLILDDVWEKLDLEEVGIPEEDKEKG CKIVLTTRNRDVCSEMKTQKEIKLEVLSEEEAWKLFEKKVGDVLNSPNLKSIAKEVVKEC AGLPLAIVTVAKALKNKKDVEVWEDALQQLRRSALKNIKGMEEKVYSALKLSYNKLESEE LKSCFLFCALFPEDYEIEIEELIEYLIAEKLI ->vvinRPW8_custom_NB-ARC-consensus.genomewide +>vvinRPW8-consensus KELKELLLKEEEKLLLLTALGGCGKTTLVKKLCKDDEIKGIFKDNILFVNVSKTPNLKVI VQKLFKYKKEEELEFQSDEDAINQLEQLLNKIGSNPILLILDDVWPGSESLLEKFKFDIP KYKILVTSRTAFPRFKFTYELKPLNDEDAMSLFRHSASLQDGSSYIPEEVIKKIVRGCGG LPLALKVIGGSLREQKAEVWLSKLEKWSEGESIFESDEELLDCLQKSLEFSDDKVILKEC FLDLGSFPEDQRIPVAALIDMWA ->vvinTNL_custom_NB-ARC-consensus.genomewide +>vvinTNL-consensus ELESLLDIESNDVRLVGILGLGGIGKTTLAKAVYNKISKKFEASSFLENVREKSKKEGLV KLQEQLLSEILKEKELKVKNVSEGINVIKKRLRKKKVLLVLDDVDKLEQLEKLAGKKDWF GSGSRIIITTRDKHLLKTLEVDEIYEVKELNKDEALELFSLKAFKKEKPEEDYLELSKKV VKYAKGLPLALKVLGSDLFGRSIDEWKSALEKLKEIPEKEILEILKISYDGLEETEKEIF LDIACFFKG - ->vvinCNL_custom_NB-ARC-consensus -ELEELGKKLLANEGGEKLLVISIVGMGGLGKTTLAQKVYNDKHVKTHFDLRAWVCVSDEF -DLKEILKKILESVSKSSESEDLEELQEKLKELLKEKRFLLVLDDVWNEDEKWEELKTLLP -DGANGSKIIVTTRNKSVASVMKTSSIHELKLLSDEESWSLFTKKAFENGESSAPPELEEI -GKEIVKKCGGLPLAIKALGGLLSSKKEEEEWEKVLNSEIWDLEEEEEILPALKLSYNDLP -SHLKRCFLYCSLFPKDYEIKKEELILLWVAEGLVQE ->vvinTNL_custom_NB-ARC-consensus -EELKSLLKLESNDVRMVGIYGIGGIGKTTIAKAIYNKISAQFEGVSFLENVREKSKKKGL -LQLQKKLLKDILKEKKLKVSNVEEGLKVIKKRLSSKKVLIVLDDVDELEQLEALAGEKDW -FGKGSRIIITTRDKELLEEHEVDELYEVKKLNKEEALELFSLYAFKQELPKEDYEELSKE -IVEYAKGLPLALKVLGSLLFKKTIKEWESELEKLKKEPEKEIQNVLKISYDGLDETEKEI -FLDIACFFKG ->vvinNL_custom_NB-ARC-consensus -EKVMEALEDEKVRVIGIVGMGGVGKTTLVKQVYNDAKVKKLFDLVIWVTVSKEFDLEKIQ -KEILEKLGLKDEKSESEEEKAEELKERLKEKKFLLVLDDVWEELDLEKLGIPLPDDKNGS -KIVLTTRSEDVCKEMEAQKKIELELLSEEEAWELFKKKAGESVESHPELEELAEEVVKEC -KGLPLAIVTLGRALKSKKTVEEWEKALEELKSSLSENIMEDKVLPVLKLSYDSLPSELKS -CFLYCALFPEDYEIEKEELIELWIAEGLLQ ->vvinRPW8_custom_NB-ARC-consensus -KELKKLLFKDDESRIVVSAPGGCGKTTLAKKLCKDQQVKEYFKDSILYVTVSKTANLIEI -IKKLLKENAEEVKEFQNEEDAVNQLERKLKRKVESGRILLVLDDVWSGSESVLEKLKFKI -SELKVLVTSRNEFPKFGSTYELELLEEEEAKELFRHSAIEEDGSEDSDLDEELVKEIVRR -CKGLPLALEVVGRSLAGKPVEIWRSTLEKLSEGESIVESEDELLECLQSSLDALDDKDML -KECFLDLGSFPEDKKIPVTALIDLWA diff --git a/scripts/createProfile.sh b/scripts/createProfile.sh index 65c8274..f6271a4 100755 --- a/scripts/createProfile.sh +++ b/scripts/createProfile.sh @@ -3,7 +3,23 @@ # # Create custom consensus sequences based on domains desired. # +# This program takes potential domains identified in the genome and compares +# them to known high quality consnesus sequences of that domain. For each +# potential domain the closest reference sequence if found and sequences are +# grouped by their closest reference. The grouped sequences are aligned +# and a protein consensus sequence is created for each group. +# +# Inputs: +# $1: Fasta file of sequences containing domain extracted from the genome +# $2: output directory +# $3: directory of this program +# $4: prefix of output file +# $5: Consensus sequences in fasta format, protein or nucleotide +# $6: Format of the consensus sequences, 'prot' or 'nucl' # +# Outputs: +# $outputdir/$prefix.fa: Fasta file of species specific consensus sequences, proteins +# #hmm=$1 sequences=$1 @@ -56,5 +72,5 @@ do fi done cat $outputdir/consensus_clusters.*.profile.fa > $outputdir/consensus_clusters.fa -remove_smalls $outputdir/consensus_clusters.fa $outputdir/species_specific_domains.fa 200 +remove_smalls $outputdir/consensus_clusters.fa $outputdir/$prefix.fa 200 diff --git a/scripts/find_genome_candidates.sh b/scripts/find_genome_candidates.sh index d496f22..2d3a243 100755 --- a/scripts/find_genome_candidates.sh +++ b/scripts/find_genome_candidates.sh @@ -1,7 +1,28 @@ #!/bin/bash + +# +# Find candidate domains in a genome with blast. # +# This is a subroutine for finding blast hits of a consensus domain in +# a genome sequence. The hits are merged across small gaps as well as +# introns of a custom length. Hits must span at least 80% of the +# original domain sequence. +# +# Inputs: +# $1: genome file in fasta format to search +# $2: output directory +# $3: number of cores to use +# $4: Format of the consensus sequences, 'prot' or 'nucl' +# $5: Consensus sequences in fasta format, protein or nucleotide +# $6: The maximum size (bp) of introns +# $7: directory of this program # +# Outputs: +# $outputdir/$outname.fa: Fasta sequences of candidate domain found in genome +# + + genome=$1 outputdir=$2 @@ -24,7 +45,7 @@ elif [ $format == 'nucl' ]; then -evalue 1e-4 -num_threads $cores fi -# Merge candidates within 10 bp and then combine across introns (default <500bp) +# Merge candidates within 10 bp and then combine across introns awk '{if ($9>$10){tmp=$9;$9=$10;$10=tmp} print $2,$9,$10}' $outputdir/genome_blast.txt |sed 's/ /\t/g' \ | bedtools sort | bedtools merge -d 10 | awk '{print $1,$2,$3,$3-$2}' |sed 's/ /\t/g' > $outputdir/genome_blast.merge.txt merge_exons $outputdir/genome_blast.merge.txt $outputdir/genome_blast.merge2.txt $intron diff --git a/scripts/genome_search.sh b/scripts/genome_search.sh index 01cece5..126971d 100755 --- a/scripts/genome_search.sh +++ b/scripts/genome_search.sh @@ -1,14 +1,26 @@ #!/bin/bash # -# Search a genome file for NBS-LRR genes using consensus domain sequence(s). +# Search a genome file for NBS-LRR genes using consensus domain sequence(s) of the NB-ARC domain. # +# A two pass method is used. The first pass uses blast to find candidate sequences in the +# genome based on the reference consensus sequences. These sequences are used to build +# species specific consensus sequences for each class of NBS-LRR genes. A second +# pass is done to identify candidate sequences using blast with the species specific consensus +# sequences. The output is the candidate sites of the NB-ARC domains and a fasta file of +# the sites plus 10 kb of flanking sequence for use with interproscan. +# +# Inputs: # $1: genome file in fasta format to search # $2: output directory # $3: directory of this program # $4: number of cores to use # $5: The maximum size (bp) of introns # +# Outputs: +# $outputdir/candidate_sites.bed Location of candidate NB-ARC domain sequences found +# $outputdir/Candidate_sites.with_flanking.fa Sequences of candidate sites extended by 10 kb +# genome=$1 outputdir=$2 @@ -18,8 +30,10 @@ intron=$5 source $programdir/scripts/functions.sh +# Log files sdout=$outputdir/NLGenomeSweeper.out errout=$outputdir/NLGenomeSweeper.err +# Prefix for output files outname="All_candidates" function exit_error { @@ -45,15 +59,15 @@ Any problems can be submitted via the GitHub page https://github.com/ntoda03/NLG " | tee -a $sdout -## Check dependencies +## Check dependencies, exit if not found echo "Checking software requirements..." | tee -a $sdout DEPENDENCIES=(samtools bedtools blastp blastx TransDecoder.LongOrfs TransDecoder.Predict muscle interproscan hmmbuild) dep_flag=1 for dependency in "${DEPENDENCIES[@]}" do if ! which $dependency &> /dev/null; then - echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout - depflag=0 + echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout + dep_flag=0 fi done if [ $dep_flag -eq 0 ]; then @@ -84,7 +98,7 @@ fi echo "Creating species and class specific custom sequences..." | tee -a $sdout mkdir -p $outputdir/profiler $programdir/scripts/createProfile.sh $outputdir/first_pass/$outname.fa $outputdir/profiler $programdir \ - species_specific $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout + species_specific_domains $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout if [ -s "$outputdir/profiler/species_specific_domains.fa" ] then echo -e "Custom profiles complete.\n" | tee -a $sdout diff --git a/scripts/mergeBlastHits.R b/scripts/mergeBlastHits.R new file mode 100755 index 0000000..75a6293 --- /dev/null +++ b/scripts/mergeBlastHits.R @@ -0,0 +1,38 @@ + +options(stringsAsFactors = FALSE) + +m_opt = c(commandArgs()[3:length(commandArgs())]) + +infile=m_opt[1] +outfile=m_opt[2] +gap=as.numeric(m_opt[3]) +blasttype=m_opt[4] +method=m_opt[5] + +mytab=read.table(infile,sep="\t") + +check=(nrow(mytab)-1) +j=1 +while( j<=check ){ +#while( mytab$V2[j] != "000094F_pilon"){ + if( mytab$V2[j]==mytab$V2[j+1] && (mytab$V1[j]==mytab$V1[j+1] || blasttype=='any') ){ + if( (method=="overlap") && ( (abs(mytab$V9[j+1]-mytab$V9[j])> $sdout' ERR #trap 'rv=\$?; echo "Exit \$rv" >> $sdout' EXIT -seq_num=$(grep ">" $outputdir/Candidate_sites.with_flanking.fa |wc -l) -seq_num=$(($seq_num-1)) -## Split the file because interproscan has problems with too many sequences -split_fasta $outputdir/Candidate_sites.with_flanking.fa $outputdir/Candidate_sites.with_flanking.fa_chunk_0000 +function check_mem_usage { + local free_mem_MB=$(free --mega | awk 'NR == 2 {print $4}') + echo $free_mem_MB +} -flag=0 -run_cycle=0 -declare -a bg_processes=() -file_num=$seq_num -echo -e "\nRunning interproscan domain and ORF identification..." | tee -a $sdout -while [[ "$run_cycle" -lt 10 && "$flag" -eq 0 ]] -do - ## interproscan is run in the background, number of cores limits number of instances running - for i in $(eval echo "{0..$seq_num}"); - do - if [ ! -f $outputdir/Candidate_sites_interpro.$i.gff3 ]; then - if [ -f $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} ]; then - echo -e "Launching $i/$file_num" >> $sdout - #echo -ne "$i/$filenum"'\r' | tee -a $sdout - interproscan -appli PANTHER,Gene3D,Pfam,SMART,Coils -i $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} \ - -t n -o $outputdir/Candidate_sites_interpro.$i.gff3 -f GFF3 > $sdout.interproscan_$i 2> $errout.interproscan_$i & - sleep 20 - running_job=$! - bg_processes+=($running_job) - fi - fi - # Handle running background processes - while [ "${#bg_processes[@]}" -eq "$cores" ]; do - echo "Reached max number of jobs. Waiting..." >> $sdout - index_run=0 - for pc_id in "${bg_processes[@]}"; do - curr_status=$(ps aux | awk -v pc_id=$pc_id '$2==pc_id {print $11}') - curr_runtime=$(ps -o etime= -p $pc_id | tr '-' ':' | \ - awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}') - # Process is no longer running, remove from list - if [ -z "$curr_status" ]; then - echo -e "Job complete" >> $sdout - unset bg_processes[$index_run] - bg_processes=( ${bg_processes[@]} ) - break - # Process has run for over an hour, interproscan is hanging, kill process - elif [ "$curr_runtime" -gt 3600 ]; then - echo -e "Process with status $curr_status killed for hanging" >> $sdout - unset bg_processes[$index_run] - bg_processes=( ${bg_processes[@]} ) - break - fi - index_run=$((index_run+1)) - done - if [ "${#bg_processes[@]}" -eq "$cores" ]; then - sleep 20 +function submit_interproscan_job { + local outputdir=$1 + local i=$2 + + local running_job=0 + + # File hasn't been run yet + if [ ! -f $outputdir/Candidate_sites_interpro.$i.gff3 ]; then + # The input file exists + if [ -f $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} ]; then + # Only submit new jobs if there is at meast 3G of ram free + free_mem_MB=$(check_mem_usage) + if [ $free_mem_MB -lt 3000 ]; then + echo "High memory usage. Waiting..." >> $sdout + while [ $free_mem_MB -lt 3000 ]; do + sleep 10 + free_mem_MB=$(check_mem_usage) + done fi - done - # Limit memory consumption - curr_memusage=$(free -m | awk 'NR==2 {printf "%s", $3/$2}') - if [ $(echo "$curr_memusage > 0.8" |bc -l) -ne 0 ]; then - echo "High memory usage. Waiting..." >> $sdout - while [ $(echo "curr_memusage > 0.8" |bc -l) -ne 0 ]; do - sleep 10 - curr_memusage=$(free -m | awk 'NR==2 {printf "%s", $3/$2}') - done + echo -e "Launching $i/$file_num" >> $sdout + interproscan -appli PANTHER,Gene3D,Pfam,SMART,Coils -i $outputdir/Candidate_sites.with_flanking.fa_chunk_0000${i} \ + -t n -o $outputdir/Candidate_sites_interpro.$i.gff3 -f GFF3 > $sdout.interproscan_$i 2> $errout.interproscan_$i & + running_job=$! + sleep 20 fi - done - echo "All jobs submitted. Waiting..." >> $sdout - wait + fi + + echo $running_job +} + +function handle_bg_jobs { + local pc_id=$1 + + local remove_process=0 + + curr_status=$(ps aux | awk -v pc_id=$pc_id '$2==pc_id {print $11}') + curr_runtime=$(ps -o etime= -p $pc_id | tr '-' ':' | \ + awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}') + + # Process is no longer running, remove from list + if [ -z "$curr_status" ]; then + echo -e "Job complete" >> $sdout + remove_process=1 + elif [ "$curr_runtime" -gt 6400 ]; then + echo -e "Process with status $curr_status killed for hanging" >> $sdout + remove_process=1 + fi + + echo $remove_process +} + + +function verify_interproscan_run { + local outputdir=$1 + local seq_num=$2 + + local flag=1 - ## Check to make sure interproscan run on all files, this can be a problem sometimes - echo "Checking output files." >> $sdout - flag=1 - run_cycle=$((run_cycle+1)) for i in $(eval echo "{0..$seq_num}"); do if [ ! -f $outputdir/Candidate_sites_interpro.$i.gff3 ]; then @@ -113,80 +114,169 @@ do else echo -e "Some sequences did not execute. Retrying..." >> $sdout fi -done + echo $flag +} + + +function run_interproscan { + local outputdir=$1 + local seq_num=$2 + + flag=0 + run_cycle=0 + file_num=$seq_num + declare -a bg_processes=() + + while [[ "$run_cycle" -lt 10 && "$flag" -eq 0 ]] + do + ## interproscan is run in the background, number of cores limits number of instances running + ## Although it has a multithread option, that does not take advantage of the cpus + for i in $(eval echo "{0..$seq_num}"); + do + running_job=$(submit_interproscan_job $outputdir $i) + if [ $running_job -ne 0 ]; then + bg_processes+=($running_job) + fi + # Handle running background processes + while [ "${#bg_processes[@]}" -eq "$cores" ]; do + echo "Reached max number of jobs. Waiting..." >> $sdout + index_run=0 + for pc_id in "${bg_processes[@]}"; do + remove_process=$(handle_bg_jobs $pc_id) + if [ $remove_process -eq 1 ]; then + unset bg_processes[$index_run] + bg_processes=( ${bg_processes[@]} ) + fi + index_run=$((index_run+1)) + done + if [ "${#bg_processes[@]}" -eq "$cores" ]; then + sleep 20 + fi + done + done + echo "All jobs submitted. Waiting..." >> $sdout + wait + + ## Check to make sure interproscan run on all files, this can be a problem sometimes + echo "Checking output files." >> $sdout + flag=$(verify_interproscan_run $outputdir $seq_num) + run_cycle=$((run_cycle+1)) + done +} + + +# Convert gff files and filter for presence of Leucine-rich_repeats +function convert_interpro_names { + local outputdir=$1 + local seq_num=$2 + + for i in $(eval echo "{0..$seq_num}"); do + # Combine the files, renaming domains of interest to their common names + cat $outputdir/Candidate_sites_interpro.$i.gff3 | sed 's/ /__/g' | \ + awk 'NF>3 { + split($1,a,":");split(a[2],b,"-");$1=a[1]; + if ($3=="ORF"){start=$4;stop=$5;strand=$7;$4=$4+b[1];$5=$5+b[1];} + else if( $3=="protein_match" || $3=="polypeptide" ){ + if(strand=="+"){$4=$4*3+b[1]+start;$5=$5*3+b[1]+start;} + else if(strand=="-"){$7=strand;tmp1=b[1]+stop-$5*3+1;tmp2=b[1]+stop-$4*3+1;$4=tmp1;$5=tmp2} + } + if($3!="nucleic_acid"){print $0} }' | sort -k1,1 -k 4,4 -k 5,5 |uniq |sed 's/ /\t/g' | \ + sed 's/[^ \t]*Name=\|[^ \t]*ID=/Name=/g' | sed 's/;.*//g' | uniq | \ + sed 's/PF07725\|SM00367\|PF13855\|SM00369\|G3DSA:3.80.10.10\|Leucine-rich_repeats,_typical_\(most_populated\)_subfamily/LRR/g' | \ + sed 's/PTHR23155[^ \t]*\|PTHR11017[^ \t]*/LRR_protein/g' | \ + sed 's/SM00255\|PF01582\|TIR_domain\|TIR_domain_profile\.\|Toll_-_interleukin_1_-_resistance\|G3DSA:3.40.50.10140\|SSF52200|/TIR/g' | \ + sed 's/PTHR36766\|PF05659/RPW8/g' | sed 's/PF00931/NB-ARC/g' | uniq | sort -k 1,1 -k 6,6 -k 7,7 | \ + awk '{printf "%s\t%s\tDomain\t%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$4,$5,$6,$7,$8,$9}' > $outputdir/annotation.$i.gff3 + done +} + + +function classify_interpro_structure { + local outputdir=$1 + + >$outputdir/to_delete.txt + >$outputdir/All_candidates.classified.bed + + # Get a potential structure classification for candidates + i=0 + while read chr start stop + do + # Check that candidates have an LRR + lrr_count=$(grep "Name=LRR$" $outputdir/annotation.$i.gff3 | wc -l) + + # Try to determine what strand domain is on to look for flanking domains + strand=$(awk -v start=$((start-100)) -v stop=$((stop+100)) \ + '(($5 >= start && $5 <= stop) || ($4 >= start && $4 <= stop)) && $9 == "Name=NB-ARC" {print $7}' $outputdir/annotation.$i.gff3 \ + |uniq |head -n 1) + if [ -z "$strand" ]; then strand="+"; fi + awk '$2!="getorf" {print $0}' $outputdir/annotation.$i.gff3 |sort -k5,5 > $outputdir/processing.tmp + sed -i '/LRR_protein/d' $outputdir/processing.tmp + if [ $strand == "-" ]; then + awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/before.tmp + awk -v start=$start '$5 $outputdir/after.tmp + before=$(head -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g') + after=$(tail -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq) + else + awk -v start=$start '$5 $outputdir/before.tmp + awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/after.tmp + before=$(tail -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g') + after=$(head -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq) + fi + + # Start classification, all have N and add structure based on flanking interproscan domains + class="N" + if [ ! -z "$before" ]; then + if [ "$before" == "TIR" ]; then class="T$class"; fi + if [ "$before" == "Coil" ]; then class="C$class"; fi + if [ "$before" == "RPW8" ]; then class="R$class"; fi + fi + if [ ! -z "$after" ]; then + if [ "$after" == "LRR" ]; then class+="L"; fi + fi + + # Candidates don't have any LRRs downstream, mark to remove + if [ "$lrr_count" -eq 0 ]; then + class+=";Filtered" + echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/to_delete.txt + fi + echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/All_candidates.classified.bed + i=$((i+1)) + done < $outputdir/All_candidates.bed +} + + + + + +free_mem_MB=$(check_mem_usage) +if [ $free_mem_MB -lt 3000 ]; then + echo "Not enough memory to run interproscan. 3G mimimum is required." + exit 1 +fi + + +## Split the file because interproscan has problems with too many sequences +seq_num=$(grep ">" $outputdir/Candidate_sites.with_flanking.fa |wc -l) +seq_num=$(($seq_num-1)) +split_fasta $outputdir/Candidate_sites.with_flanking.fa $outputdir/Candidate_sites.with_flanking.fa_chunk_0000 + +echo -e "\nRunning interproscan domain and ORF identification..." | tee -a $sdout +run_interproscan $outputdir $seq_num + +# Interproscan can hang or crash randomly if [ "$flag" -eq 1 ]; then echo -e "\n\nDomain identification complete." | tee -a $sdout else echo -e "There was a problem with interproscan. Check $errout for more information." | tee -a $sdout - exit + exit 1 fi -echo "Converting..." | tee -a $sdout -# Convert gff files and filter for presence of LRRs -for i in $(eval echo "{0..$seq_num}"); do - # Combine the files, renaming domains of interest to their common names - cat $outputdir/Candidate_sites_interpro.$i.gff3 | sed 's/ /__/g' | \ - awk 'NF>3 { - split($1,a,":");split(a[2],b,"-");$1=a[1]; - if ($3=="ORF"){start=$4;stop=$5;strand=$7;$4=$4+b[1];$5=$5+b[1];} - else if( $3=="protein_match" || $3=="polypeptide" ){ - if(strand=="+"){$4=$4*3+b[1]+start;$5=$5*3+b[1]+start;} - else if(strand=="-"){$7=strand;tmp1=b[1]+stop-$5*3+1;tmp2=b[1]+stop-$4*3+1;$4=tmp1;$5=tmp2} - } - if($3!="nucleic_acid"){print $0} }' | sort -k1,1 -k 4,4 -k 5,5 |uniq |sed 's/ /\t/g' | - sed 's/[^ \t]*Name=\|[^ \t]*ID=/Name=/g' | sed 's/;.*//g' | uniq | \ - sed 's/PF07725\|SM00367\|PF13855\|SM00369\|G3DSA:3.80.10.10\|Leucine-rich_repeats,_typical_\(most_populated\)_subfamily/LRR/g' | \ - sed 's/PTHR23155[^ \t]*\|PTHR11017[^ \t]*/LRR_protein/g' | \ - sed 's/SM00255\|PF01582\|TIR_domain\|TIR_domain_profile\.\|Toll_-_interleukin_1_-_resistance\|G3DSA:3.40.50.10140\|SSF52200|/TIR/g' | \ - sed 's/PTHR36766\|PF05659/RPW8/g' | sed 's/PF00931/NB-ARC/g' | uniq | sort -k 1,1 -k 6,6 -k 7,7 | \ - awk '{printf "%s\t%s\tDomain\t%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$4,$5,$6,$7,$8,$9}' > $outputdir/annotation.$i.gff3 -done - -echo "Classifying..." | tee -a $sdout ->$outputdir/to_delete.txt ->$outputdir/All_candidates.classified.bed -# Get a potential structure classification for candidates -i=0 -while read chr start stop -do - - # Check that candidates have an LRR - lrr_count=$(grep "Name=LRR$" $outputdir/annotation.$i.gff3 | wc -l) - - strand=$(awk -v start=$((start-100)) -v stop=$((stop+100)) \ - '(($5 >= start && $5 <= stop) || ($4 >= start && $4 <= stop)) && $9 == "Name=NB-ARC" {print $7}' $outputdir/annotation.$i.gff3 \ - |uniq |head -n 1) - if [ -z "$strand" ]; then strand="+"; fi - awk '$2!="getorf" {print $0}' $outputdir/annotation.$i.gff3 |sort -k5,5 > $outputdir/processing.tmp - sed -i '/LRR_protein/d' $outputdir/processing.tmp - if [ $strand == "-" ]; then - awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/before.tmp - awk -v start=$start '$5 $outputdir/after.tmp - before=$(head -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g') - after=$(tail -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq) - else - awk -v start=$start '$5 $outputdir/before.tmp - awk -v stop=$stop '$4>stop {print $0}' $outputdir/processing.tmp > $outputdir/after.tmp - before=$(tail -n 1 $outputdir/before.tmp |awk -v strand=$strand '{if(strand!=$7){$9="Name=None"} print $9}' |sed 's/.*Name=//g') - after=$(head -n 2 $outputdir/after.tmp |awk -v strand=$strand '{if(strand==$7 && $9=="Name=LRR"){print $9}}' |sed 's/.*Name=//g' |uniq) - fi - class="N" - if [ ! -z "$before" ]; then - if [ "$before" == "TIR" ]; then class="T$class"; fi - if [ "$before" == "Coil" ]; then class="C$class"; fi - if [ "$before" == "RPW8" ]; then class="R$class"; fi - fi - if [ ! -z "$after" ]; then - if [ "$after" == "LRR" ]; then class+="L"; fi - fi - if [ "$lrr_count" -eq 0 ]; then - class+=";Filtered" - echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/to_delete.txt - fi - echo -e "$chr\t$start\t$stop\t$class" >> $outputdir/All_candidates.classified.bed - i=$((i+1)) -done < $outputdir/All_candidates.bed +echo "Converting names to human readable..." | tee -a $sdout +convert_interpro_names $outputdir $seq_num + +echo "Classifying structure..." | tee -a $sdout +classify_interpro_structure $outputdir echo "Filtering for presence of LRRs..." | tee -a $sdout # Remove candidates with no LRR @@ -196,13 +286,12 @@ grep "Filtered" $outputdir/All_candidates.classified.bed > $outputdir/Filtered_ num_found=$(wc -l < $outputdir/Final_candidates.bed) echo -e "\nCandidate filtering for LRRs. $num_found final candidates found after filtering." | tee -a $sdout +echo "Outputing final candidates and annotations" | tee -a $sdout cp $outputdir/Final_candidates.bed $outputdir/../Final_candidates.bed cp $outputdir/Filtered_candidates.bed $outputdir/../Filtered_candidates.bed cp $outputdir/All_candidates.classified.bed $outputdir/../All_candidates.bed - cat $outputdir/annotation.*.gff3 > $outputdir/All_candidates.gff3 sed -i '1s/^/##gff-version 3\n/' $outputdir/All_candidates.gff3 - cp $outputdir/All_candidates.gff3 $outputdir/../All_candidates.gff3 echo " diff --git a/scripts/removesmalls.pl b/scripts/removesmalls.pl new file mode 100755 index 0000000..1bc5f90 --- /dev/null +++ b/scripts/removesmalls.pl @@ -0,0 +1,35 @@ +## removesmalls.pl +#!/usr/bin/perl +use strict; +use warnings; + +my $ifile; my $ofile; +my $cutoff=$ARGV[2]; +my $flag = 0; +my $seq; my $name; my $read; + +open($ifile, $ARGV[0] ); +open($ofile, ">$ARGV[1]"); + +while( $read = <$ifile> ){ + chomp $read; + if( $read =~ />/){ + if($flag == 0){ + $flag = 1; + $name = $read; + $seq = ""; + } elsif ( length($seq) > $cutoff ){ + print $ofile $name."\n".$seq."\n"; + $name = $read; + $seq = ""; + } else{ + $name = $read; + $seq = ""; + } + } else{ + $seq .= $read; + } +} +if ( length($seq) > $cutoff ){ + print $ofile $name."\n".$seq."\n";} +