Improved comments
Modularized interproscan script.
Nicholas TODA committed Jun 21, 2020
1 parent 149fcb9 commit 5755a59
Showing 11 changed files with 403 additions and 188 deletions.
32 changes: 17 additions & 15 deletions NLGenomeSweeper
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

import subprocess
import sys
@@ -48,9 +48,12 @@ profiler_file='data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa'
# Output file names
annotated_candidates = 'Annotated_candidates.bed'
unannotated_candidates = 'All_candidates.bed'
annotated_gff= 'All_candidates.gff3'
consensus_file='Species_specific_consensus.fa'
filter_candidates='Filtered_candidates.bed'
annotated_gff = 'All_candidates.gff3'
consensus_file = 'Species_specific_consensus.fa'
filter_candidates = 'Filtered_candidates.bed'

# We'll look to merge domains across introns smaller than this
max_intron_len = 1000

# Append the contents of an input file to an output file
def append_file(input_file, output_file):
@@ -62,7 +65,9 @@ def append_file(input_file, output_file):

##########################################################################
#
# identify_candidates_proteins
# DEPRECATED - no protein search done
#
# identify_candidates_proteins
#
# Arguments:
# program_dir - directory of this program
@@ -85,7 +90,7 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir):
os.mkdir(outdir + search_folder + protein_search_folder)
subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir, protein, gff,
program_dir + nbarc_hmm,
outdir + search_folder + protein_search_folder, program_dir), shell=True)
outdir + search_folder + protein_search_folder, program_dir), shell=True, check=True)
shutil.copy(outdir + search_folder + protein_search_folder + annotated_candidates,
outdir + annotated_candidates)

@@ -95,11 +100,6 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir):
subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir,protein,gff,
hmm,outdir + search_folder + protein_custom_folder, program_dir), shell=True)
# Concatenate to the pfam search results
#annotated1 = pd.read_csv(outdir + search_folder + protein_custom_folder +annotated_candidates,
# sep='\t',header=None)
#annotated2 = pd.read_csv(outdir + '/Annotated_candidates.bed', sep='\t',header=None)
#annotated = annotated1.append(annotated2, ignore_index=True).drop_duplicates()
#annotated.to_csv(outdir + '/Annotated_candidates.bed', sep='\t', header =False, index = False )
append_file(outdir + search_folder + protein_custom_folder + annotated_candidates,
outdir + annotated_candidates)

@@ -154,7 +154,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in
# outdir + search_folder + genome_search_folder + 'candiates_sites.bed')
# Run the genome based search
subprocess.run('{}/scripts/genome_search.sh {} {} {} {} {}'.format(program_dir,genome,
outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True)
outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True, check=True)
#shutil.copy(outdir + search_folder + genome_search_folder + unannotated_candidates,
# outdir + unannotated_candidates)
shutil.copy(outdir + search_folder + genome_search_folder + consensus_file,
@@ -187,7 +187,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in
##########################################################################
def run_interproscan(program_dir, outdir, t):
subprocess.run('{}/scripts/region_interproscan.sh {} {} {}'.format(program_dir, outdir + domain_folder, t, program_dir),
shell=True)
shell=True, check=True)
#shutil.copy(outdir + domain_folder + annotated_gff,
# outdir + annotated_gff)
#shutil.copy(outdir + domain_folder + unannotated_candidates,
@@ -214,12 +214,14 @@ def main(argv):
help="A genome sequence in fasta format to search.")

input_group = parser.add_argument_group('Input arguments')
# DEPRECATED - no protein search
# input_group.add_argument("-protein", metavar="<fasta file>", type=str, default = None,
# help="Protein sequences in fasta format.")
# input_group.add_argument("-gff", metavar="<gff3 file>", type=str, default = None,
# help="gff3 annotation of the genome. Required if searching protein sequences.")
input_group.add_argument("-consensus", metavar="<fasta file>", type=str, default = None,
help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional).")
help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional)." +
"It is not recommended to use this option since species specific consensuses will be created.")
# input_group.add_argument("-hmm", metavar="<file>", type=str, default = None,
# help="Also search the protein file using a custom NB-ARC HMM.")

@@ -279,7 +281,7 @@ def main(argv):
# Search protein file using hmmer
## identify_candidates_proteins(program_dir,args.protein, args.gff, args.hmm, args.outdir )
# Search the genome using blast
identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, 1000)
identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, max_intron_len)
# Define ORFs and domains using interproscan
run_interproscan(program_dir, args.outdir + nlg_folder, args.t)

Empty file modified data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa
100644 → 100755
Empty file.
Empty file modified data/Vitis_vinifera_NB-ARC_consensus.fa
100644 → 100755
Empty file.
33 changes: 4 additions & 29 deletions data/combined_consensus.fa
100644 → 100755
@@ -4,52 +4,27 @@ TAKKEDLFKDILKELAKEADDTSVEKEESELKRKLRRLLLRKRFLLVLDDVWEEEDWEKI
GVPLPAKENRLRVLLTTRDEEVANAVSAKSEKIEVELLEKDEAWELLENKVFEKELSEDP
DLEEVAKEIVEKCKGLPLALKVIGGLLATKSTVKEWEKVAEQLNNKLAESRSELNEVLSI
LSLSYESLPEALKRCFLYLSLFPEDVEIKVKRLVKLWSAEGFVESEDI
>vvinCNL_custom_NB-ARC-consensus.genomewide
>vvinCNL-consensus
LLLLLEEEEEQLSVISIVGMGGLGKTTLAKLVYNDEEVKKHFDLRAWVCVSEEFDVKELL
KEILKSLKSEKLESMELEELQEKLKELLKEKRYLLVLDDVWNEDKWEELKEALKEGAKGS
RIIVTTRNEEVASIMKTISPYELEKLSEEESWSLFKKKAFKEEEEEECPELEELGKEIVK
KCKGLPLAIKALGGLLSSKKDEEEWSKVLDSELWELEEEESSILKILKLSYNDLPSHLKQ
CFLYCSLFPKDYEIKKEELIQLWMAEGFVQ
>vvinNL_custom_NB-ARC-consensus.genomewide
>vvinNL-consensus
EEILEALKDDNINMIGVYGMGGVGKTTLVKEVAKKVKEKKLFDEVIIVTVSKKPDLRKIQ
EEIADKLKLKLEKEESESERADKLRERLKKSKKILLILDDVWEKLDLEEVGIPEEDKEKG
CKIVLTTRNRDVCSEMKTQKEIKLEVLSEEEAWKLFEKKVGDVLNSPNLKSIAKEVVKEC
AGLPLAIVTVAKALKNKKDVEVWEDALQQLRRSALKNIKGMEEKVYSALKLSYNKLESEE
LKSCFLFCALFPEDYEIEIEELIEYLIAEKLI
>vvinRPW8_custom_NB-ARC-consensus.genomewide
>vvinRPW8-consensus
KELKELLLKEEEKLLLLTALGGCGKTTLVKKLCKDDEIKGIFKDNILFVNVSKTPNLKVI
VQKLFKYKKEEELEFQSDEDAINQLEQLLNKIGSNPILLILDDVWPGSESLLEKFKFDIP
KYKILVTSRTAFPRFKFTYELKPLNDEDAMSLFRHSASLQDGSSYIPEEVIKKIVRGCGG
LPLALKVIGGSLREQKAEVWLSKLEKWSEGESIFESDEELLDCLQKSLEFSDDKVILKEC
FLDLGSFPEDQRIPVAALIDMWA
>vvinTNL_custom_NB-ARC-consensus.genomewide
>vvinTNL-consensus
ELESLLDIESNDVRLVGILGLGGIGKTTLAKAVYNKISKKFEASSFLENVREKSKKEGLV
KLQEQLLSEILKEKELKVKNVSEGINVIKKRLRKKKVLLVLDDVDKLEQLEKLAGKKDWF
GSGSRIIITTRDKHLLKTLEVDEIYEVKELNKDEALELFSLKAFKKEKPEEDYLELSKKV
VKYAKGLPLALKVLGSDLFGRSIDEWKSALEKLKEIPEKEILEILKISYDGLEETEKEIF
LDIACFFKG

>vvinCNL_custom_NB-ARC-consensus
ELEELGKKLLANEGGEKLLVISIVGMGGLGKTTLAQKVYNDKHVKTHFDLRAWVCVSDEF
DLKEILKKILESVSKSSESEDLEELQEKLKELLKEKRFLLVLDDVWNEDEKWEELKTLLP
DGANGSKIIVTTRNKSVASVMKTSSIHELKLLSDEESWSLFTKKAFENGESSAPPELEEI
GKEIVKKCGGLPLAIKALGGLLSSKKEEEEWEKVLNSEIWDLEEEEEILPALKLSYNDLP
SHLKRCFLYCSLFPKDYEIKKEELILLWVAEGLVQE
>vvinTNL_custom_NB-ARC-consensus
EELKSLLKLESNDVRMVGIYGIGGIGKTTIAKAIYNKISAQFEGVSFLENVREKSKKKGL
LQLQKKLLKDILKEKKLKVSNVEEGLKVIKKRLSSKKVLIVLDDVDELEQLEALAGEKDW
FGKGSRIIITTRDKELLEEHEVDELYEVKKLNKEEALELFSLYAFKQELPKEDYEELSKE
IVEYAKGLPLALKVLGSLLFKKTIKEWESELEKLKKEPEKEIQNVLKISYDGLDETEKEI
FLDIACFFKG
>vvinNL_custom_NB-ARC-consensus
EKVMEALEDEKVRVIGIVGMGGVGKTTLVKQVYNDAKVKKLFDLVIWVTVSKEFDLEKIQ
KEILEKLGLKDEKSESEEEKAEELKERLKEKKFLLVLDDVWEELDLEKLGIPLPDDKNGS
KIVLTTRSEDVCKEMEAQKKIELELLSEEEAWELFKKKAGESVESHPELEELAEEVVKEC
KGLPLAIVTLGRALKSKKTVEEWEKALEELKSSLSENIMEDKVLPVLKLSYDSLPSELKS
CFLYCALFPEDYEIEKEELIELWIAEGLLQ
>vvinRPW8_custom_NB-ARC-consensus
KELKKLLFKDDESRIVVSAPGGCGKTTLAKKLCKDQQVKEYFKDSILYVTVSKTANLIEI
IKKLLKENAEEVKEFQNEEDAVNQLERKLKRKVESGRILLVLDDVWSGSESVLEKLKFKI
SELKVLVTSRNEFPKFGSTYELELLEEEEAKELFRHSAIEEDGSEDSDLDEELVKEIVRR
CKGLPLALEVVGRSLAGKPVEIWRSTLEKLSEGESIVESEDELLECLQSSLDALDDKDML
KECFLDLGSFPEDKKIPVTALIDLWA
18 changes: 17 additions & 1 deletion scripts/createProfile.sh
@@ -3,7 +3,23 @@
#
# Create custom consensus sequences based on domains desired.
#
# This program takes potential domains identified in the genome and compares
# them to known high-quality consensus sequences of that domain. For each
# potential domain the closest reference sequence is found, and the sequences are
# grouped by their closest reference. The grouped sequences are aligned
# and a protein consensus sequence is created for each group.
#
# Inputs:
# $1: Fasta file of sequences containing domain extracted from the genome
# $2: output directory
# $3: directory of this program
# $4: prefix of output file
# $5: Consensus sequences in fasta format, protein or nucleotide
# $6: Format of the consensus sequences, 'prot' or 'nucl'
#
# Outputs:
# $outputdir/$prefix.fa: Fasta file of species-specific consensus sequences (protein)
#
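# Illustrative usage (a sketch only, with placeholder paths; it mirrors how
# genome_search.sh calls this script later in this commit):
#   scripts/createProfile.sh first_pass/All_candidates.fa profiler/ /path/to/NLGenomeSweeper \
#       species_specific_domains pfam_NB-ARC.fa nucl
#   -> writes profiler/species_specific_domains.fa
#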

#hmm=$1
sequences=$1
@@ -56,5 +72,5 @@ do
fi
done
cat $outputdir/consensus_clusters.*.profile.fa > $outputdir/consensus_clusters.fa
remove_smalls $outputdir/consensus_clusters.fa $outputdir/species_specific_domains.fa 200
remove_smalls $outputdir/consensus_clusters.fa $outputdir/$prefix.fa 200

23 changes: 22 additions & 1 deletion scripts/find_genome_candidates.sh
@@ -1,7 +1,28 @@
#!/bin/bash


#
# Find candidate domains in a genome with blast.
#
# This is a subroutine for finding blast hits of a consensus domain in
# a genome sequence. The hits are merged across small gaps as well as
# across introns up to a specified maximum length. Merged hits must span
# at least 80% of the original domain sequence.
#
# Inputs:
# $1: genome file in fasta format to search
# $2: output directory
# $3: number of cores to use
# $4: Format of the consensus sequences, 'prot' or 'nucl'
# $5: Consensus sequences in fasta format, protein or nucleotide
# $6: The maximum size (bp) of introns
# $7: directory of this program
#
# Outputs:
# $outputdir/$outname.fa: Fasta sequences of candidate domains found in the genome
#
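# Illustrative usage (a sketch only; the paths, core count and intron size are
# placeholders following the argument order documented above):
#   scripts/find_genome_candidates.sh genome.fa first_pass/ 4 prot \
#       NB-ARC_consensus.fa 1000 /path/to/NLGenomeSweeper
#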



genome=$1
outputdir=$2
@@ -24,7 +45,7 @@ elif [ $format == 'nucl' ]; then
-evalue 1e-4 -num_threads $cores
fi

# Merge candidates within 10 bp and then combine across introns (default <500bp)
# Merge candidates within 10 bp and then combine across introns
awk '{if ($9>$10){tmp=$9;$9=$10;$10=tmp} print $2,$9,$10}' $outputdir/genome_blast.txt |sed 's/ /\t/g' \
| bedtools sort | bedtools merge -d 10 | awk '{print $1,$2,$3,$3-$2}' |sed 's/ /\t/g' > $outputdir/genome_blast.merge.txt
merge_exons $outputdir/genome_blast.merge.txt $outputdir/genome_blast.merge2.txt $intron
24 changes: 19 additions & 5 deletions scripts/genome_search.sh
@@ -1,14 +1,26 @@
#!/bin/bash

#
# Search a genome file for NBS-LRR genes using consensus domain sequence(s).
# Search a genome file for NBS-LRR genes using consensus sequence(s) of the NB-ARC domain.
#
# A two-pass method is used. The first pass uses blast to find candidate sequences in the
# genome based on the reference consensus sequences. These sequences are used to build
# species-specific consensus sequences for each class of NBS-LRR genes. A second
# pass is done to identify candidate sequences using blast with the species-specific consensus
# sequences. The output is the candidate sites of the NB-ARC domains and a fasta file of
# the sites plus 10 kb of flanking sequence for use with interproscan.
#
# Inputs:
# $1: genome file in fasta format to search
# $2: output directory
# $3: directory of this program
# $4: number of cores to use
# $5: The maximum size (bp) of introns
#
# Outputs:
# $outputdir/candidate_sites.bed: Locations of candidate NB-ARC domain sequences found
# $outputdir/Candidate_sites.with_flanking.fa: Sequences of candidate sites extended by 10 kb
#
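# Illustrative usage (a sketch with placeholder paths; the NLGenomeSweeper driver
# invokes this script with its genome search folder, the requested thread count and
# a max_intron_len of 1000 bp):
#   scripts/genome_search.sh genome.fa output/genome_search/ /path/to/NLGenomeSweeper 8 1000
#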

genome=$1
outputdir=$2
@@ -18,8 +30,10 @@ intron=$5

source $programdir/scripts/functions.sh

# Log files
sdout=$outputdir/NLGenomeSweeper.out
errout=$outputdir/NLGenomeSweeper.err
# Prefix for output files
outname="All_candidates"

function exit_error {
@@ -45,15 +59,15 @@ Any problems can be submitted via the GitHub page https://github.com/ntoda03/NLG
" | tee -a $sdout


## Check dependencies
## Check dependencies, exit if not found
echo "Checking software requirements..." | tee -a $sdout
DEPENDENCIES=(samtools bedtools blastp blastx TransDecoder.LongOrfs TransDecoder.Predict muscle interproscan hmmbuild)
dep_flag=1
for dependency in "${DEPENDENCIES[@]}"
do
if ! which $dependency &> /dev/null; then
echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout
depflag=0
echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout
dep_flag=0
fi
done
if [ $dep_flag -eq 0 ]; then
@@ -84,7 +98,7 @@ fi
echo "Creating species and class specific custom sequences..." | tee -a $sdout
mkdir -p $outputdir/profiler
$programdir/scripts/createProfile.sh $outputdir/first_pass/$outname.fa $outputdir/profiler $programdir \
species_specific $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout
species_specific_domains $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout
if [ -s "$outputdir/profiler/species_specific_domains.fa" ]
then
echo -e "Custom profiles complete.\n" | tee -a $sdout
38 changes: 38 additions & 0 deletions scripts/mergeBlastHits.R
@@ -0,0 +1,38 @@
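# Merge consecutive rows of a tabular BLAST report. Column meanings below assume the
# standard -outfmt 6 order (V1 query, V2 subject, V3 percent identity, V4 alignment
# length, V9/V10 subject start/end); only neighbouring rows are compared, so the input
# is assumed to be sorted such that mergeable hits are adjacent. Two rows are merged
# when they hit the same subject and (unless blasttype == 'any') come from the same
# query, using one of three methods:
#   overlap    - subject starts and subject ends both lie within 'gap' bp of each other
#   flanking   - the next hit starts within 'gap' bp of the current hit's end;
#                identity becomes a length-weighted average and lengths are summed
#   anyoverlap - the next hit starts at or before the current hit's end
#
# Args: <infile> <outfile> <gap (bp)> <blasttype ('any' to merge across queries)> <method>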

options(stringsAsFactors = FALSE)

m_opt = c(commandArgs()[3:length(commandArgs())])

infile=m_opt[1]
outfile=m_opt[2]
gap=as.numeric(m_opt[3])
blasttype=m_opt[4]
method=m_opt[5]

mytab=read.table(infile,sep="\t")

check=(nrow(mytab)-1)
j=1
while( j<=check ){
#while( mytab$V2[j] != "000094F_pilon"){
if( mytab$V2[j]==mytab$V2[j+1] && (mytab$V1[j]==mytab$V1[j+1] || blasttype=='any') ){
if( (method=="overlap") && ( (abs(mytab$V9[j+1]-mytab$V9[j])<gap) && (abs(mytab$V10[j+1]-mytab$V10[j])<gap) ) ){
mytab$V10[j]=mytab$V10[j+1]
mytab=mytab[-(j+1),]
} else if( (method=="flanking") && ( (abs(mytab$V9[j+1]-mytab$V10[j])<gap) ) ){
mytab$V10[j]=mytab$V10[j+1]
mytab$V3[j]=((mytab$V3[j]*mytab$V4[j])+(mytab$V3[j+1]*mytab$V4[j+1]))/(mytab$V4[j]+mytab$V4[j+1])
mytab$V4[j]=mytab$V4[j]+mytab$V4[j+1]
mytab=mytab[-(j+1),]
} else if( (method=="anyoverlap") && (mytab$V9[j+1] <= mytab$V10[j]) ){
mytab$V10[j]=mytab$V10[j+1]
mytab=mytab[-(j+1),]
} else{
j=j+1}
}
else{
j=j+1}
check=(nrow(mytab)-1)
}
write.table(mytab,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE)

25 changes: 25 additions & 0 deletions scripts/mergeExons.R
@@ -0,0 +1,25 @@
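# Merge adjacent intervals separated by small gaps, e.g. blast hits of one domain split
# across short introns. Consecutive rows on the same sequence (V1) are merged when the
# next start (V2) falls within 'gap' bp of the current end (V3), and the length column
# (V4) is summed. Input and output are tab-separated: seqid, start, end, length.
# Presumably wrapped by the merge_exons helper sourced from scripts/functions.sh
# (see find_genome_candidates.sh in this commit).
#
# Args: <infile> <outfile> <gap (bp)>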

options(stringsAsFactors = FALSE)

m_opt = c(commandArgs()[3:length(commandArgs())])

infile=m_opt[1]
outfile=m_opt[2]
gap=as.numeric(m_opt[3])

mytab=read.table(infile,sep="\t")

check=(nrow(mytab)-1)
j=1
while( j<=check ){
if( (mytab$V1[j] == mytab$V1[j+1]) && (abs(mytab$V2[j+1] - mytab$V3[j]) <= gap) ){
mytab$V3[j] = mytab$V3[j+1]
mytab$V4[j] = mytab$V4[j] + mytab$V4[j+1]
mytab = mytab[-(j+1),]
}
else{
j=j+1}
check=(nrow(mytab)-1)
}
write.table(mytab,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE)
