Improved comments
Modularized interproscan script.
Nicholas TODA committed Jun 21, 2020
1 parent 149fcb9 commit 5755a59
Showing 11 changed files with 403 additions and 188 deletions.
32 changes: 17 additions & 15 deletions NLGenomeSweeper
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

import subprocess
import sys
@@ -48,9 +48,12 @@ profiler_file='data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa'
# Output file names
annotated_candidates = 'Annotated_candidates.bed'
unannotated_candidates = 'All_candidates.bed'
annotated_gff= 'All_candidates.gff3'
consensus_file='Species_specific_consensus.fa'
filter_candidates='Filtered_candidates.bed'
annotated_gff = 'All_candidates.gff3'
consensus_file = 'Species_specific_consensus.fa'
filter_candidates = 'Filtered_candidates.bed'

# We'll look to merge domains across introns smaller than this
max_intron_len = 1000

# Append the contents of an input file to an output file
def append_file(input_file, output_file):
@@ -62,7 +65,9 @@ def append_file(input_file, output_file):

##########################################################################
#
# identify_candidates_proteins
# DEPRECATED - no protein search done
#
# identify_candidates_proteins
#
# Arguments:
# program_dir - directory of this program
@@ -85,7 +90,7 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir):
os.mkdir(outdir + search_folder + protein_search_folder)
subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir, protein, gff,
program_dir + nbarc_hmm,
outdir + search_folder + protein_search_folder, program_dir), shell=True)
outdir + search_folder + protein_search_folder, program_dir), shell=True, check=True)
shutil.copy(outdir + search_folder + protein_search_folder + annotated_candidates,
outdir + annotated_candidates)

@@ -95,11 +100,6 @@ def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir):
subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir,protein,gff,
hmm,outdir + search_folder + protein_custom_folder, program_dir), shell=True)
# Concatenate to the pfam search results
#annotated1 = pd.read_csv(outdir + search_folder + protein_custom_folder +annotated_candidates,
# sep='\t',header=None)
#annotated2 = pd.read_csv(outdir + '/Annotated_candidates.bed', sep='\t',header=None)
#annotated = annotated1.append(annotated2, ignore_index=True).drop_duplicates()
#annotated.to_csv(outdir + '/Annotated_candidates.bed', sep='\t', header =False, index = False )
append_file(outdir + search_folder + protein_custom_folder + annotated_candidates,
outdir + annotated_candidates)

@@ -154,7 +154,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in
# outdir + search_folder + genome_search_folder + 'candiates_sites.bed')
# Run the genome based search
subprocess.run('{}/scripts/genome_search.sh {} {} {} {} {}'.format(program_dir,genome,
outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True)
outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True, check=True)
#shutil.copy(outdir + search_folder + genome_search_folder + unannotated_candidates,
# outdir + unannotated_candidates)
shutil.copy(outdir + search_folder + genome_search_folder + consensus_file,
@@ -187,7 +187,7 @@ def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, in
##########################################################################
def run_interproscan(program_dir, outdir, t):
subprocess.run('{}/scripts/region_interproscan.sh {} {} {}'.format(program_dir, outdir + domain_folder, t, program_dir),
shell=True)
shell=True, check=True)
#shutil.copy(outdir + domain_folder + annotated_gff,
# outdir + annotated_gff)
#shutil.copy(outdir + domain_folder + unannotated_candidates,
@@ -214,12 +214,14 @@ def main(argv):
help="A genome sequence in fasta format to search.")

input_group = parser.add_argument_group('Input arguments')
# DEPRECATED - no protein search
# input_group.add_argument("-protein", metavar="<fasta file>", type=str, default = None,
# help="Protein sequences in fasta format.")
# input_group.add_argument("-gff", metavar="<gff3 file>", type=str, default = None,
# help="gff3 annotation of the genome. Required if searching protein sequences.")
input_group.add_argument("-consensus", metavar="<fasta file>", type=str, default = None,
help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional).")
help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional)." +
"It is not recommended to use this option since species specific consensuses will be created.")
# input_group.add_argument("-hmm", metavar="<file>", type=str, default = None,
# help="Also search the protein file using a custom NB-ARC HMM.")

@@ -279,7 +281,7 @@ def main(argv):
# Search protein file using hmmer
## identify_candidates_proteins(program_dir,args.protein, args.gff, args.hmm, args.outdir )
# Search the genome using blast
identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, 1000)
identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, max_intron_len)
# Define ORFs and domains using interproscan
run_interproscan(program_dir, args.outdir + nlg_folder, args.t)

Empty file modified data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa
100644 → 100755
Empty file.
Empty file modified data/Vitis_vinifera_NB-ARC_consensus.fa
100644 → 100755
Empty file.
33 changes: 4 additions & 29 deletions data/combined_consensus.fa
100644 → 100755
@@ -4,52 +4,27 @@ TAKKEDLFKDILKELAKEADDTSVEKEESELKRKLRRLLLRKRFLLVLDDVWEEEDWEKI
GVPLPAKENRLRVLLTTRDEEVANAVSAKSEKIEVELLEKDEAWELLENKVFEKELSEDP
DLEEVAKEIVEKCKGLPLALKVIGGLLATKSTVKEWEKVAEQLNNKLAESRSELNEVLSI
LSLSYESLPEALKRCFLYLSLFPEDVEIKVKRLVKLWSAEGFVESEDI
>vvinCNL_custom_NB-ARC-consensus.genomewide
>vvinCNL-consensus
LLLLLEEEEEQLSVISIVGMGGLGKTTLAKLVYNDEEVKKHFDLRAWVCVSEEFDVKELL
KEILKSLKSEKLESMELEELQEKLKELLKEKRYLLVLDDVWNEDKWEELKEALKEGAKGS
RIIVTTRNEEVASIMKTISPYELEKLSEEESWSLFKKKAFKEEEEEECPELEELGKEIVK
KCKGLPLAIKALGGLLSSKKDEEEWSKVLDSELWELEEEESSILKILKLSYNDLPSHLKQ
CFLYCSLFPKDYEIKKEELIQLWMAEGFVQ
>vvinNL_custom_NB-ARC-consensus.genomewide
>vvinNL-consensus
EEILEALKDDNINMIGVYGMGGVGKTTLVKEVAKKVKEKKLFDEVIIVTVSKKPDLRKIQ
EEIADKLKLKLEKEESESERADKLRERLKKSKKILLILDDVWEKLDLEEVGIPEEDKEKG
CKIVLTTRNRDVCSEMKTQKEIKLEVLSEEEAWKLFEKKVGDVLNSPNLKSIAKEVVKEC
AGLPLAIVTVAKALKNKKDVEVWEDALQQLRRSALKNIKGMEEKVYSALKLSYNKLESEE
LKSCFLFCALFPEDYEIEIEELIEYLIAEKLI
>vvinRPW8_custom_NB-ARC-consensus.genomewide
>vvinRPW8-consensus
KELKELLLKEEEKLLLLTALGGCGKTTLVKKLCKDDEIKGIFKDNILFVNVSKTPNLKVI
VQKLFKYKKEEELEFQSDEDAINQLEQLLNKIGSNPILLILDDVWPGSESLLEKFKFDIP
KYKILVTSRTAFPRFKFTYELKPLNDEDAMSLFRHSASLQDGSSYIPEEVIKKIVRGCGG
LPLALKVIGGSLREQKAEVWLSKLEKWSEGESIFESDEELLDCLQKSLEFSDDKVILKEC
FLDLGSFPEDQRIPVAALIDMWA
>vvinTNL_custom_NB-ARC-consensus.genomewide
>vvinTNL-consensus
ELESLLDIESNDVRLVGILGLGGIGKTTLAKAVYNKISKKFEASSFLENVREKSKKEGLV
KLQEQLLSEILKEKELKVKNVSEGINVIKKRLRKKKVLLVLDDVDKLEQLEKLAGKKDWF
GSGSRIIITTRDKHLLKTLEVDEIYEVKELNKDEALELFSLKAFKKEKPEEDYLELSKKV
VKYAKGLPLALKVLGSDLFGRSIDEWKSALEKLKEIPEKEILEILKISYDGLEETEKEIF
LDIACFFKG

>vvinCNL_custom_NB-ARC-consensus
ELEELGKKLLANEGGEKLLVISIVGMGGLGKTTLAQKVYNDKHVKTHFDLRAWVCVSDEF
DLKEILKKILESVSKSSESEDLEELQEKLKELLKEKRFLLVLDDVWNEDEKWEELKTLLP
DGANGSKIIVTTRNKSVASVMKTSSIHELKLLSDEESWSLFTKKAFENGESSAPPELEEI
GKEIVKKCGGLPLAIKALGGLLSSKKEEEEWEKVLNSEIWDLEEEEEILPALKLSYNDLP
SHLKRCFLYCSLFPKDYEIKKEELILLWVAEGLVQE
>vvinTNL_custom_NB-ARC-consensus
EELKSLLKLESNDVRMVGIYGIGGIGKTTIAKAIYNKISAQFEGVSFLENVREKSKKKGL
LQLQKKLLKDILKEKKLKVSNVEEGLKVIKKRLSSKKVLIVLDDVDELEQLEALAGEKDW
FGKGSRIIITTRDKELLEEHEVDELYEVKKLNKEEALELFSLYAFKQELPKEDYEELSKE
IVEYAKGLPLALKVLGSLLFKKTIKEWESELEKLKKEPEKEIQNVLKISYDGLDETEKEI
FLDIACFFKG
>vvinNL_custom_NB-ARC-consensus
EKVMEALEDEKVRVIGIVGMGGVGKTTLVKQVYNDAKVKKLFDLVIWVTVSKEFDLEKIQ
KEILEKLGLKDEKSESEEEKAEELKERLKEKKFLLVLDDVWEELDLEKLGIPLPDDKNGS
KIVLTTRSEDVCKEMEAQKKIELELLSEEEAWELFKKKAGESVESHPELEELAEEVVKEC
KGLPLAIVTLGRALKSKKTVEEWEKALEELKSSLSENIMEDKVLPVLKLSYDSLPSELKS
CFLYCALFPEDYEIEKEELIELWIAEGLLQ
>vvinRPW8_custom_NB-ARC-consensus
KELKKLLFKDDESRIVVSAPGGCGKTTLAKKLCKDQQVKEYFKDSILYVTVSKTANLIEI
IKKLLKENAEEVKEFQNEEDAVNQLERKLKRKVESGRILLVLDDVWSGSESVLEKLKFKI
SELKVLVTSRNEFPKFGSTYELELLEEEEAKELFRHSAIEEDGSEDSDLDEELVKEIVRR
CKGLPLALEVVGRSLAGKPVEIWRSTLEKLSEGESIVESEDELLECLQSSLDALDDKDML
KECFLDLGSFPEDKKIPVTALIDLWA
18 changes: 17 additions & 1 deletion scripts/createProfile.sh
@@ -3,7 +3,23 @@
#
# Create custom consensus sequences based on domains desired.
#
# This program takes potential domains identified in the genome and compares
# them to known high-quality consensus sequences of that domain. For each
# potential domain the closest reference sequence is found, and the sequences are
# grouped by their closest reference. The grouped sequences are aligned
# and a protein consensus sequence is created for each group.
#
# Inputs:
# $1: Fasta file of sequences containing domain extracted from the genome
# $2: output directory
# $3: directory of this program
# $4: prefix of output file
# $5: Consensus sequences in fasta format, protein or nucleotide
# $6: Format of the consensus sequences, 'prot' or 'nucl'
#
# Outputs:
# $outputdir/$prefix.fa: Fasta file of species-specific consensus sequences (protein)
#
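# Illustrative usage (a sketch only, with placeholder paths; it mirrors how
# genome_search.sh calls this script later in this commit):
#   scripts/createProfile.sh first_pass/All_candidates.fa profiler/ /path/to/NLGenomeSweeper \
#       species_specific_domains pfam_NB-ARC.fa nucl
#   -> writes profiler/species_specific_domains.fa
#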

#hmm=$1
sequences=$1
@@ -56,5 +72,5 @@ do
fi
done
cat $outputdir/consensus_clusters.*.profile.fa > $outputdir/consensus_clusters.fa
remove_smalls $outputdir/consensus_clusters.fa $outputdir/species_specific_domains.fa 200
remove_smalls $outputdir/consensus_clusters.fa $outputdir/$prefix.fa 200

23 changes: 22 additions & 1 deletion scripts/find_genome_candidates.sh
@@ -1,7 +1,28 @@
#!/bin/bash


#
# Find candidate domains in a genome with blast.
#
# This is a subroutine for finding blast hits of a consensus domain in
# a genome sequence. The hits are merged across small gaps as well as
# across introns up to a specified maximum length. Merged hits must span
# at least 80% of the original domain sequence.
#
# Inputs:
# $1: genome file in fasta format to search
# $2: output directory
# $3: number of cores to use
# $4: Format of the consensus sequences, 'prot' or 'nucl'
# $5: Consensus sequences in fasta format, protein or nucleotide
# $6: The maximum size (bp) of introns
# $7: directory of this program
#
# Outputs:
# $outputdir/$outname.fa: Fasta sequences of candidate domains found in the genome
#
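# Illustrative usage (a sketch only; the paths, core count and intron size are
# placeholders following the argument order documented above):
#   scripts/find_genome_candidates.sh genome.fa first_pass/ 4 prot \
#       NB-ARC_consensus.fa 1000 /path/to/NLGenomeSweeper
#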



genome=$1
outputdir=$2
@@ -24,7 +45,7 @@ elif [ $format == 'nucl' ]; then
-evalue 1e-4 -num_threads $cores
fi

# Merge candidates within 10 bp and then combine across introns (default <500bp)
# Merge candidates within 10 bp and then combine across introns
awk '{if ($9>$10){tmp=$9;$9=$10;$10=tmp} print $2,$9,$10}' $outputdir/genome_blast.txt |sed 's/ /\t/g' \
| bedtools sort | bedtools merge -d 10 | awk '{print $1,$2,$3,$3-$2}' |sed 's/ /\t/g' > $outputdir/genome_blast.merge.txt
merge_exons $outputdir/genome_blast.merge.txt $outputdir/genome_blast.merge2.txt $intron
24 changes: 19 additions & 5 deletions scripts/genome_search.sh
@@ -1,14 +1,26 @@
#!/bin/bash

#
# Search a genome file for NBS-LRR genes using consensus domain sequence(s).
# Search a genome file for NBS-LRR genes using consensus sequence(s) of the NB-ARC domain.
#
# A two-pass method is used. The first pass uses blast to find candidate sequences in the
# genome based on the reference consensus sequences. These sequences are used to build
# species-specific consensus sequences for each class of NBS-LRR genes. A second
# pass is done to identify candidate sequences using blast with the species-specific consensus
# sequences. The output is the candidate sites of the NB-ARC domains and a fasta file of
# the sites plus 10 kb of flanking sequence for use with interproscan.
#
# Inputs:
# $1: genome file in fasta format to search
# $2: output directory
# $3: directory of this program
# $4: number of cores to use
# $5: The maximum size (bp) of introns
#
# Outputs:
# $outputdir/candidate_sites.bed: Locations of candidate NB-ARC domain sequences found
# $outputdir/Candidate_sites.with_flanking.fa: Sequences of candidate sites extended by 10 kb
#
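# Illustrative usage (a sketch with placeholder paths; the NLGenomeSweeper driver
# invokes this script with its genome search folder, the requested thread count and
# a max_intron_len of 1000 bp):
#   scripts/genome_search.sh genome.fa output/genome_search/ /path/to/NLGenomeSweeper 8 1000
#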

genome=$1
outputdir=$2
@@ -18,8 +30,10 @@ intron=$5

source $programdir/scripts/functions.sh

# Log files
sdout=$outputdir/NLGenomeSweeper.out
errout=$outputdir/NLGenomeSweeper.err
# Prefix for output files
outname="All_candidates"

function exit_error {
@@ -45,15 +59,15 @@ Any problems can be submitted via the GitHub page https://github.com/ntoda03/NLG
" | tee -a $sdout


## Check dependencies
## Check dependencies, exit if not found
echo "Checking software requirements..." | tee -a $sdout
DEPENDENCIES=(samtools bedtools blastp blastx TransDecoder.LongOrfs TransDecoder.Predict muscle interproscan hmmbuild)
dep_flag=1
for dependency in "${DEPENDENCIES[@]}"
do
if ! which $dependency &> /dev/null; then
echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout
depflag=0
echo "The dependency" $dependency " could not be found. Ensure that is installed and in your path." | tee -a $errout
dep_flag=0
fi
done
if [ $dep_flag -eq 0 ]; then
@@ -84,7 +98,7 @@ fi
echo "Creating species and class specific custom sequences..." | tee -a $sdout
mkdir -p $outputdir/profiler
$programdir/scripts/createProfile.sh $outputdir/first_pass/$outname.fa $outputdir/profiler $programdir \
species_specific $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout
species_specific_domains $outputdir/pfam_NB-ARC.fa nucl >> $sdout 2>> $errout
if [ -s "$outputdir/profiler/species_specific_domains.fa" ]
then
echo -e "Custom profiles complete.\n" | tee -a $sdout
38 changes: 38 additions & 0 deletions scripts/mergeBlastHits.R
@@ -0,0 +1,38 @@
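# Merge consecutive rows of a tabular BLAST report. Column meanings below assume the
# standard -outfmt 6 order (V1 query, V2 subject, V3 percent identity, V4 alignment
# length, V9/V10 subject start/end); only neighbouring rows are compared, so the input
# is assumed to be sorted such that mergeable hits are adjacent. Two rows are merged
# when they hit the same subject and (unless blasttype == 'any') come from the same
# query, using one of three methods:
#   overlap    - subject starts and subject ends both lie within 'gap' bp of each other
#   flanking   - the next hit starts within 'gap' bp of the current hit's end;
#                identity becomes a length-weighted average and lengths are summed
#   anyoverlap - the next hit starts at or before the current hit's end
#
# Args: <infile> <outfile> <gap (bp)> <blasttype ('any' to merge across queries)> <method>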

options(stringsAsFactors = FALSE)

m_opt = c(commandArgs()[3:length(commandArgs())])

infile=m_opt[1]
outfile=m_opt[2]
gap=as.numeric(m_opt[3])
blasttype=m_opt[4]
method=m_opt[5]

mytab=read.table(infile,sep="\t")

check=(nrow(mytab)-1)
j=1
while( j<=check ){
#while( mytab$V2[j] != "000094F_pilon"){
if( mytab$V2[j]==mytab$V2[j+1] && (mytab$V1[j]==mytab$V1[j+1] || blasttype=='any') ){
if( (method=="overlap") && ( (abs(mytab$V9[j+1]-mytab$V9[j])<gap) && (abs(mytab$V10[j+1]-mytab$V10[j])<gap) ) ){
mytab$V10[j]=mytab$V10[j+1]
mytab=mytab[-(j+1),]
} else if( (method=="flanking") && ( (abs(mytab$V9[j+1]-mytab$V10[j])<gap) ) ){
mytab$V10[j]=mytab$V10[j+1]
mytab$V3[j]=((mytab$V3[j]*mytab$V4[j])+(mytab$V3[j+1]*mytab$V4[j+1]))/(mytab$V4[j]+mytab$V4[j+1])
mytab$V4[j]=mytab$V4[j]+mytab$V4[j+1]
mytab=mytab[-(j+1),]
} else if( (method=="anyoverlap") && (mytab$V9[j+1] <= mytab$V10[j]) ){
mytab$V10[j]=mytab$V10[j+1]
mytab=mytab[-(j+1),]
} else{
j=j+1}
}
else{
j=j+1}
check=(nrow(mytab)-1)
}
write.table(mytab,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE)

25 changes: 25 additions & 0 deletions scripts/mergeExons.R
@@ -0,0 +1,25 @@
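# Merge adjacent intervals separated by small gaps, e.g. blast hits of one domain split
# across short introns. Consecutive rows on the same sequence (V1) are merged when the
# next start (V2) falls within 'gap' bp of the current end (V3), and the length column
# (V4) is summed. Input and output are tab-separated: seqid, start, end, length.
# Presumably wrapped by the merge_exons helper sourced from scripts/functions.sh
# (see find_genome_candidates.sh in this commit).
#
# Args: <infile> <outfile> <gap (bp)>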

options(stringsAsFactors = FALSE)

m_opt = c(commandArgs()[3:length(commandArgs())])

infile=m_opt[1]
outfile=m_opt[2]
gap=as.numeric(m_opt[3])

mytab=read.table(infile,sep="\t")

check=(nrow(mytab)-1)
j=1
while( j<=check ){
if( (mytab$V1[j] == mytab$V1[j+1]) && (abs(mytab$V2[j+1] - mytab$V3[j]) <= gap) ){
mytab$V3[j] = mytab$V3[j+1]
mytab$V4[j] = mytab$V4[j] + mytab$V4[j+1]
mytab = mytab[-(j+1),]
}
else{
j=j+1}
check=(nrow(mytab)-1)
}
write.table(mytab,outfile,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE)
