#!/usr/bin/env bash
#
# projectScript.sh - rank proteomes by their hsp70 and mcrA content.
#
# For each gene (hsp70, mcrA) the reference sequences are concatenated,
# aligned with muscle and turned into an HMM profile with hmmbuild. Every file
# in proteomes/ is then searched with both profiles and the hit counts are
# written to summary.csv. Finally, the proteomes that have at least one hit
# for BOTH genes are ranked by hsp70 hits and the top four names are saved.
#
# Inputs (relative to the current working directory):
#   ref_sequences/hsp70*.fasta   reference hsp70 sequences
#   ref_sequences/mcrA*.fasta    reference mcrA sequences
#   proteomes/*                  one sequence file per proteome
#
# Outputs (created or overwritten in the current working directory):
#   summary.csv      header "proteome, hsp70_matches, mcrA_matches" followed by
#                    one "name, count, count" row per proteome
#   candidates.txt   up to four proteome names, highest hsp70 count first
#   <gene>seq.fasta, <gene>align.afa, <gene>profile.hmm   intermediate files
#
# Environment:
#   TOOLS_DIR   directory holding the muscle, hmmbuild and hmmsearch binaries
#               (default: ~/Private/Biocomputing2022/Tools)

set -euo pipefail

readonly TOOLS_DIR="${TOOLS_DIR:-${HOME}/Private/Biocomputing2022/Tools}"
readonly GENES=(hsp70 mcrA)
readonly SUMMARY_FILE="summary.csv"
readonly CANDIDATES_FILE="candidates.txt"
readonly MAX_CANDIDATES=4

# Prints a message to stderr and aborts the script.
die() {
  printf 'projectScript: %s\n' "$*" >&2
  exit 1
}

#######################################
# Builds <gene>profile.hmm from ref_sequences/<gene>*.fasta.
# Arguments: $1 - gene name used as file-name prefix (e.g. hsp70)
# Outputs:   writes <gene>seq.fasta, <gene>align.afa and <gene>profile.hmm
#######################################
build_profile() {
  local gene=$1
  local refs=(ref_sequences/"${gene}"*.fasta)

  # Without nullglob an unmatched pattern stays literal; catch that here
  # instead of letting cat fail with a confusing message.
  [[ -e "${refs[0]}" ]] \
    || die "no reference sequences match ref_sequences/${gene}*.fasta"

  cat -- "${refs[@]}" > "${gene}seq.fasta"
  "${TOOLS_DIR}/muscle" -in "${gene}seq.fasta" -out "${gene}align.afa"
  "${TOOLS_DIR}/hmmbuild" "${gene}profile.hmm" "${gene}align.afa"
}

#######################################
# Prints the number reported on hmmsearch's "Domain search space" line.
# Arguments: $1 - HMM profile file
#            $2 - proteome sequence file
# Outputs:   the match count on stdout
#######################################
count_matches() {
  local profile=$1
  local proteome=$2
  local count

  # The count is preceded by a run of padding spaces whose length varies with
  # the number of digits, so a fixed single-space field index (cut -f 20) only
  # works for one-digit counts. awk splits on whitespace runs, which keeps the
  # count in field 5 regardless of its width. awk reads the whole stream (no
  # early exit) so hmmsearch never sees a closed pipe under pipefail.
  count=$("${TOOLS_DIR}/hmmsearch" "${profile}" "${proteome}" \
    | awk '/Domain search space/ { n = $5 } END { print n }') \
    || die "hmmsearch failed for ${profile} on ${proteome}"

  [[ "${count}" =~ ^[0-9]+$ ]] \
    || die "could not read a match count for ${profile} on ${proteome}"

  printf '%s\n' "${count}"
}

main() {
  local gene
  for gene in "${GENES[@]}"; do
    build_profile "${gene}"
  done

  # Start a fresh summary file with the column headings.
  printf '%s\n' 'proteome, hsp70_matches, mcrA_matches' > "${SUMMARY_FILE}"

  local proteome name hsp70_hits mcra_hits
  local processed=0
  for proteome in proteomes/*; do
    [[ -f "${proteome}" ]] || continue

    # Proteome name = file name without directory and without ".fasta" suffix.
    name=${proteome##*/}
    name=${name%.fasta}

    hsp70_hits=$(count_matches hsp70profile.hmm "${proteome}") || exit 1
    mcra_hits=$(count_matches mcrAprofile.hmm "${proteome}") || exit 1

    printf '%s, %s, %s\n' "${name}" "${hsp70_hits}" "${mcra_hits}" \
      >> "${SUMMARY_FILE}"
    processed=$((processed + 1))
  done

  (( processed > 0 )) || die "no proteome files found in proteomes/"

  # Skip the header, keep proteomes with hits for both genes, rank by hsp70
  # count (descending, ties broken by name) and keep the first few names.
  # The final awk consumes all of sort's output, so no stage is cut short.
  awk -F', ' 'NR > 1 && $2 > 0 && $3 > 0' "${SUMMARY_FILE}" \
    | sort -t, -k2,2nr -k1,1 \
    | awk -F, -v limit="${MAX_CANDIDATES}" 'NR <= limit { print $1 }' \
    > "${CANDIDATES_FILE}"
}

main "$@"