Merge pull request #46 from Darcy220606/dev

Add release 0.1.5 updates
Darcy220606 · Oct 27, 2022 · 45f073f · 45f073f
2 parents 6e5319e + 311d8c0
commit 45f073f
Show file tree

Hide file tree

Showing 11 changed files with 180 additions and 31 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1 +1,12 @@
-v<version>, <date> -- Initial release.
+v<0.1.0>, <date> -- Initial release.
+v<0.1.1>, <date> -- minor changes.
+v<0.1.2>, <date> -- minor changes.
+v<0.1.3>, <10.10.2022> -- PyPi package and conda-recipe / biocontainer release.
+v<0.1.4>, <18.10.2022>
+    -- Included a new optional argument "--complete_summary" to concatenate the results from multiple samples in one table
+    -- Added a universal log file, to append to an existing log file rather than creating multiple new ones every time a sample is run.
+    -- The "--path_list" can be called multiple times to include a list of files from individual samples in multiple lists
+v<0.1.5>, <27.10.2022>
+    -- adapt reading of hmmer_hmmsearch output to deal with varying header lines
+    -- fix syntax in "if" statements in "check_input.py"
+    -- include "check_faa_path" function, to find .faa files also in subdirectories
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # AMPcombi : AntiMicrobial Peptides parsing and functional classification tool
 
-# ![Logo](docs/amp-combi-logo.png)
+<img src="https://raw.githubusercontent.com/Darcy220606/AMPcombi/main/docs/amp-combi-logo.png" width="620" height="200" />
 
 This tool parses the results of antimicrobial peptide (AMP) prediction tools into a single table and aligns the hits against a reference AMP database for functional classifications.
 

diff --git a/ampcombi/ampcombi.py b/ampcombi/ampcombi.py
@@ -28,7 +28,7 @@
                                 add_help=True)
 
 parser.add_argument("--amp_results", dest="amp", nargs='?', help="Enter the path to the folder that contains the different tool's output files in sub-folders named by sample name. \n If paths are to be inferred, sub-folders in this results-directory have to be organized like '/amp_results/toolsubdir/samplesubdir/tool.sample.filetype' \n (default: %(default)s)",
-                    type=str, default="./test_files/")
+                    type=str, default='./test_files/')
 parser.add_argument("--sample_list", dest="samples", nargs='*', help="Enter a list of sample-names, e.g. sample_1 sample_2 sample_n. \n If not given, the sample-names will be inferred from the folder structure",
                     default=[])
 parser.add_argument("--path_list", dest="files", nargs='*', action='append', help="Enter the list of paths to the files to be summarized as a list of lists, e.g. --path_list path/to/my/sample1.ampir.tsv path/to/my/sample1.amplify.tsv --path_list path/to/my/sample2.ampir.ts path/to/my/sample2.amplify.tsv. \n If not given, the file-paths will be inferred from the folder structure",
@@ -73,7 +73,7 @@
 # MAIN FUNCTION
 #########################################
 def main_workflow():
-    # print_header()
+    # print AMPcombi header
     print_header()
     # check input parameters
     check_input_complete(path, samplelist_in, filepaths_in, tools)
@@ -83,7 +83,6 @@ def main_workflow():
     filepaths = check_pathlist(filepaths_in, samplelist, fileending, path)
     # check amp_ref_database filepaths and create a directory if input empty
     db = check_ref_database(database)
-
     # initiate a final_summary dataframe to concatenate each new sample-summary
     if (complete_summary):
         complete_summary_df = pd.DataFrame([])
@@ -98,11 +97,12 @@ def main_workflow():
         os.makedirs(samplelist[i], exist_ok=True)
         # fill main_list with tool-output filepaths for sample i
         read_path(main_list, filepaths[i], p, tooldict, faa_path, samplelist[i])
+        # get the path to the samples' corresponding faa file
+        faa_name = check_faa_path(faa_path, samplelist[i])
         # use main_list to create the summary file for sample i
-        summary_df = summary(main_list, samplelist[i], faa_path)
+        summary_df = summary(main_list, samplelist[i], faa_name)
         # Generate the AMP-faa.fasta for sample i
         out_path = samplelist[i] +'/'+samplelist[i]+'_amp.faa'
-        faa_name = faa_path+samplelist[i]+'.faa'
         amp_fasta(summary_df, faa_name, out_path)
         amp_faa_paths.append(out_path)
         print(f'The fasta containing AMP sequences for {samplelist[i]} was saved to {samplelist[i]}/ \n')
@@ -131,12 +131,10 @@ def main_workflow():
 def main():
     if (args.log_file == True and not os.path.exists('ampcombi.log')):
         with open(f'ampcombi.log', 'w') as f:
-            #print(f'AMPcombi version: {args.version}')
             with redirect_stdout(f):
                 main_workflow()
     elif(args.log_file == True and os.path.exists('ampcombi.log')):
         with open(f'ampcombi.log', 'a') as f:
-            #print(f'AMPcombi version: {args.version}')
             with redirect_stdout(f):
                 main_workflow()
     else: main_workflow()

diff --git a/ampcombi/check_input.py b/ampcombi/check_input.py
@@ -2,6 +2,7 @@
 
 import os
 import sys
+import pathlib
 from amp_database import download_DRAMP
 
 def check_samplelist(samplelist, tools, path):
@@ -22,13 +23,21 @@ def check_pathlist(filepaths, samplelist, fileending, path):
             pathlist = []
             for dirpath, subdirs, files in os.walk(path):
                 for file in files:
-                    if ((sample in dirpath)&((list(filter(file.endswith, fileending))!=[]))):
+                    if ((sample in dirpath) and ((list(filter(file.endswith, fileending))!=[]))):
                         pathlist.append(dirpath+'/'+file)
             filepaths.append(pathlist)
         return filepaths
     else:
         return filepaths
 
+def check_faa_path(faa_path, samplename):
+    """Return the only .faa file below faa_path whose name contains samplename, else exit."""
+    matches = list(pathlib.Path(faa_path).rglob(f"*{samplename}*.faa"))
+    if (len(matches)==1):
+        return matches[0]
+    sys.exit(f'AMPcombi interrupted: There is more than one .faa file for {samplename} in the folder given with --faa_path' if matches
+             else f'AMPcombi interrupted: There is no .faa file containing {samplename} in the folder given with --faa_path')
+
 def check_ref_database(database):
     if(database==None):
         print('<--AMP_database> was not given, the current DRAMP general-AMP database will be downloaded and used')
@@ -52,38 +61,39 @@ def check_directory_tree(path, tools, samplelist):
     print(f'Checking directory tree {path} for sub-directories \n ')
     # get first level of sub-directories, check if at least one is named by a tool-name
     subdirs_1 = [x for x in os.listdir(path) if x in tools]
-    if (subdirs_1 == []):
+    if (not subdirs_1):
         sys.exit(f'AMPcombi interrupted: First level sub-directories in {path} are not named by tool-names. Please check the directories names and the keys given in "--tooldict". \n ')
     else:
         print('First level sub-directories passed check.')
     # get second level of sub-directories, check if at least one is named by a sample-name
     subdirs_2 = []
     for dir in subdirs_1:
         subdirs = [x for x in os.listdir(path+dir) if x in samplelist]
-        if (subdirs != []):
+        if (subdirs):
             subdirs_2.append(subdirs)
-    if (subdirs_2 == []):
+    if (not subdirs_2):
         sys.exit(f'AMPcombi interrupted: Second level sub-directories in {path} are not named by sample-names. Please check the directories names and the names given as "--sample_list" \n ')
     else:
         print('Second level sub-directories passed check')
     print('Finished directory check')
 
 def check_input_complete(path, samplelist, filepaths, tools):
     # 1. Head folder does not exist and filepaths-list was not given
-    if((check_path(path)==False)&(filepaths==[])):
+    if((not check_path(path)) and (not filepaths)):
         sys.exit('AMPcombi interrupted: Please provide the correct path to either the folder containing all amp files to be summarized (--amp_results) or the list of paths to the files (--path_list)')
     # 2. Head folder does not exist, filepaths-list was given but no samplelist
-    elif((check_path(path)==False)&(filepaths!=[])&(samplelist==[])):
+    elif((not check_path(path)) and (filepaths) and (not samplelist)):
         sys.exit('AMPcombi interrupted: Please provide a list of sample-names (--sample_list) in addition to --path_list')
     # 3. Head folder does not exist, filepaths- and samplelist are given:
-    elif((check_path(path)==False)&(filepaths!=[])&(samplelist!=[])):
+    elif((not check_path(path)) and (filepaths) and (samplelist)):  # both lists must be non-empty, as in the replaced condition
         for file in filepaths:
+            # both checks below must pass for every entry, otherwise the run is aborted
             # 3.1. check if paths in filepath-list exist
-            if(check_path(file)==False):
+            if(not check_path(file)):
                 sys.exit(f'AMPcombi interrupted: The path {file} does not exist. Please check the --path_list input.')
             # 3.2. check if paths contain sample-names from samplelist
-            if(any(n in file for n in samplelist)==False):
+            if(not any(n in file for n in samplelist)):
                 sys.exit(f'AMPcombi interrupted: The path {file} does not contain any of the sample-names given in --sample_list')
     # 4. Head folder and sample-list are given
-    elif((check_path(path)==True)&(samplelist!=[])):
+    elif((check_path(path)) and (samplelist)):  # an empty sample list would make the directory-tree check exit
         check_directory_tree(path, tools, samplelist)
diff --git a/ampcombi/environment.yml b/ampcombi/environment.yml
@@ -17,6 +17,7 @@ dependencies:
   - et_xmlfile=1.0.1
   - icu=70.1
   - idna=3.4
+  - jsonschema=4.16.0
   - libblas=3.9.0
   - libcblas=3.9.0
   - libcxx=14.0.6

diff --git a/ampcombi/print_header.py b/ampcombi/print_header.py
@@ -2,7 +2,7 @@
 
 def print_header():
     print("""
-$$$$$$\  $$\      $$\ $$$$$$$\                                     $$\       $$\|
+ $$$$$$\  $$\      $$\ $$$$$$$\                                    $$\       $$\|
 $$  __$$\ $$$\    $$$ |$$  __$$\                                   $$ |      \__|
 $ /   $$ |$$$$\  $$$$ |$$ |  $$ | $$$$$$$\  $$$$$$\  $$$$$$\$$$$\  $$$$$$$\  $$\ 
 $$$$$$$$ |$$\$$\$$ $$ |$$$$$$$  |$$  _____|$$  __$$\ $$  _$$  _$$\ $$  __$$\ $$ |

diff --git a/ampcombi/reformat_tables.py b/ampcombi/reformat_tables.py
@@ -4,6 +4,7 @@
 
 import pandas as pd
 from Bio import SeqIO
+import os
 
 #########################################
 # FUNCTION: KEEP ONLY LINES WITH KEYWORD
@@ -100,14 +101,37 @@ def neubi(path, p):
     #  AMP_hmmsearch
 #########################################
 def hmmsearch(path):
-    hmmer_dict = {'level_0':'evalue_hmmer', 'level_1':'score_hmmer', 'level_2':'bias', 'level_3':'eval_domain', 'level_4':'score_domain', 'level_5':'bias_domain', 'level_6':'exp_dom', '-------':'N_dom', '------':'contig_id'}
-    hmmer_df = pd.read_table(path, delim_whitespace=True, header=[15]).reset_index().rename(columns=hmmer_dict)
-    hmmer_df = hmmer_df.drop(hmmer_df.iloc[:,9:17], axis=1) #drop unnecessary columns
-    for index, row in hmmer_df.iterrows():
-        if (row.str.contains('Domain').any()):              #identify index of first row with 'Domain'
-            i = index
-            break
-    hmmer_df = hmmer_df[hmmer_df.index<i]                   #only keep rows previous to index i
+    """Read a hmmsearch report and return its 'contig_id' and 'evalue_hmmer' columns.
+
+    Header rows are removed by keyword because their number varies between
+    reports; an empty dataframe is returned when the report lists no hits.
+    """
+    import io  # local import, only used for the in-memory buffer below
+    # list of words in header rows to be removed
+    key_words = ["# hmmsearch ::", "# HMMER ", "# Copyright (C) ", "# Freely distributed",
+               "# - - - ", "# query HMM file:", "# target sequence database:",
+               "# output directed to file:", "Query:", "Accession:",
+               "Description:", "Scores for complete sequences",  "--- full sequence",
+               "# number of worker threads:", "inclusion threshold", "E-value", "-------"]
+    no_hits = "[No hits detected that satisfy reporting thresholds]"
+    hmmer_dict = {0:'evalue_hmmer', 1:'score_hmmer', 2:'bias', 3:'eval_domain', 4:'score_domain', 5:'bias_domain', 6:'exp_dom', 7:'N_dom', 8:'contig_id'}
+    # keep only the lines that do not contain any of key_words
+    with open(path, "r") as fp:
+        body = "".join(line for line in fp if not any(phrase in line for phrase in key_words))
+    if no_hits in body:
+        print('The hmmersearch-file did not contain any hits')
+        hmmer_df = pd.DataFrame(columns=list(hmmer_dict.values()))
+    else:
+        # parse from memory instead of a fixed-name temporary file, which could collide or be left behind
+        hmmer_df = pd.read_table(io.StringIO(body), sep=r'\s+', header=None).reset_index().rename(columns=hmmer_dict).drop(columns = [9,10,11,12,13,14,15,16]).dropna()
+        # cut off the footer: every row from the first one containing '#' (nothing is cut if there is none)
+        footer_start = None
+        for index, row in hmmer_df.iterrows():
+            if (row.str.contains('#').any()):
+                footer_start = index
+                break
+        if footer_start is not None:
+            hmmer_df = hmmer_df[hmmer_df.index<footer_start]
     return hmmer_df[['contig_id', 'evalue_hmmer']]
 
 #########################################
@@ -158,7 +182,7 @@ def summary(df_list, samplename, faa_path):
     #replace all NAs (where a tool did not identify the contig as AMP) with 0
     merge_df = merge_df.fillna(0)
     #add amino-acid sequences
-    faa_df = faa2table(faa_path+samplename+'.faa')
+    faa_df = faa2table(faa_path)
     merge_df = merge_df.merge(faa_df, how='inner', on='contig_id')
     # sort by sum of p-values over rows
     merge_df = merge_df.set_index('contig_id')

diff --git a/ampcombi/version.py b/ampcombi/version.py
@@ -1 +1 @@
-__version__ = '0.1.4'
+__version__ = '0.1.5'
diff --git a/dist/AMPcombi-0.1.4.tar.gz b/dist/AMPcombi-0.1.4.tar.gz
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='AMPcombi',
-    version='0.1.4',
+    version='0.1.5',
     author='Anan Ibrahim, Louisa Perelo',
     author_email='[email protected], [email protected]',
     packages=['ampcombi'],

diff --git a/shinyapp/shinyapp_html_file.R b/shinyapp/shinyapp_html_file.R
@@ -0,0 +1,105 @@
+#!/usr/bin/env Rscript
+
+##############################
+# Rscript to visualise the complete summary tables generated by AMPcombi ####
+##############################
+# Date ####
+# October, 19 2022
+##############################
+# Authors ####
+# Anan Ibrahim
+# [email protected] 
+# @darcy220606
+##############################
+# Working_directory ####
+##############################
+#setwd("/home/aibrahim/github/testing_ampcombi_on_deepevo")
+##############################
+# Libraries used + arguments ####
+library(dplyr)
+library(DT)
+library(shiny)
+library(data.table)
+library(ggplot2)
+library("optparse")
+library(htmlwidgets)
+
+option_list = list(  ## command-line options accepted by this script
+  make_option(c("-f", "--file"), type="character", default=NULL,  ## input table, read below via opt$file; no default is set
+              help="AMpcombi complete summary table", metavar="character"),
+  make_option(c("-o", "--out"), type="character", default="AMPcombi_summary.html",  ## NOTE(review): opt$out is not referenced anywhere in this script
+              help="Provide the name of the output file [default= %default]", metavar="character")
+); 
+
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);  ## parsed values, accessed as opt$file
+
+##############################
+# Generate the html file ####
+##############################
+#args[2] = "AMPcombi_summary.html"
+
+table <-  ## data shown on the 'Summary table' page (rendered into output 'tbl' by the server function)
+  readr::read_csv(opt$file) %>% 
+  unique()  ## remove duplicated rows
+
+about_page <- tabPanel(  ## page 1: About
+  title = strong('About'),
+  br(),
+  includeMarkdown("https://raw.githubusercontent.com/Darcy220606/AMPcombi/dev/README.md")  ## README.md of the 'dev' branch
+)
+## page 2: Summary table
+summary_page <- tabPanel(title = strong('Summary table'),
+                         mainPanel(DTOutput('tbl'),  ## placeholder for output 'tbl'
+                                   width = 20))
+## page 3: Plots and figures (placeholder: the sidebar and both tabs have no content yet)
+plots_page <- tabPanel(title = strong('Plots'),
+                       titlePanel("Analysis"),
+                       sidebarLayout(
+                         sidebarPanel(
+                         ),
+                         mainPanel(
+                           tabsetPanel(
+                             tabPanel(
+                               title = "Plot"
+                             ),
+                             tabPanel(
+                               title = "Statistics",
+                             )
+                           )
+                         )
+                       ))
+
+
+## Shiny app: user interface
+ui <- navbarPage(  ## navigation bar holding the three pages defined above, plus inline CSS for the navbar
+  title = strong("AMPcombi"), 
+  about_page,
+  summary_page,
+  plots_page,
+  tags$style(type = 'text/css', '.navbar { background-color: #a2d2ff;
+                                               font-family: Arial;
+                                               font-size: 15px;
+                                               color: #023047; }')
+)
+## Server logic: fills output 'tbl' with the summary table as an interactive DT widget.
+## The arguments after 'server' are passed on by renderDT() to DT::datatable().
+server <- function(input, output)
+{output$tbl = renderDT(table,
+                       server = TRUE,                    ## server-side processing: only the rows on display are loaded
+                       class = 'cell-border stripe',     ## add column border
+                       options = list( paging = TRUE,    ## paginate the output
+                                       pageLength = 100, ## number of rows to output for each page
+                                       scrollX = TRUE,   ## enable scrolling on X axis
+                                       scrollY = TRUE,   ## enable scrolling on Y axis
+                                       autoWidth = TRUE, ## use smart column width handling
+                                       dom = 'Bfrtip',   ## layout: Buttons, filter, processing, table, info, pagination
+                                       buttons = c('csv', 'excel'), ## the user can just download what is on display because server=TRUE
+                                       columnDefs = list(list(targets = '_all', className = 'dt-center'),
+                                                         list(targets = c(0, 8, 9), visible = TRUE))),
+                       extensions = 'Buttons',
+                       selection = 'multiple',         ## enable selection of several rows
+                       filter = 'top',                 ## include column filters at the top
+                       rownames = FALSE                ## don't show row numbers/names
+)}
+shinyApp(ui = ui, server = server)