diff --git a/CHANGES.txt b/CHANGES.txt index e996c17..88c5915 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -9,4 +9,8 @@ v<0.1.4>, <18.10.2022> v<0.1.5>, <27.10.2022> -- Initial release. -- adapt reading of hmmer_hmmsearch output to deal with varying header lines -- fix syntax in "if" statements in "check_input.py" - -- include "check_faa_path" function, to find .faa files also in subdirectories \ No newline at end of file + -- include "check_faa_path" function, to find .faa files also in subdirectories +v<0.1.6>, <02.11.2022> + -- Included the HTML output for the complete summary + -- add option --threads for diamond (make database and alignment) + -- included check if database was downloaded once to not download again \ No newline at end of file diff --git a/README.md b/README.md index 2b87414..bf29075 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ ampcombi \ Here the head folder containing output files has to be given. AMPcombi finds and summarizes the output files from different tools, if the folder is structured and named as: `/result_folder/toolsubdir/samplesubdir/sample.tool.filetype`. - Note that the filetype ending might vary and can be specified with `--tooldict`, if it is different from the default. When passing a dictionary via command line, this has to be done as a string with single quotes `' '` and the dictionary keys and items with double quotes `" "`. i.e. `'{"key1":"item1", "key2":"item2"}'` +- Note that `--sample_list` can also be given if only specific samples are needed from the directory. The path to the folder containing the respective protein fasta files has to be provided with `--faa_folder`. The files have to be named with `.faa`. @@ -112,8 +113,9 @@ The path to the folder containing the respective protein fasta files has to be p | --faa_folder | path to the folder containing the samples` .faa files, Filenames have to contain the corresponding sample-name, i.e. 
sample_1.faa | ./test_faa/ | ./faa_files/| | --tooldict | dictionary of AMP-tools and their respective output file endings | '{"ampir":"ampir.tsv", "amplify":"amplify.tsv", "macrel":"macrel.tsv", "hmmer_hmmsearch":"hmmsearch.txt", "ensembleamppred":"ensembleamppred.txt"}' | - | | --amp_database | path to the folder containing the reference database files: (1) a fasta file with <.fasta> file extension and (2) the corresponding table with with functional and taxonomic classifications in <.tsv> file extension | [DRAMP 'general amps'](http://dramp.cpu-bioinfor.org/downloads/) database | ./amp_ref_database/ | -| --complete_summary | Concatenates all samples' summarized tables into one | False | True | +| --complete_summary | concatenates all samples' summarized tables into one and generates both 'csv' and interactive 'html' files | False | True | | --log | print messages into log file instead of stdout | False | True | +| --threads | adjust the number of threads required for DIAMOND alignment depending on the computing resources available | 4 | 32 | | --version | print the version number into stdout | - | 0.1.4 | - Note: The fasta file corresponding to the AMP database should not contain any characters other than ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'] @@ -136,6 +138,7 @@ The output will be written into your working directory, containing the following | ├── sample_2_ampcombi.csv | └── sample_2_diamond_matches.txt ├── AMPcombi_summary.csv +├── AMPcombi_summary.html └── ampcombi.log ``` diff --git a/ampcombi/HTML.R b/ampcombi/HTML.R new file mode 100755 index 0000000..81627cd --- /dev/null +++ b/ampcombi/HTML.R @@ -0,0 +1,72 @@ +#!/usr/bin/env Rscript + +############################## +# Rscript to visualise the complete summary tables generated by AMPcombi #### +############################## +# Date #### +# October, 19 2022 +############################## +# Authors #### +# Anan Ibrahim - ananhamido@hotmail.com - @darcy220606 +# 
Louisa Perelo - louperelo@gmail.com - @louperelo +############################## +# Working_directory #### +setwd(getwd()) +############################## +# Libraries used + arguments #### +if (!require("dplyr")) install.packages('dplyr') +if (!require("DT")) install.packages('DT') +if (!require("optparse")) install.packages('optparse') +if (!require("htmlwidgets")) install.packages('htmlwidgets') + +library("dplyr") +library("DT") +library("optparse") +library("htmlwidgets") + +option_list = list( + make_option(c("-f", "--file"), type="character", default="AMPcombi_summary.csv", + help="AMpcombi complete summary table [default= %default]", metavar="character"), + make_option(c("-o", "--out"), type="character", default="AMPcombi_summary.html", + help="Provide the name of the output file [default= %default]", metavar="character")); +# Turns warnings off +#options(warn=-1) +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +############################## +#Generate HTML interactive files #### +table <- + readr::read_csv(opt$file,show_col_types = FALSE) %>% + unique() + +result<-datatable(table, + class = 'cell-border stripe', ## add column border + options = list( paging = TRUE, ## paginate the output + pageLength = 100, ## number of rows to output for each page + scrollX = TRUE, ## enable scrolling on X axis + scrollY = TRUE, ## enable scrolling on Y axis + autoWidth = TRUE, ## use smart column width handling + #width = 100, + #height=100, + server = FALSE, ## static widget: all rows are embedded in the page and processed client-side + dom = 'Bfrtip', + language = list(sSearch = "Keyword look-up:"), + #bordered = TRUE, + buttons = c('csv', 'excel'), ## download buttons (csv/excel) for the client-side table + columnDefs = list(list(targets = '_all', className = 'dt-center'), + list(targets='aa_sequence', visible=TRUE, width='20'))), + extensions = 'Buttons', + selection = 'multiple', ## allow selecting multiple rows + filter = 'top', ## 
include column filters at the top + rownames = FALSE ## don't show row numbers/names + ) + +# Change the HTML size to fill the browser +result$sizingPolicy$defaultWidth<-"100%" + +htmlwidgets::saveWidget(result, opt$out, selfcontained = TRUE) + +# Clean up the library folder in case one was left behind; the HTML is self-contained and does not need it +unlink("AMPcombi_summary_files", recursive = TRUE) +############################## diff --git a/ampcombi/amp_database.py b/ampcombi/amp_database.py index ae3bbb9..7e54a42 100755 --- a/ampcombi/amp_database.py +++ b/ampcombi/amp_database.py @@ -44,14 +44,14 @@ def download_DRAMP(db): ######################################## # FUNCTION: CREATE DIAMOND COMPATIBLE DATBASE FORMATS ######################################### -def create_diamond_ref_db(db): +def create_diamond_ref_db(db,threads): cwd = os.getcwd() for file in os.listdir(db): if file.endswith('.fasta'): path = os.path.join(os.path.abspath(db) + '/' + file) os.chdir(db) #process = subprocess.Popen([f'{scripts_path}/diamond_makedb.sh', path]) - subprocess.run('diamond_makedb.sh', text=True, input=path) + subprocess.run('diamond_makedb.sh', text=True, input=f'{path}\n{threads}') os.chdir(cwd) print return path @@ -59,13 +59,13 @@ def create_diamond_ref_db(db): ######################################## # FUNCTION: DIAMOND ALIGNMENT ######################################### -def diamond_alignment(db, amp_faa_paths, amp_matches): +def diamond_alignment(db, amp_faa_paths, amp_matches,threads): #create temp folder and delete at the end cwd = os.getcwd() for path in amp_faa_paths: # align the query with the database temp = tempfile.mkdtemp() - subprocess.run('diamond_alignment.sh', text=True, input=f'{path}\n{temp}\n{db}') + subprocess.run('diamond_alignment.sh', text=True, input=f'{path}\n{temp}\n{db}\n{threads}') shutil.move(temp+'/diamond_matches.tsv', amp_matches) shutil.rmtree(temp) # mege the diamond_alignment with the ref_db table diff --git a/ampcombi/ampcombi.py b/ampcombi/ampcombi.py old mode 100644 new mode 100755 index 
995e706..0288f78 --- a/ampcombi/ampcombi.py +++ b/ampcombi/ampcombi.py @@ -13,6 +13,7 @@ from check_input import * from amp_database import * from print_header import * +from visualise_complete_summary import * # Define input arguments: parser = argparse.ArgumentParser(prog = 'ampcombi', formatter_class=argparse.RawDescriptionHelpFormatter, @@ -41,10 +42,12 @@ type=str, default='{"ampir":"ampir.tsv", "amplify":"amplify.tsv", "macrel":"macrel.tsv", "neubi":"neubi.fasta", "hmmer_hmmsearch":"hmmsearch.txt", "ensembleamppred":"ensembleamppred.txt"}') parser.add_argument("--amp_database", dest="ref_db", nargs='?', help="Enter the path to the folder containing the reference database files (.fa and .tsv); a fasta file and the corresponding table with functional and taxonomic classifications. \n (default: DRAMP database)", type=str, default=None) -parser.add_argument("--complete_summary", dest="complete", nargs='?', help="Concatenates all sample summaries to one final summary", +parser.add_argument("--complete_summary", dest="complete", nargs='?', help="Concatenates all sample summaries to one final summary and outputs both csv and interactive html files", type=bool, default=False) parser.add_argument("--log", dest="log_file", nargs='?', help="Silences the standard output and captures it in a log file)", type=bool, default=False) +parser.add_argument("--threads", dest="cores", nargs='?', help="Changes the threads used for DIAMOND alignment (default: %(default)s)", + type=int, default=4) parser.add_argument('--version', action='version', version='%(prog)s ' + __version__) # get command line arguments @@ -59,6 +62,7 @@ tooldict = json.loads(args.tools) database = args.ref_db complete_summary = args.complete +threads = args.cores # additional variables # extract list of tools from input dictionary. 
If not given, default dict contains all possible tools @@ -75,10 +79,10 @@ def main_workflow(): # print AMPcombi header print_header() - # check input parameters - check_input_complete(path, samplelist_in, filepaths_in, tools) # check input sample-list and create sample-list if input empty samplelist = check_samplelist(samplelist_in, tools, path) + # check input parameters + check_input_complete(path, samplelist, filepaths_in, tools) # check input filepaths and create list of list of filepaths per sample if input empty filepaths = check_pathlist(filepaths_in, samplelist, fileending, path) # check amp_ref_database filepaths and create a directory if input empty @@ -89,7 +93,7 @@ def main_workflow(): # generate summary for each sample amp_faa_paths = [] - create_diamond_ref_db(db) + create_diamond_ref_db(db,threads) for i in range(0, len(samplelist)): main_list = [] print('\n ########################################################## ') @@ -107,8 +111,8 @@ def main_workflow(): amp_faa_paths.append(out_path) print(f'The fasta containing AMP sequences for {samplelist[i]} was saved to {samplelist[i]}/ \n') amp_matches = samplelist[i] +'/'+samplelist[i]+'_diamond_matches.txt' - print(f'The diamond alignment for {samplelist[i]} in process....') - diamond_df = diamond_alignment(db, amp_faa_paths, amp_matches) + print(f'The diamond alignment for {samplelist[i]} in progress ....') + diamond_df = diamond_alignment(db, amp_faa_paths, amp_matches, threads) print(f'The diamond alignment for {samplelist[i]} was saved to {samplelist[i]}/.') # Merge summary_df and diamond_df sample_summary_df = pd.merge(summary_df, diamond_df, on = 'contig_id', how='left') @@ -121,10 +125,11 @@ def main_workflow(): # concatenate the sample summary to the complete summary and overwrite it complete_summary_df = pd.concat([complete_summary_df, sample_summary_df]) complete_summary_df.to_csv('AMPcombi_summary.csv', sep=',', index=False) + html_generator() else: continue if (complete_summary): - 
print(f'\n FINISHED: The AMPcombi_summary.csv file was saved to your current working directory.') + print(f'\n FINISHED: The AMPcombi_summary.csv and AMPcombi_summary.html files were saved to your current working directory.') else: print(f'\n FINISHED: AMPcombi created summaries for all input samples.') diff --git a/ampcombi/check_input.py b/ampcombi/check_input.py index 4ffdd61..adef0f8 100755 --- a/ampcombi/check_input.py +++ b/ampcombi/check_input.py @@ -39,20 +39,25 @@ def check_faa_path(faa_path, samplename): return path_list[0] def check_ref_database(database): - if(database==None): + if((database is None) and (not os.path.exists('amp_ref_database'))): print('<--AMP_database> was not given, the current DRAMP general-AMP database will be downloaded and used') database = 'amp_ref_database' os.makedirs(database, exist_ok=True) db = database download_DRAMP(db) - return db - else: - if os.path.exists(database): + return db + elif ((database is not None)): + if (os.path.exists(database)): db = database + print(f'<--AMP_database> = {db} is found and will be used') return db - else: - if not os.path.exists(database): - sys.exit(f'Reference amp database path {database} does not exist, please check the path.') + if (not os.path.exists(database)): + sys.exit(f'Reference amp database path {database} does not exist, please check the path.') + elif((database is None) and (os.path.exists('amp_ref_database'))): + print('<--AMP_database> = DRAMP is already downloaded and will be reused') + database = 'amp_ref_database' + db = database + return db def check_path(path): return os.path.exists(path) #returns True or False diff --git a/ampcombi/diamond_alignment.sh b/ampcombi/diamond_alignment.sh index 2a6ab9a..e0bd5b4 100755 --- a/ampcombi/diamond_alignment.sh +++ b/ampcombi/diamond_alignment.sh @@ -7,18 +7,21 @@ INPUT_FASTA=$1 OUTPUT_DIR=$2 REF_DIR=$3 +THREADS=$4 read INPUT_FASTA read OUTPUT_DIR read REF_DIR +read THREADS -# Adjust path according to the input folder with the ist of 
fasta files +# Adjust path according to the input folder with the list of fasta files IN=$INPUT_FASTA OUT=$OUTPUT_DIR REF_DB=$REF_DIR +P=$THREADS diamond blastp \ --p 28 -d $REF_DB/amp_ref -q $IN --quiet \ +-p $P -d $REF_DB/amp_ref -q $IN --quiet \ --outfmt 6 qseqid sseqid pident evalue nident full_qseq full_sseq qseq sseq qcovhsp scovhsp --max-target-seqs 1 --ultra-sensitive -e10000 --id2 1 -s1 -c1 --masking 0 --gapped-filter-evalue 0 --algo 0 --min-score 0 --shape-mask 1111 \ -o $OUT/diamond_matches.txt diff --git a/ampcombi/diamond_makedb.sh b/ampcombi/diamond_makedb.sh index ed76ca8..1d7c142 100755 --- a/ampcombi/diamond_makedb.sh +++ b/ampcombi/diamond_makedb.sh @@ -5,13 +5,16 @@ ######################################### INPUT_FASTA=$1 +THREADS=$2 #OUTPUT_DIR=$2 read INPUT_FASTA +read THREADS #read OUTPUT_DIR IN=$INPUT_FASTA +P=$THREADS #OUT=$OUTPUT_DIR #cd $OUT -diamond makedb --in $IN -p 28 -d amp_ref --quiet \ No newline at end of file +diamond makedb --in $IN -p $P -d amp_ref --quiet \ No newline at end of file diff --git a/ampcombi/print_header.py b/ampcombi/print_header.py old mode 100644 new mode 100755 diff --git a/ampcombi/version.py b/ampcombi/version.py index e2888cc..63eb0cb 100644 --- a/ampcombi/version.py +++ b/ampcombi/version.py @@ -1 +1 @@ -__version__ = '0.1.5' \ No newline at end of file +__version__ = '0.1.6' \ No newline at end of file diff --git a/ampcombi/visualise_complete_summary.py b/ampcombi/visualise_complete_summary.py new file mode 100755 index 0000000..7b4fe8e --- /dev/null +++ b/ampcombi/visualise_complete_summary.py @@ -0,0 +1,11 @@ +#!/bin/python3 + +# TITLE: Visualise the complete summary and save it to an HTML file + +import subprocess + +######################################## +# FUNCTION: GENERATE AN INTERACTIVE HTML SUMMARY +######################################### +def html_generator(): + subprocess.run('HTML.R', text=True) \ No newline at end of file diff --git a/setup.py b/setup.py index 7da8db5..e1b51a7 100644 --- 
a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='AMPcombi', - version='0.1.5', + version='0.1.6', author='Anan Ibrahim, Louisa Perelo', author_email='ananhamido@hotmail.com, louperelo@gmail.com', packages=['ampcombi'], @@ -17,7 +17,9 @@ 'ampcombi/diamond_makedb.sh', 'ampcombi/reformat_tables.py', 'ampcombi/print_header.py', - 'ampcombi/version.py'], + 'ampcombi/version.py', + 'ampcombi/visualise_complete_summary.py', + 'ampcombi/HTML.R'], url='http://pypi.python.org/pypi/AMPcombi/', license='LICENSE.txt', description='A parsing tool for AMP tools.', diff --git a/shinyapp/shinyapp_html_file.R b/shinyapp/shinyapp_html_file.R deleted file mode 100644 index 75784cf..0000000 --- a/shinyapp/shinyapp_html_file.R +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env Rscript - -############################## -# Rscript to visualise the complete summary tables generated by AMPcombi #### -############################## -# Date #### -# October, 19 2022 -############################## -# Authors #### -# Anan Ibrahim -# ananhamido@hotmail.com -# @darcy220606 -############################## -# Working_directory #### -############################## -#setwd("/home/aibrahim/github/testing_ampcombi_on_deepevo") -############################## -# Libraries used + arguments #### -library(dplyr) -library(DT) -library(shiny) -library(data.table) -library(ggplot2) -library("optparse") -library(htmlwidgets) - -option_list = list( - make_option(c("-f", "--file"), type="character", default=NULL, - help="AMpcombi complete summary table", metavar="character"), - make_option(c("-o", "--out"), type="character", default="AMPcombi_summary.html", - help="Provide the name of the output file [default= %default]", metavar="character") -); - -opt_parser = OptionParser(option_list=option_list); -opt = parse_args(opt_parser); - -############################## -# Generate the html file #### -############################## -#args[2] = "AMPcombi_summary.html" - -table <- - readr::read_csv(opt$file) %>% - 
unique() - -about_page <- tabPanel( - title = strong('About'), - br(), - includeMarkdown("https://raw.githubusercontent.com/Darcy220606/AMPcombi/dev/README.md") -) -## page 2: Summary table -summary_page <- tabPanel(title = strong('Summary table'), - mainPanel(DTOutput('tbl'), - width = 20)) -## page 3: Plots and figures -plots_page <- tabPanel(title = strong('Plots'), - titlePanel("Analysis"), - sidebarLayout( - sidebarPanel( - ), - mainPanel( - tabsetPanel( - tabPanel( - title = "Plot" - ), - tabPanel( - title = "Statistics", - ) - ) - ) - )) - - -## Shiny app -ui <- navbarPage( - title = strong("AMPcombi"), - about_page, - summary_page, - plots_page, - tags$style(type = 'text/css', '.navbar { background-color: #a2d2ff; - font-family: Arial; - font-size: 15px; - color: #023047; }') -) -server <- function(input, output) -{output$tbl = renderDT(table, - class = 'cell-border stripe', ## add column border - options = list( paging = TRUE, ## paginate the output - pageLength = 100, ## number of rows to output for each page - scrollX = TRUE, ## enable scrolling on X axis - scrollY = TRUE, ## enable scrolling on Y axis - autoWidth = TRUE, ## use smart column width handling - #width = 200, - server = TRUE, ## use client-side processing only load the 100 on display - dom = 'Bfrtip', - #bordered = TRUE, - buttons = c('csv', 'excel'), ## the user can just download what on display because server=TRUE - columnDefs = list(list(targets = '_all', className = 'dt-center'), - list(targets = c(0, 8, 9), visible = TRUE))), - extensions = 'Buttons', - selection = 'multiple', ## enable selection of a single row - filter = 'top', ## include column filters at the bottom - rownames = FALSE ## don't show row numbers/names -)} -shinyApp(ui = ui, server = server)