Skip to content

Commit

Permalink
Fixes and conda packaging for run_dbcan (#101)
Browse files Browse the repository at this point in the history
* Edited hmmscan_parser and run_dbcan so that they can be imported as python modules and called from other python scripts without invoking subprocesses, etc. They retain the original functionality of being called via CLI through a shell.

Additionally, prediction.py had a bug in get_validation_results that caused an empty folder structure to be created when the output was given as an absolute path instead of relative path. This bug should now be fixed.

* Missed some bugfix updates from the main branch of run_dbcan in last commit, which are included here.

Also changed the call to hmmscan_parser in runHmmScan to a function call instead of subprocess. This will remove the overhead of creating a new subprocess, and makes more sense given that hmmscan_parser is now a module import.

* Added build files for conda installation.

Also moved around the arg input in run_dbcan so that the entry points in setup.py are correct.

* Added note about changing git_url to meta.yaml when merging into main repo.

* Fixed bug where including branch in git_url prevented conda build from executing successfully.

* Added missing dependencies to conda build file.

* Changed run_dbcan imports to solve module not found error in conda installation.

* Removed bioconda as a package. It's a channel, not a package.

* Added init file for proper module installation, and license file name to meta file.

* made setuptools a build dependency instead of run dependency

* Edited import to function properly

* Edited import to function properly

* Changed to noarch in meta.yaml

* Change meta.yaml to point to main repo for merging into it.

Also pull version info from setup.py, meaning there is a single source of truth for version number now.
  • Loading branch information
AlexSCFraser authored Jul 28, 2022
1 parent 5264805 commit 3a93d08
Show file tree
Hide file tree
Showing 8 changed files with 271 additions and 205 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ venv.bak/
#vscode
.vscode/

meta.yaml
conda-recipe/meta.yaml
.DS_Store

#comment Hotpep
Expand All @@ -129,4 +129,3 @@ Hotpep_bkp*

#ignore example data
EscheriaColiK12MG1655.*

41 changes: 41 additions & 0 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{% set name = "run_dbcan" %}
{% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %}

package:
  name: "{{ name|lower }}"
  # Version is pulled from setup.py so there is a single source of truth.
  version: {{ data.get('version') }}

source:
  # may want to change this to pull from a stable git release branch, or look for stable release git commit labels, or from another source such as PyPi
  git_url: https://github.com/linnabrown/run_dbcan.git
  git_depth: 1  # only grab most recent commit on branch

build:
  noarch: python
  number: 0
  script: {{ PYTHON }} setup.py install

requirements:
  build:
    - python
    - setuptools
    - git
  host:
    - python
  run:
    - diamond
    - hmmer
    - prodigal
    - python
    - natsort
    - scipy
    - psutil
    - numpy

about:
  home: http://bcb.unl.edu/dbCAN2/
  dev_url: https://github.com/linnabrown/run_dbcan
  license: GPLv3
  license_file: LICENSE
  summary: A standalone tool of http://bcb.unl.edu/dbCAN2/
79 changes: 35 additions & 44 deletions dbcan/eCAMI/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import re
#import psutil
import argparse,sys
from multiprocessing import Pool
from multiprocessing import Pool
# add by Le Nov 14, 2021

# from eCAMI_data import eCAMI_data_path
Expand All @@ -40,23 +40,23 @@ def read_file(database_dir,input_fasta_file):
else:
new_text[proteinname[len(proteinname)-1]]+=text[i]
protein_string=[]
# print(len(proteinname))
# print(len(proteinname))
for each_name in proteinname:
protein_string.append(new_text[each_name])
return [proteinname,protein_string]


def find_all(sub, s):
    """Return all 1-based start positions of substring *sub* in *s*.

    Preserves the legacy contract of this helper: a non-empty list of
    1-based indices when at least one occurrence exists, otherwise the
    sentinel -1 (callers test for the sentinel rather than catching an
    exception or checking emptiness).

    Occurrences are found by advancing one character past each match
    start, so overlapping matches are included.
    """
    index_list = []
    index = s.find(sub)
    while index != -1:
        index_list.append(index + 1)  # convert 0-based find() to 1-based
        index = s.find(sub, index + 1)

    # Legacy sentinel: -1 signals "no match" to callers.
    return index_list if index_list else -1


def get_kmer_dict(n_mer_dir_path):
Expand Down Expand Up @@ -152,9 +152,9 @@ def get_cluster_number(file_name,fam_kmer_dict,output_dir,n_mer,piece_number,pro
each_cluster_score=sorted(each_cluster_score,key=(lambda x:x[1]),reverse=True)
if each_cluster_score[0][2]>=beta and each_cluster_score[0][1]>=important_n_mer_number:
sort_fam.append([each_cluster_score[0][1],each_cluster_score[0][2],each_fam])
selected_fam_dict[each_fam]=[each_cluster_score[0][1]],\
each_cluster_score[0][2],each_cluster_score[0][3],\
kmer_message[each_cluster_score[0][0]],kmer_label[each_cluster_score[0][0]]
selected_fam_dict[each_fam]=[each_cluster_score[0][1]], \
each_cluster_score[0][2],each_cluster_score[0][3], \
kmer_message[each_cluster_score[0][0]],kmer_label[each_cluster_score[0][0]]
if len(sort_fam)==0:
continue
sort_fam=sorted(sort_fam,key=(lambda x:x[1]),reverse=True)
Expand Down Expand Up @@ -182,7 +182,7 @@ def get_cluster_number(file_name,fam_kmer_dict,output_dir,n_mer,piece_number,pro
used_fam.append(temp_fam[0].split('_')[0])
if int(temp_fam[1])>1 and temp_name in selected_fam_dict.keys():
fw.write(protein_name[i]+'\t'+write_line(selected_fam_dict[temp_name],protein_string[i]))


fw.close()

Expand All @@ -196,22 +196,13 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_
database_dir=database_dir+'/'
[protein_name,protein_string] = read_file(database_dir,input_fasta_file)

current_path=os.getcwd()
# Simple method to make multiple folders along path if needed, should handle both relative and absolute paths.
# The previous code that handled this directory creation had a bug where it would treat absolute paths as relative
# paths and create a huge empty folder structure as a subdirectory of the current working directory.
out_dir_abs = os.path.abspath(output_dir)
os.makedirs(out_dir_abs, exist_ok=True)
temp_output_dir = output_dir

all_output_dir_name=output_dir.split('/')
while '' in all_output_dir_name:
all_output_dir_name.remove('')
for i in range(len(all_output_dir_name)):
if i==0:
temp_dir_list=os.listdir(current_path)
if all_output_dir_name[i] not in temp_dir_list:
os.mkdir(all_output_dir_name[i])
else:
temp_dir_name='/'.join(all_output_dir_name[0:i])
temp_dir_list=os.listdir(temp_dir_name)
if all_output_dir_name[i] not in temp_dir_list:
os.mkdir(temp_dir_name+'/'+all_output_dir_name[i])
temp_output_dir='/'.join(all_output_dir_name)
pred_file_name=output_file_name
if temp_output_dir:
temp_output_dir=output_dir+'/'
Expand All @@ -221,7 +212,7 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_

for i in range(jobs):
file_name=input_fasta_file+'_thread_'+str(i)+'.txt'
# get_cluster_number(file_name,fam_kmer_dict,temp_output_dir,n_mer,piece_number[i],protein_name,protein_string,important_n_mer_number,beta)
# get_cluster_number(file_name,fam_kmer_dict,temp_output_dir,n_mer,piece_number[i],protein_name,protein_string,important_n_mer_number,beta)

pool.apply_async(get_cluster_number, (file_name,fam_kmer_dict,temp_output_dir,n_mer,piece_number[i],protein_name,protein_string,important_n_mer_number,beta,))
pool.close()
Expand All @@ -234,7 +225,7 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_
fw.write(text)
os.remove(temp_output_dir+input_fasta_file+'_thread_'+str(i)+'.txt')
fw.close()




Expand Down Expand Up @@ -266,14 +257,14 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_

class eCAMI_config(object):
def __init__(
self,
db_type = 'CAZyme',
output = 'examples/prediction/output/test_pred_cluster_labels.txt',
input = 'examples/prediction/input/test.faa',
k_mer = 8,
jobs = 8,
important_k_mer_number = 5,
beta = 2.0
self,
db_type = 'CAZyme',
output = 'examples/prediction/output/test_pred_cluster_labels.txt',
input = 'examples/prediction/input/test.faa',
k_mer = 8,
jobs = 8,
important_k_mer_number = 5,
beta = 2.0
) -> None:
if db_type == 'CAZyme':
self.kmer_db = f'{os.path.dirname(__file__)}/CAZyme'
Expand All @@ -292,13 +283,13 @@ def __init__(
def eCAMI_main(args):
starttime = datetime.datetime.now()
# args = arg_parser(sys.argv[1:])

database_dir=os.path.dirname(args.input)
input_fasta_file = os.path.basename(args.input)

output_dir=os.path.dirname(args.output)
output_file_name = os.path.basename(args.output)

n_mer=args.k_mer
n_mer_dir_path=args.kmer_db

Expand Down
Empty file added dbcan_cli/__init__.py
Empty file.
53 changes: 0 additions & 53 deletions dbcan_cli/hmmscan-parser.py

This file was deleted.

63 changes: 63 additions & 0 deletions dbcan_cli/hmmscan_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
##########################################################
# hmmscan parser for dbCAN meta server
#
# Based off the hmmscan parser used in the dbCAN server,
# written by Dr. Yin
#
# Written by Tanner Yohe under the supervision
# of Dr. Yin in the YinLab at NIU.
#
# Updated by Le Huang from tips the contributor WATSON Mick <[email protected]>,
# Thank you!
#
# Modified by Alex Fraser to have a run() method that can be called and returns data for better integration with other
# scripts. This script also retains the ability to be called from shell and output to pipe redirection.
# This file had to be renamed from "hmmscan-parser.py" to "hmmscan_parser.py" because of python module import conventions.
# Modified on 07/06/22
#
# INPUT
# python hmmscan-parser-dbCANmeta.py [inputFile] [eval] [coverage]
# eval and coverage are optional, inputFile is required
# -updating info:
# -adds pid for every subprocess to make codes robust.
# Last updated: 1/10/19
###########################################################

from subprocess import call
import sys
import os


def run(input_file, eval_num=1e-15, coverage=0.35, verbose=False):

tmpfile = "temp." + str(os.getpid())

call("cat "+input_file+" | grep -v '^#' | awk '{print $1,$3,$4,$6,$13,$16,$17,$18,$19}' | sed 's/ /\t/g' | sort -k 3,3 -k 8n -k 9n | perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_.\"\n\";}}' > " + tmpfile, shell=True)

output = ""
with open(tmpfile) as f:
for line in f:
row = line.rstrip().split('\t')
row.append(float(int(row[6])-int(row[5]))/int(row[1]))
if float(row[4]) <= eval_num and float(row[-1]) >= coverage:
if verbose:
print('\t'.join([str(x) for x in row]))
output += '\t'.join([str(x) for x in row]) + '\n'
call(['rm', tmpfile])

return output


if __name__ == "__main__":
    # CLI entry point: hmmscan_parser.py <inputFile> [eval] [coverage]
    if len(sys.argv) > 3:
        # Explicit e-value and coverage cutoffs supplied.
        file = sys.argv[1]
        eval_arg = float(sys.argv[2])
        coverage_arg = float(sys.argv[3])
        run(file, eval_arg, coverage_arg, verbose=True)
    elif len(sys.argv) > 1:
        # Input file only: use the default cutoffs. This branch was
        # previously a bare `if`, so a three-argument invocation parsed
        # the file a second time with the default cutoffs.
        file = sys.argv[1]
        run(file, verbose=True)
    else:
        print("Please give a hmmscan output file as the first command")
        exit()
Loading

0 comments on commit 3a93d08

Please sign in to comment.