Skip to content

Commit

Permalink
Fixes and conda packaging for run_dbcan (#101)
Browse files Browse the repository at this point in the history
* Edited hmmscan_parser and run_dbcan so that they can be imported as python modules and called from other python scripts without invoking subprocesses, etc. They retain the original functionality of being called via CLI through a shell.

Additionally, prediction.py had a bug in get_validation_results that caused an empty folder structure to be created when the output was given as an absolute path instead of relative path. This bug should now be fixed.

* Missed some bugfix updates from the main branch of run_dbcan in last commit, which are included here.

Also changed the call to hmmscan_parser in runHmmScan to a function call instead of subprocess. This will remove the overhead of creating a new subprocess, and makes more sense given that hmmscan_parser is now a module import.

* Added build files for conda installation.

Also moved around the arg input in run_dbcan so that the entry points in setup.py are correct.

* Added note about changing git_url to meta.yaml when merging into main repo.

* Fixed bug where including branch in git_url prevented conda build from executing successfully.

* Added missing dependencies to conda build file.

* Changed run_dbcan imports to solve module not found error in conda installation.

* Removed bioconda as a package. It's a channel, not a package.

* Added init file for proper module installation, and license file name to meta file.

* made setuptools a build dependency instead of run dependency

* Edited import to function properly

* Edited import to function properly

* Changed to noarch in meta.yaml

* Change meta.yaml to point to main repo for merging into it.

Also pull version info from setup.py, meaning there is a single source of truth for version number now.
  • Loading branch information
AlexSCFraser authored Jul 28, 2022
1 parent 5264805 commit 3a93d08
Show file tree
Hide file tree
Showing 8 changed files with 271 additions and 205 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ venv.bak/
#vscode
.vscode/

meta.yaml
conda-recipe/meta.yaml
.DS_Store

#comment Hotpep
Expand All @@ -129,4 +129,3 @@ Hotpep_bkp*

#ignore example data
EscheriaColiK12MG1655.*

41 changes: 41 additions & 0 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{% set name = "run_dbcan" %}
{% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %}

package:
  name: "{{ name|lower }}"
  # Version is pulled from setup.py so there is a single source of truth.
  version: {{ data.get('version') }}

source:
  # may want to change this to pull from a stable git release branch, or look for stable release git commit labels, or from another source such as PyPi
  git_url: https://github.com/linnabrown/run_dbcan.git
  git_depth: 1  # only grab most recent commit on branch

build:
  noarch: python
  number: 0
  script: {{ PYTHON }} setup.py install

requirements:
  build:
    - python
    - setuptools
    - git
  host:
    - python
  run:
    - diamond
    - hmmer
    - prodigal
    - python
    - natsort
    - scipy
    - psutil
    - numpy

about:
  home: http://bcb.unl.edu/dbCAN2/
  dev_url: https://github.com/linnabrown/run_dbcan
  license: GPLv3
  license_file: LICENSE
  summary: A standalone tool of http://bcb.unl.edu/dbCAN2/
79 changes: 35 additions & 44 deletions dbcan/eCAMI/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import re
#import psutil
import argparse,sys
from multiprocessing import Pool
from multiprocessing import Pool
# add by Le Nov 14, 2021

# from eCAMI_data import eCAMI_data_path
Expand All @@ -40,23 +40,23 @@ def read_file(database_dir,input_fasta_file):
else:
new_text[proteinname[len(proteinname)-1]]+=text[i]
protein_string=[]
# print(len(proteinname))
# print(len(proteinname))
for each_name in proteinname:
protein_string.append(new_text[each_name])
return [proteinname,protein_string]


def find_all(sub, s):
    """Return all 1-based start positions of substring *sub* in *s*.

    Preserves the legacy contract of this helper: a non-empty list of
    1-based indices when at least one occurrence exists, otherwise the
    sentinel -1 (callers test for the sentinel rather than catching an
    exception or checking emptiness).

    Occurrences are found by advancing one character past each match
    start, so overlapping matches are included.
    """
    index_list = []
    index = s.find(sub)
    while index != -1:
        index_list.append(index + 1)  # convert 0-based find() to 1-based
        index = s.find(sub, index + 1)

    # Legacy sentinel: -1 signals "no match" to callers.
    return index_list if index_list else -1


def get_kmer_dict(n_mer_dir_path):
Expand Down Expand Up @@ -152,9 +152,9 @@ def get_cluster_number(file_name,fam_kmer_dict,output_dir,n_mer,piece_number,pro
each_cluster_score=sorted(each_cluster_score,key=(lambda x:x[1]),reverse=True)
if each_cluster_score[0][2]>=beta and each_cluster_score[0][1]>=important_n_mer_number:
sort_fam.append([each_cluster_score[0][1],each_cluster_score[0][2],each_fam])
selected_fam_dict[each_fam]=[each_cluster_score[0][1]],\
each_cluster_score[0][2],each_cluster_score[0][3],\
kmer_message[each_cluster_score[0][0]],kmer_label[each_cluster_score[0][0]]
selected_fam_dict[each_fam]=[each_cluster_score[0][1]], \
each_cluster_score[0][2],each_cluster_score[0][3], \
kmer_message[each_cluster_score[0][0]],kmer_label[each_cluster_score[0][0]]
if len(sort_fam)==0:
continue
sort_fam=sorted(sort_fam,key=(lambda x:x[1]),reverse=True)
Expand Down Expand Up @@ -182,7 +182,7 @@ def get_cluster_number(file_name,fam_kmer_dict,output_dir,n_mer,piece_number,pro
used_fam.append(temp_fam[0].split('_')[0])
if int(temp_fam[1])>1 and temp_name in selected_fam_dict.keys():
fw.write(protein_name[i]+'\t'+write_line(selected_fam_dict[temp_name],protein_string[i]))


fw.close()

Expand All @@ -196,22 +196,13 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_
database_dir=database_dir+'/'
[protein_name,protein_string] = read_file(database_dir,input_fasta_file)

current_path=os.getcwd()
# Simple method to make multiple folders along path if needed, should handle both relative and absolute paths.
# The previous code that handled this directory creation had a bug where it would treat absolute paths as relative
# paths and create a huge empty folder structure as a subdirectory of the current working directory.
out_dir_abs = os.path.abspath(output_dir)
os.makedirs(out_dir_abs, exist_ok=True)
temp_output_dir = output_dir

all_output_dir_name=output_dir.split('/')
while '' in all_output_dir_name:
all_output_dir_name.remove('')
for i in range(len(all_output_dir_name)):
if i==0:
temp_dir_list=os.listdir(current_path)
if all_output_dir_name[i] not in temp_dir_list:
os.mkdir(all_output_dir_name[i])
else:
temp_dir_name='/'.join(all_output_dir_name[0:i])
temp_dir_list=os.listdir(temp_dir_name)
if all_output_dir_name[i] not in temp_dir_list:
os.mkdir(temp_dir_name+'/'+all_output_dir_name[i])
temp_output_dir='/'.join(all_output_dir_name)
pred_file_name=output_file_name
if temp_output_dir:
temp_output_dir=output_dir+'/'
Expand All @@ -221,7 +212,7 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_

for i in range(jobs):
file_name=input_fasta_file+'_thread_'+str(i)+'.txt'
# get_cluster_number(file_name,fam_kmer_dict,temp_output_dir,n_mer,piece_number[i],protein_name,protein_string,important_n_mer_number,beta)
# get_cluster_number(file_name,fam_kmer_dict,temp_output_dir,n_mer,piece_number[i],protein_name,protein_string,important_n_mer_number,beta)

pool.apply_async(get_cluster_number, (file_name,fam_kmer_dict,temp_output_dir,n_mer,piece_number[i],protein_name,protein_string,important_n_mer_number,beta,))
pool.close()
Expand All @@ -234,7 +225,7 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_
fw.write(text)
os.remove(temp_output_dir+input_fasta_file+'_thread_'+str(i)+'.txt')
fw.close()




Expand Down Expand Up @@ -266,14 +257,14 @@ def get_validation_results(input_fasta_file,database_dir,output_dir,output_file_

class eCAMI_config(object):
def __init__(
self,
db_type = 'CAZyme',
output = 'examples/prediction/output/test_pred_cluster_labels.txt',
input = 'examples/prediction/input/test.faa',
k_mer = 8,
jobs = 8,
important_k_mer_number = 5,
beta = 2.0
self,
db_type = 'CAZyme',
output = 'examples/prediction/output/test_pred_cluster_labels.txt',
input = 'examples/prediction/input/test.faa',
k_mer = 8,
jobs = 8,
important_k_mer_number = 5,
beta = 2.0
) -> None:
if db_type == 'CAZyme':
self.kmer_db = f'{os.path.dirname(__file__)}/CAZyme'
Expand All @@ -292,13 +283,13 @@ def __init__(
def eCAMI_main(args):
starttime = datetime.datetime.now()
# args = arg_parser(sys.argv[1:])

database_dir=os.path.dirname(args.input)
input_fasta_file = os.path.basename(args.input)

output_dir=os.path.dirname(args.output)
output_file_name = os.path.basename(args.output)

n_mer=args.k_mer
n_mer_dir_path=args.kmer_db

Expand Down
Empty file added dbcan_cli/__init__.py
Empty file.
53 changes: 0 additions & 53 deletions dbcan_cli/hmmscan-parser.py

This file was deleted.

63 changes: 63 additions & 0 deletions dbcan_cli/hmmscan_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
##########################################################
# hmmscan parser for dbCAN meta server
#
# Based off the hmmscan parser used in the dbCAN server,
# written by Dr. Yin
#
# Written by Tanner Yohe under the supervision
# of Dr. Yin in the YinLab at NIU.
#
# Updated by Le Huang from tips the contributor WATSON Mick <[email protected]>,
# Thank you!
#
# Modified by Alex Fraser to have a run() method that can be called and returns data for better integration with other
# scripts. This script also retains the ability to be called from shell and output to pipe redirection.
# This file had to be renamed from "hmmscan-parser.py" to "hmmscan_parser.py" because of python module import conventions.
# Modified on 07/06/22
#
# INPUT
# python hmmscan-parser-dbCANmeta.py [inputFile] [eval] [coverage]
# eval and coverage are optional, inputFile is required
# -updating info:
# -adds pid for every subprocess to make codes robust.
# Last updated: 1/10/19
###########################################################

from subprocess import call
import sys
import os


def run(input_file, eval_num=1e-15, coverage=0.35, verbose=False):

tmpfile = "temp." + str(os.getpid())

call("cat "+input_file+" | grep -v '^#' | awk '{print $1,$3,$4,$6,$13,$16,$17,$18,$19}' | sed 's/ /\t/g' | sort -k 3,3 -k 8n -k 9n | perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_.\"\n\";}}' > " + tmpfile, shell=True)

output = ""
with open(tmpfile) as f:
for line in f:
row = line.rstrip().split('\t')
row.append(float(int(row[6])-int(row[5]))/int(row[1]))
if float(row[4]) <= eval_num and float(row[-1]) >= coverage:
if verbose:
print('\t'.join([str(x) for x in row]))
output += '\t'.join([str(x) for x in row]) + '\n'
call(['rm', tmpfile])

return output


if __name__ == "__main__":
    # CLI entry point: hmmscan_parser.py <inputFile> [eval] [coverage]
    if len(sys.argv) > 3:
        # Explicit e-value and coverage cutoffs supplied.
        file = sys.argv[1]
        eval_arg = float(sys.argv[2])
        coverage_arg = float(sys.argv[3])
        run(file, eval_arg, coverage_arg, verbose=True)
    elif len(sys.argv) > 1:
        # Input file only: use the default cutoffs. This branch was
        # previously a bare `if`, so a three-argument invocation parsed
        # the file a second time with the default cutoffs.
        file = sys.argv[1]
        run(file, verbose=True)
    else:
        print("Please give a hmmscan output file as the first command")
        exit()
Loading

0 comments on commit 3a93d08

Please sign in to comment.